How do I extract text from a scanned document saved as a PDF file? I am using C# to develop the program.

Tuấn Anh Nguyễn 1 Reputation point
2022-06-23T04:38:40.453+00:00

I have a scanned document that has been saved as a PDF file. I want to make a tool that reads the data from this document and converts it to a Word or text file. I tried converting the PDF file to an image and then using OCR on the bitmap to convert it to text, but when it finished, the output had a font error. Can anyone help me?

Here is my code:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Tesseract;
using IronOcr;

namespace IMG_To_Text
{
    /// <summary>
    /// Main form: lets the user pick an image (button1), shows it in
    /// <c>pictureBox1</c>, and runs Tesseract OCR on it (button2), writing the
    /// recognized text into <c>richTextBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Shows a file picker restricted to PNG/JPG files and starts loading the
        /// chosen file into <c>pictureBox1</c>.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // The dialog wraps native resources, so dispose it when done.
            using (OpenFileDialog openFileDialog1 = new OpenFileDialog
            {
                InitialDirectory = @"D:\",
                Title = "Browse Image Files",

                CheckFileExists = true,
                CheckPathExists = true,

                DefaultExt = "png",
                Filter = "png files (*.png)|*.png|jpg files (*.jpg)|*.jpg",
                FilterIndex = 2,
                RestoreDirectory = true,

                ReadOnlyChecked = true,
                ShowReadOnly = true
            })
            {
                // Fixed: the original referenced an undeclared `openFileDialog` here.
                if (openFileDialog1.ShowDialog() == DialogResult.OK)
                {
                    pictureBox1.LoadAsync(openFileDialog1.FileName);
                }
            }
        }

        /// <summary>
        /// Runs Tesseract on <paramref name="b"/> using the "vie" language data
        /// looked up in the relative <c>tessdata</c> folder.
        /// </summary>
        /// <param name="b">Bitmap to recognize; must not be null.</param>
        /// <returns>The text returned by Tesseract for the page.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="b"/> is null.</exception>
        private string OCR(Bitmap b)
        {
            // Fail fast before paying for engine construction.
            if (b == null)
            {
                throw new ArgumentNullException("b");
            }

            using (var engine = new TesseractEngine(@"tessdata", "vie", EngineMode.Default))
            using (var page = engine.Process(b, PageSegMode.AutoOnly))
            {
                return page.GetText();
            }
        }

        /// <summary>
        /// Runs OCR on the image currently shown in <c>pictureBox1</c> on a worker
        /// thread, showing <c>picloading</c> while it runs and putting the result
        /// into <c>richTextBox1</c> when it finishes.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            var loaded = pictureBox1.Image;
            if (loaded == null)
            {
                MessageBox.Show(this, "Please load an image first.", "No image",
                    MessageBoxButtons.OK, MessageBoxIcon.Information);
                return;
            }

            // GDI+ images are not thread-safe and the PictureBox may paint this one
            // at any moment, so take a private copy here on the UI thread instead
            // of handing the displayed image to the worker.
            Bitmap snapshot = new Bitmap(loaded);
            // new Bitmap(Image) does not carry over the source DPI; restore it.
            snapshot.SetResolution(loaded.HorizontalResolution, loaded.VerticalResolution);

            picloading.Visible = true;

            Task.Factory
                .StartNew(() =>
                {
                    // The worker owns the snapshot and releases it when OCR is done.
                    using (snapshot)
                    {
                        return OCR(snapshot);
                    }
                })
                .ContinueWith(ocrTask =>
                {
                    // Runs back on the UI thread (see scheduler argument below).
                    if (IsDisposed)
                    {
                        return;
                    }

                    // Always hide the indicator, even when OCR failed.
                    picloading.Visible = false;

                    if (ocrTask.IsFaulted)
                    {
                        // Reading Exception also marks the task's error as observed.
                        MessageBox.Show(this, ocrTask.Exception.GetBaseException().Message,
                            "OCR failed", MessageBoxButtons.OK, MessageBoxIcon.Error);
                        return;
                    }

                    richTextBox1.Text = ocrTask.Result;
                }, TaskScheduler.FromCurrentSynchronizationContext());
        }
    }
}

Windows Forms
Windows Forms
A set of .NET Framework managed libraries for developing graphical user interfaces.
1,836 questions
{count} votes