How do I extract text from a scanned document that was saved as a PDF file? I am using C# to develop the program.
I have a scanned document that has been saved as a PDF file. I want to make a tool that reads the data from this document and converts it to a Word or text file. I tried converting the PDF file to an image and then running OCR on the bitmap to convert it to text, but when it finished, the resulting text had a font error. Can anyone help me?
Here is my code:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Tesseract;
using IronOcr;
namespace IMG_To_Text
{
    /// <summary>
    /// Main window: lets the user pick an image file, displays it in
    /// <c>pictureBox1</c>, and runs Tesseract OCR on it, writing the recognized
    /// text into <c>richTextBox1</c>.
    /// </summary>
    /// <remarks>
    /// <c>pictureBox1</c>, <c>picloading</c> and <c>richTextBox1</c> are not declared
    /// in this part of the class; presumably they come from the designer half of
    /// this partial class (TODO confirm).
    /// </remarks>
    public partial class Form1 : Form
    {
        /// <summary>Creates the form and initializes its designer-generated components.</summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Shows a file-open dialog restricted to PNG/JPG files and, when the user
        /// confirms, starts loading the chosen file into <c>pictureBox1</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            // The dialog is a disposable component, so scope it with 'using'.
            using (OpenFileDialog openFileDialog1 = new OpenFileDialog
            {
                InitialDirectory = @"D:\",
                Title = "Browse Text Files",
                CheckFileExists = true,
                CheckPathExists = true,
                DefaultExt = "png",
                Filter = "png files (*.png)|*.png|jpg files (*.jpg)|*.jpg",
                FilterIndex = 2,
                RestoreDirectory = true,
                ReadOnlyChecked = true,
                ShowReadOnly = true
            })
            {
                // Show the dialog configured above. The original code referenced a
                // different identifier here, so these settings were never applied.
                if (openFileDialog1.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                pictureBox1.LoadAsync(openFileDialog1.FileName);
            }
        }

        /// <summary>
        /// Runs Tesseract on the given bitmap using the language data named "vie"
        /// from the relative "tessdata" folder and returns the recognized text.
        /// </summary>
        /// <param name="b">The image to recognize; must not be null.</param>
        /// <returns>The text reported by Tesseract for the page.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="b"/> is null.</exception>
        private string OCR(Bitmap b)
        {
            if (b == null)
            {
                throw new ArgumentNullException("b");
            }

            using (var engine = new TesseractEngine(@"tessdata", "vie", EngineMode.Default))
            // PageSegMode.AutoOnly is documented as page segmentation *without* OCR;
            // PageSegMode.Auto is the automatic mode intended for text recognition.
            using (var page = engine.Process(b, PageSegMode.Auto))
            {
                return page.GetText();
            }
        }

        /// <summary>
        /// Runs OCR on the image currently shown in <c>pictureBox1</c> on a worker
        /// thread, shows <c>picloading</c> while it runs, and puts the result into
        /// <c>richTextBox1</c>. Does nothing when no bitmap is loaded.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private async void button2_Click(object sender, EventArgs e)
        {
            // Read the image on the UI thread; bail out if nothing has been loaded yet.
            Bitmap loaded = pictureBox1.Image as Bitmap;
            if (loaded == null)
            {
                return;
            }

            picloading.Visible = true;
            try
            {
                string text;

                // Work on a private copy so the worker thread never touches the
                // bitmap that the picture box may be painting at the same time.
                using (Bitmap snapshot = (Bitmap)loaded.Clone())
                {
                    text = await Task.Run(() => OCR(snapshot));
                }

                // Execution resumes on the UI thread here, so no BeginInvoke is needed.
                if (!IsDisposed)
                {
                    richTextBox1.Text = text;
                }
            }
            finally
            {
                // Always hide the busy indicator, even when OCR throws.
                if (!picloading.IsDisposed)
                {
                    picloading.Visible = false;
                }
            }
        }
    }
}