// Code sample: convert a multipage TIFF file to a searchable PDF document.
using System.Collections.Generic;
using System.IO;
using Vintasoft.Imaging;
using Vintasoft.Imaging.ImageProcessing.Document;
using Vintasoft.Imaging.ImageProcessing.Info;
using Vintasoft.Imaging.Ocr;
using Vintasoft.Imaging.Ocr.Results;
using Vintasoft.Imaging.Ocr.Tesseract;
using Vintasoft.Imaging.Pdf;
using Vintasoft.Imaging.Pdf.Ocr;
namespace ConvertMultipageTiffToSearchablePdfDocument
{
    /// <summary>
    /// Console sample: recognizes text on every page of a multipage TIFF file with
    /// Tesseract OCR and saves the pages, together with the recognized text, as a
    /// PDF document.
    /// </summary>
    class Program
    {
        /// <summary>Path passed to the <c>TesseractOcr</c> constructor.</summary>
        private const string TesseractOcrPath = @"..\..\TesseractOCR";

        /// <summary>Multipage TIFF file that is loaded into the image collection.</summary>
        private const string SourceTiffPath = "multipage.tif";

        /// <summary>PDF file that is created by the sample.</summary>
        private const string SearchablePdfPath = "searchable.pdf";

        /// <summary>Name of the TrueType font requested for the text of the PDF document.</summary>
        private const string TextFontName = "Times New Roman";

        /// <summary>
        /// Loads the TIFF file, recognizes text on each of its pages and writes the PDF document.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            // create Tesseract OCR
            using (TesseractOcr tesseractOcr = new TesseractOcr(TesseractOcrPath))
            {
                // create OCR engine manager and OCR settings
                OcrEngineManager engineManager = new OcrEngineManager(tesseractOcr);
                OcrEngineSettings ocrSettings = new OcrEngineSettings(OcrLanguage.Turkish);

                // create image collection
                ImageCollection images = new ImageCollection();
                try
                {
                    // add multipage TIFF file to the image collection
                    images.Add(SourceTiffPath);

                    // create a dictionary: image => OCR page
                    Dictionary<VintasoftImage, OcrPage> imagesToOcrPages =
                        new Dictionary<VintasoftImage, OcrPage>();

                    // recognize text on each image of the collection
                    for (int i = 0; i < images.Count; i++)
                    {
                        VintasoftImage image = images[i];
                        imagesToOcrPages.Add(image, RecognizePage(engineManager, ocrSettings, image));
                    }

                    SaveSearchablePdf(images, imagesToOcrPages);
                }
                finally
                {
                    // clear image collection and dispose images, even if OCR or PDF creation failed
                    images.ClearAndDisposeItems();
                }
            }
        }

        /// <summary>
        /// Prepares a page image for OCR and recognizes the text on it.
        /// </summary>
        /// <param name="engineManager">OCR engine manager used for recognition.</param>
        /// <param name="ocrSettings">OCR settings passed to the engine manager.</param>
        /// <param name="image">
        /// Page image. It is modified in place by the halftone removal, border clear,
        /// hole punch removal and deskew commands, as in the original sample.
        /// </param>
        /// <returns>The OCR page returned by <c>OcrEngineManager.Recognize</c>.</returns>
        private static OcrPage RecognizePage(
            OcrEngineManager engineManager,
            OcrEngineSettings ocrSettings,
            VintasoftImage image)
        {
            // These commands are applied to the page image itself, so their result is
            // also what is added to the PDF document.
            // remove halftone from image
            new HalftoneRemovalCommand().ExecuteInPlace(image);
            // clear border on image
            new BorderClearCommand().ExecuteInPlace(image);
            // remove hole punches from image
            new HolePunchRemovalCommand().ExecuteInPlace(image);
            // deskew image
            new DeskewCommand().ExecuteInPlace(image);

            // Clone AFTER the commands above: the previous code cloned first, deskewed only
            // the source image, detected regions on the source image and then recognized
            // the un-deskewed clone, so the regions did not belong to the recognized image.
            using (VintasoftImage ocrImage = (VintasoftImage)image.Clone())
            {
                // The commands below run on the clone only, so the page image that goes
                // into the PDF document keeps its tables and is not despeckled.
                // remove tables from the OCR image
                new LineRemovalCommand(RemovingLinesType.Tables).ExecuteInPlace(ocrImage);
                // remove noise from the OCR image
                new DespeckleCommand().ExecuteInPlace(ocrImage);

                // detect regions (text, images, etc) on the same image that is recognized
                DocumentSegmentationCommand documentSegmentationCommand = new DocumentSegmentationCommand();
                documentSegmentationCommand.BorderSize = 30;
                documentSegmentationCommand.ExecuteInPlace(ocrImage);

                // recognize text in image regions
                return engineManager.Recognize(ocrImage, ocrSettings, documentSegmentationCommand.Regions);
            }
        }

        /// <summary>
        /// Creates the PDF document and adds each image, with its recognized text if
        /// available, as a new page.
        /// </summary>
        /// <param name="images">Images to add as PDF pages, in collection order.</param>
        /// <param name="imagesToOcrPages">Recognized OCR page for each image.</param>
        private static void SaveSearchablePdf(
            ImageCollection images,
            Dictionary<VintasoftImage, OcrPage> imagesToOcrPages)
        {
            // create new PDF document
            using (PdfDocument document = new PdfDocument(SearchablePdfPath, System.IO.FileMode.Create, PdfFormat.Pdf_14))
            {
                // create PDF document builder
                PdfDocumentBuilder documentBuilder = new PdfDocumentBuilder(document);
                // specify that PDF document must contain image over text
                documentBuilder.PageCreationMode = PdfPageCreationMode.ImageOverText;

                // create file font programs controller
                FileFontProgramsController fileFontProgramsController = new FileFontProgramsController(true);
                // get a stream with the font program; "using" accepts a null resource and
                // disposes the stream even if font creation throws
                using (Stream fontStream = fileFontProgramsController.GetTrueTypeFontProgram(null, TextFontName))
                {
                    // if stream with font program is found
                    if (fontStream != null)
                    {
                        // set the font as text font in PDF document
                        documentBuilder.Font = document.FontManager.CreateCIDFontFromTrueTypeFont(fontStream);
                    }
                }

                // for each image in image collection
                for (int i = 0; i < images.Count; i++)
                {
                    VintasoftImage image = images[i];
                    OcrPage ocrPage;
                    if (imagesToOcrPages.TryGetValue(image, out ocrPage))
                    {
                        // add image with text as a new page of PDF document
                        documentBuilder.AddPage(image, ocrPage);
                    }
                    else
                    {
                        // add image as a new page of PDF document
                        documentBuilder.AddPage(image, null);
                    }
                }

                // pack fonts in PDF document for removing unused characters from fonts
                document.FontManager.PackAllFonts();
                // pack PDF document
                document.Pack();
            }
        }
    }
}