Here was my process. I first had to rasterize PDFs (which may not be your requirement)
1.) Install Ghostscript 9.26 from here; later versions don't work with the next step.
2.) Install the Ghostscript.NET NuGet package: Install-Package Ghostscript.NET -Version 1.2.1
3.) Install the Tesseract NuGet package: Install-Package Tesseract -Version 3.3.0
Here is my PDF rasterization routine, using Ghostscript.NET
/// <summary>
/// Rasterizes every page of a PDF with Ghostscript.NET at 150 x 150 DPI and returns
/// each page as a TIFF image held in memory.
/// </summary>
/// <param name="pdfFile">The PDF file to rasterize.</param>
/// <param name="workingDir">Directory that also receives a TIFF copy of each page, so the output can be inspected visually.</param>
/// <param name="fileNamingToken">File-name prefix for the copies written to <paramref name="workingDir"/>; the page number is appended.</param>
/// <param name="_logger">Receives diagnostic output (image dimensions, Ghostscript failures).</param>
/// <returns>
/// One stream per page, in page order. The list is empty when Ghostscript could not open the PDF.
/// The streams are not rewound after writing, and the caller is responsible for disposing them.
/// </returns>
public static List<MemoryStream> GetPdfImages(FileInfo pdfFile, DirectoryInfo workingDir, string fileNamingToken, TextWriter _logger)
{
    const int desiredXDpi = 150;
    const int desiredYDpi = 150;

    var streams = new List<MemoryStream>();
    using (var rasterizer = new GhostscriptRasterizer())
    {
        GhostscriptVersionInfo gsVersionInfo = GhostscriptVersionInfo.GetLastInstalledVersion(GhostscriptLicense.GPL | GhostscriptLicense.AFPL, GhostscriptLicense.GPL);
        try
        {
            rasterizer.Open(pdfFile.FullName, gsVersionInfo, true);
        }
        catch (Ghostscript.NET.GhostscriptAPICallException exc)
        {
            _logger.WriteLine("There is an issue with this version of Ghostscript or how Ghostscript was installed. As of Winter 2020, GS 9.26 will work the best with Ghostscript.NET");
            _logger.WriteLine(exc.Message);
            // The document never opened, so there are no pages to read; stop here
            // instead of querying a rasterizer that is in an unopened state.
            return streams;
        }

        for (var pageNumber = 1; pageNumber <= rasterizer.PageCount; pageNumber++)
        {
            // The rendered page wraps a native GDI+ handle; release it as soon as it has been encoded.
            using (var img = rasterizer.GetPage(desiredXDpi, desiredYDpi, pageNumber))
            {
                // Save to a memory stream to be returned.
                var memoryStrm = new MemoryStream();
                img.Save(memoryStrm, System.Drawing.Imaging.ImageFormat.Tiff);

                // Also save to the file system to see how well it's working. The format is passed
                // explicitly because Image.Save(string) does not pick the encoder from the extension.
                string debugPath = Path.Combine(workingDir.FullName, $"{fileNamingToken}_{pageNumber}.TIF");
                img.Save(debugPath, System.Drawing.Imaging.ImageFormat.Tiff);

                _logger.WriteLine($"Image Dimensions: {img.Width} x {img.Height}");
                streams.Add(memoryStrm);
            }
        }
    }

    return streams;
}
Once I've created a list of MemoryStreams, I loop through them and OCR a rectangle out of each one with Tesseract. If you have a lot of files to process, you shouldn't create the engine over and over again; keep a single instance around somewhere else and reuse it.
// OCR the top half of every rasterized page and report pages whose text matches the pattern.
// The engine is created once and reused for every page.
// NOTE(review): the engine is not disposed here; whoever owns it should dispose it when done.
var _engine = new TesseractEngine("./tessdata", "eng", EngineMode.Default, "letters");
var topHalfPageRect = Rect.FromCoords(1, 1, 1275, 825);//at 150 DPI, get top of 8.5x11 page
for (int i = 0; i < _streams.Count; i++)
{
    // Stacked usings release the page, the Pix and finally the stream in that order,
    // even when loading or OCR throws, so a failing page cannot leak its stream.
    using (var imgStm = _streams[i])//my list of MemoryStreams created by Ghostscript 9.26
    // ToArray() copies the whole buffer regardless of Position, so no rewind is needed.
    using (var imageWithText = Pix.LoadTiffFromMemory(imgStm.ToArray()))
    using (var page = _engine.Process(imageWithText, topHalfPageRect, PageSegMode.SparseText))
    {
        var text = page.GetText();
        // Collapse the recognized text onto one line before matching.
        var processedText = text.Replace("\n", "").Trim();
        Console.WriteLine(processedText);
        if (MyRegexPatterns.Pattern1.IsMatch(processedText))
        {
            Console.WriteLine("*** FOUND IT!! ***");
        }
    }
}