PDF: Search and extract text in PDF document
In This Topic
The TextRegion class is intended for searching and extracting text from a whole PDF page or from a region of a PDF page.
A text region which represents a whole PDF page can be obtained using
PdfPage.TextRegion property.
A text region which represents some region of PDF page can be obtained using
TextRegion.GetSubregion method.
IMPORTANT! All coordinates, which define text location on PDF page, are specified in the coordinate system of PDF page.
All sizes, which define sizes of text regions, are specified in units of measure of PDF page.
Information about coordinate system and measurement units of PDF page is available
here.
Text search
The TextRegion class allows you to:
- search text in a whole page or in a region of page
- search for case sensitive or case insensitive text
- specify the direction of text search
- use regular expressions in text search
- define custom text search algorithm
Here is an example that demonstrates how to find a text in a PDF page:
/// <summary>
/// Finds the first occurrence of the specified text on a page of PDF document
/// using a search engine created for case-insensitive search.
/// </summary>
/// <param name="document">PDF document that contains the page.</param>
/// <param name="pageIndex">Zero-based index of the page in the document.</param>
/// <param name="text">Text that must be found.</param>
/// <returns>The text region returned by the search, started from index 0 of the page text.</returns>
public static Vintasoft.Imaging.Text.TextRegion FindTextOnPdfPage(
    Vintasoft.Imaging.Pdf.PdfDocument document,
    int pageIndex,
    string text)
{
    // create the search engine; "true" requests non-case sensitive search
    Vintasoft.Imaging.Text.TextSearchEngine searchEngine =
        Vintasoft.Imaging.Text.TextSearchEngine.Create(text, true);

    // search from the beginning of the page text;
    // the engine (not the raw string) is passed so that the case-insensitive setting is applied
    int startIndex = 0;
    return document.Pages[pageIndex].TextRegion.FindText(searchEngine, ref startIndex, false);
}
''' <summary>
''' Finds the first occurrence of the specified text on a page of PDF document
''' using a search engine created for case-insensitive search.
''' </summary>
''' <param name="document">PDF document that contains the page.</param>
''' <param name="pageIndex">Zero-based index of the page in the document.</param>
''' <param name="text">Text that must be found.</param>
''' <returns>The text region returned by the search, started from index 0 of the page text.</returns>
Public Shared Function FindTextOnPdfPage(document As Vintasoft.Imaging.Pdf.PdfDocument, pageIndex As Integer, text As String) As Vintasoft.Imaging.Text.TextRegion
    ' create the search engine; "True" requests non-case sensitive search
    Dim searchEngine As Vintasoft.Imaging.Text.TextSearchEngine = Vintasoft.Imaging.Text.TextSearchEngine.Create(text, True)

    ' search from the beginning of the page text;
    ' the engine (not the raw string) is passed so that the case-insensitive setting is applied
    Dim startIndex As Integer = 0
    Return document.Pages(pageIndex).TextRegion.FindText(searchEngine, startIndex, False)
End Function
Here is an example that demonstrates how to search for text in PDF document using a regular expression:
/// <summary>
/// Writes to the console the text and rectangle of every digit sequence
/// found on every page of PDF document.
/// </summary>
/// <param name="document">PDF document where digits should be searched.</param>
public void SearchDigitsInTextOfPdfDocument(Vintasoft.Imaging.Pdf.PdfDocument document)
{
    System.Console.WriteLine("Searching the digits in text of PDF document is started.");

    // the pattern is the same for all pages, so the regular expression is created only once
    System.Text.RegularExpressions.Regex digitsRegex =
        new System.Text.RegularExpressions.Regex(@"\d+");

    for (int i = 0; i < document.Pages.Count; i++)
    {
        // find all digit sequences on the current page
        Vintasoft.Imaging.Text.TextRegion[] textRegions =
            SimpleDigitsSearchOnPdfPage(document.Pages[i], digitsRegex);
        if (textRegions != null)
        {
            // output information about each found text region
            for (int j = 0; j < textRegions.Length; j++)
            {
                System.Console.WriteLine(string.Format("- Text={0}, Rectangle={1}",
                    textRegions[j].TextContent,
                    textRegions[j].Rectangle));
            }
        }
    }

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.");
}
/// <summary>
/// Searches all occurrences of a text, defined with regular expression, on PDF page.
/// </summary>
/// <param name="page">PDF page where text should be searched.</param>
/// <param name="regex">Regular expression which defines the searching text.</param>
/// <returns>
/// An array of text regions on PDF page where text was found;
/// an empty array if nothing was found.
/// </returns>
public Vintasoft.Imaging.Text.TextRegion[] SimpleDigitsSearchOnPdfPage(
    Vintasoft.Imaging.Pdf.Tree.PdfPage page,
    System.Text.RegularExpressions.Regex regex)
{
    System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion> foundRegions =
        new System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion>();

    Vintasoft.Imaging.Text.TextSearchEngine textSearchEngine =
        Vintasoft.Imaging.Text.TextSearchEngine.Create(regex);

    int startIndex = 0;
    while (true)
    {
        // search the next occurrence starting from the current index
        Vintasoft.Imaging.Text.TextRegion textRegion =
            page.TextRegion.FindText(textSearchEngine, ref startIndex, false);

        // null means that there are no more occurrences
        if (textRegion == null)
            break;

        // add result
        foundRegions.Add(textRegion);

        // shift start index past the found text; always move at least one character forward,
        // otherwise a pattern that can match an empty string (for example "\d*")
        // would keep the index unchanged and the loop would never finish
        // NOTE(review): assumes FindText can return a region with empty text - confirm
        startIndex += System.Math.Max(1, textRegion.TextContent.Length);
    }

    return foundRegions.ToArray();
}
''' <summary>
''' Writes to the console the text and rectangle of every digit sequence
''' found on every page of PDF document.
''' </summary>
''' <param name="document">PDF document where digits should be searched.</param>
Public Sub SearchDigitsInTextOfPdfDocument(document As Vintasoft.Imaging.Pdf.PdfDocument)
    System.Console.WriteLine("Searching the digits in text of PDF document is started.")

    ' the pattern is the same for all pages, so the regular expression is created only once
    Dim digitsRegex As New System.Text.RegularExpressions.Regex("\d+")

    For i As Integer = 0 To document.Pages.Count - 1
        ' find all digit sequences on the current page
        Dim textRegions As Vintasoft.Imaging.Text.TextRegion() = SimpleDigitsSearchOnPdfPage(document.Pages(i), digitsRegex)
        If textRegions IsNot Nothing Then
            ' output information about each found text region
            For j As Integer = 0 To textRegions.Length - 1
                System.Console.WriteLine(String.Format("- Text={0}, Rectangle={1}", textRegions(j).TextContent, textRegions(j).Rectangle))
            Next
        End If
    Next

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.")
End Sub
''' <summary>
''' Searches all occurrences of a text, defined with regular expression, on PDF page.
''' </summary>
''' <param name="page">PDF page where text should be searched.</param>
''' <param name="regex">Regular expression which defines the searching text.</param>
''' <returns>
''' An array of text regions on PDF page where text was found;
''' an empty array if nothing was found.
''' </returns>
Public Function SimpleDigitsSearchOnPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage, regex As System.Text.RegularExpressions.Regex) As Vintasoft.Imaging.Text.TextRegion()
    Dim foundRegions As New System.Collections.Generic.List(Of Vintasoft.Imaging.Text.TextRegion)()
    Dim textSearchEngine As Vintasoft.Imaging.Text.TextSearchEngine = Vintasoft.Imaging.Text.TextSearchEngine.Create(regex)

    Dim startIndex As Integer = 0
    Do
        ' search the next occurrence starting from the current index
        Dim textRegion As Vintasoft.Imaging.Text.TextRegion = page.TextRegion.FindText(textSearchEngine, startIndex, False)

        ' Nothing means that there are no more occurrences
        If textRegion Is Nothing Then
            Exit Do
        End If

        ' add result
        foundRegions.Add(textRegion)

        ' shift start index past the found text; always move at least one character forward,
        ' otherwise a pattern that can match an empty string (for example "\d*")
        ' would keep the index unchanged and the loop would never finish
        ' NOTE(review): assumes FindText can return a region with empty text - confirm
        startIndex += System.Math.Max(1, textRegion.TextContent.Length)
    Loop

    Return foundRegions.ToArray()
End Function
Here is an example that demonstrates how to search for text in a PDF document using a user-defined text search algorithm:
/// <summary>
/// Prints, for each page of PDF document, the text and rectangle of every
/// digit sequence found with the custom digits search engine.
/// </summary>
/// <param name="document">PDF document where digits should be searched.</param>
public void SearchDigitsInTextOfPdfDocumentUsingTextSearchEngine(Vintasoft.Imaging.Pdf.PdfDocument document)
{
    System.Console.WriteLine("Searching the digits in text of PDF document.");

    int pageIndex = 0;
    while (pageIndex < document.Pages.Count)
    {
        // collect the digit sequences of the current page
        Vintasoft.Imaging.Text.TextRegion[] pageMatches =
            AdvancedDigitsSearchOnPdfPage(document.Pages[pageIndex]);

        if (pageMatches != null)
        {
            // report every match of the page
            int matchIndex = 0;
            while (matchIndex < pageMatches.Length)
            {
                string reportLine = string.Format("- Text={0}, Rectangle={1}",
                    pageMatches[matchIndex].TextContent,
                    pageMatches[matchIndex].Rectangle);
                System.Console.WriteLine(reportLine);
                matchIndex++;
            }
        }

        pageIndex++;
    }

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.");
}
/// <summary>
/// Collects all digit sequences of PDF page using the custom digits search engine.
/// </summary>
/// <param name="page">PDF page where digits should be searched.</param>
/// <returns>An array of text regions on PDF page where digits were found.</returns>
public Vintasoft.Imaging.Text.TextRegion[] AdvancedDigitsSearchOnPdfPage(
    Vintasoft.Imaging.Pdf.Tree.PdfPage page)
{
    System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion> matches =
        new System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion>();
    DigitsSearchEngine engine = new DigitsSearchEngine();

    // position in the page text from which the next search begins
    int searchPosition = 0;

    // repeat the search until the engine finds nothing more
    for (Vintasoft.Imaging.Text.TextRegion match = page.TextRegion.FindText(engine, ref searchPosition, false);
         match != null;
         match = page.TextRegion.FindText(engine, ref searchPosition, false))
    {
        // remember the match
        matches.Add(match);
        // shift the search position by the length of the found text
        searchPosition += match.TextContent.Length;
    }

    return matches.ToArray();
}
/// <summary>
/// Text search engine that finds a sequence of consecutive digits in text of PDF page.
/// </summary>
class DigitsSearchEngine : Vintasoft.Imaging.Text.TextSearchEngine
{
    /// <summary>
    /// Searches the first sequence of consecutive digits in the specified range of the source string.
    /// </summary>
    /// <param name="sourceString">Source string (string of PDF page) where text must be searched.</param>
    /// <param name="startIndex">The zero-based index, in the sourceString, from which text must be searched.</param>
    /// <param name="length">The number of characters, in the sourceString, to analyze.</param>
    /// <param name="rightToLeft">Indicates that text should be searched from right to left.</param>
    /// <returns>
    /// Vintasoft.Imaging.Text.TextSearchResult object, created from the start index and
    /// the length of the found digit sequence, if digits are found; otherwise, null.
    /// </returns>
    public override Vintasoft.Imaging.Text.TextSearchResult Find(
        string sourceString, int startIndex, int length, bool rightToLeft)
    {
        // analyzed range of the source string is [rangeStart, rangeEnd)
        int rangeStart = startIndex;
        int rangeEnd = startIndex + length;

        // bounds of the found digit sequence: [startDigitIndex, endDigitIndex)
        int startDigitIndex = -1;
        int endDigitIndex = -1;

        // if searching text from the right to the left
        if (rightToLeft)
        {
            // scan only the analyzed range, i.e. do not go below rangeStart
            for (int index = rangeEnd - 1; index >= rangeStart; index--)
            {
                bool isDigit = char.IsDigit(sourceString[index]);
                if (isDigit && endDigitIndex == -1)
                {
                    // the last digit of the sequence is found
                    endDigitIndex = index + 1;
                }
                else if (!isDigit && endDigitIndex != -1)
                {
                    // the first non-digit before the sequence is found
                    startDigitIndex = index + 1;
                    break;
                }
            }
            // if the sequence reaches the beginning of the analyzed range
            if (endDigitIndex != -1 && startDigitIndex == -1)
                startDigitIndex = rangeStart;
        }
        // if searching text from the left to the right
        else
        {
            for (int index = rangeStart; index < rangeEnd; index++)
            {
                bool isDigit = char.IsDigit(sourceString[index]);
                if (isDigit && startDigitIndex == -1)
                {
                    // the first digit of the sequence is found
                    startDigitIndex = index;
                }
                else if (!isDigit && startDigitIndex != -1)
                {
                    // the first non-digit after the sequence is found
                    endDigitIndex = index;
                    break;
                }
            }
            // if the sequence reaches the end of the analyzed range
            if (startDigitIndex != -1 && endDigitIndex == -1)
                endDigitIndex = rangeEnd;
        }

        // if digit is not found
        if (startDigitIndex == -1)
            return null;

        // return the text search result: start index and length of the digit sequence
        return new Vintasoft.Imaging.Text.TextSearchResult(
            startDigitIndex, endDigitIndex - startDigitIndex);
    }
}
''' <summary>
''' Prints, for each page of PDF document, the text and rectangle of every
''' digit sequence found with the custom digits search engine.
''' </summary>
''' <param name="document">PDF document where digits should be searched.</param>
Public Sub SearchDigitsInTextOfPdfDocumentUsingTextSearchEngine(document As Vintasoft.Imaging.Pdf.PdfDocument)
    System.Console.WriteLine("Searching the digits in text of PDF document.")

    Dim lastPageIndex As Integer = document.Pages.Count - 1
    For pageIndex As Integer = 0 To lastPageIndex
        ' collect the digit sequences of the current page
        Dim pageMatches As Vintasoft.Imaging.Text.TextRegion() = AdvancedDigitsSearchOnPdfPage(document.Pages(pageIndex))

        If pageMatches IsNot Nothing Then
            ' report every match of the page
            Dim lastMatchIndex As Integer = pageMatches.Length - 1
            For matchIndex As Integer = 0 To lastMatchIndex
                Dim reportLine As String = String.Format("- Text={0}, Rectangle={1}", pageMatches(matchIndex).TextContent, pageMatches(matchIndex).Rectangle)
                System.Console.WriteLine(reportLine)
            Next
        End If
    Next

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.")
End Sub
''' <summary>
''' Collects all digit sequences of PDF page using the custom digits search engine.
''' </summary>
''' <param name="page">PDF page where digits should be searched.</param>
''' <returns>An array of text regions on PDF page where digits were found.</returns>
Public Function AdvancedDigitsSearchOnPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage) As Vintasoft.Imaging.Text.TextRegion()
    Dim matches As New System.Collections.Generic.List(Of Vintasoft.Imaging.Text.TextRegion)()
    Dim engine As New DigitsSearchEngine()

    ' position in the page text from which the next search begins
    Dim searchPosition As Integer = 0

    ' repeat the search until the engine finds nothing more
    Dim match As Vintasoft.Imaging.Text.TextRegion = page.TextRegion.FindText(engine, searchPosition, False)
    While match IsNot Nothing
        ' remember the match
        matches.Add(match)
        ' shift the search position by the length of the found text
        searchPosition += match.TextContent.Length
        match = page.TextRegion.FindText(engine, searchPosition, False)
    End While

    Return matches.ToArray()
End Function
''' <summary>
''' Text search engine that finds a sequence of consecutive digits in text of PDF page.
''' </summary>
Private Class DigitsSearchEngine
    Inherits Vintasoft.Imaging.Text.TextSearchEngine

    ''' <summary>
    ''' Searches the first sequence of consecutive digits in the specified range of the source string.
    ''' </summary>
    ''' <param name="sourceString">Source string (string of PDF page) where text must be searched.</param>
    ''' <param name="startIndex">The zero-based index, in the sourceString, from which text must be searched.</param>
    ''' <param name="length">The number of characters, in the sourceString, to analyze.</param>
    ''' <param name="rightToLeft">Indicates that text should be searched from right to left.</param>
    ''' <returns>
    ''' Vintasoft.Imaging.Text.TextSearchResult object, created from the start index and
    ''' the length of the found digit sequence, if digits are found; otherwise, null.
    ''' </returns>
    Public Overrides Function Find(sourceString As String, startIndex As Integer, length As Integer, rightToLeft As Boolean) As Vintasoft.Imaging.Text.TextSearchResult
        ' analyzed range of the source string is [rangeStart, rangeEnd)
        Dim rangeStart As Integer = startIndex
        Dim rangeEnd As Integer = startIndex + length

        ' bounds of the found digit sequence: [startDigitIndex, endDigitIndex)
        Dim startDigitIndex As Integer = -1
        Dim endDigitIndex As Integer = -1

        If rightToLeft Then
            ' if searching text from the right to the left:
            ' scan only the analyzed range, i.e. do not go below rangeStart
            For index As Integer = rangeEnd - 1 To rangeStart Step -1
                Dim isDigit As Boolean = Char.IsDigit(sourceString(index))
                If isDigit AndAlso endDigitIndex = -1 Then
                    ' the last digit of the sequence is found
                    endDigitIndex = index + 1
                ElseIf Not isDigit AndAlso endDigitIndex <> -1 Then
                    ' the first non-digit before the sequence is found
                    startDigitIndex = index + 1
                    Exit For
                End If
            Next
            ' if the sequence reaches the beginning of the analyzed range
            If endDigitIndex <> -1 AndAlso startDigitIndex = -1 Then
                startDigitIndex = rangeStart
            End If
        Else
            ' if searching text from the left to the right
            For index As Integer = rangeStart To rangeEnd - 1
                Dim isDigit As Boolean = Char.IsDigit(sourceString(index))
                If isDigit AndAlso startDigitIndex = -1 Then
                    ' the first digit of the sequence is found
                    startDigitIndex = index
                ElseIf Not isDigit AndAlso startDigitIndex <> -1 Then
                    ' the first non-digit after the sequence is found
                    endDigitIndex = index
                    Exit For
                End If
            Next
            ' if the sequence reaches the end of the analyzed range
            If startDigitIndex <> -1 AndAlso endDigitIndex = -1 Then
                endDigitIndex = rangeEnd
            End If
        End If

        ' if digit is not found
        If startDigitIndex = -1 Then
            Return Nothing
        End If

        ' return the text search result: start index and length of the digit sequence
        Return New Vintasoft.Imaging.Text.TextSearchResult(startDigitIndex, endDigitIndex - startDigitIndex)
    End Function
End Class
Text extraction
The TextRegion class allows you to extract text from a whole PDF page or from a region of a PDF page.
When extracting text from a page region, it is necessary to specify how the text must be extracted. The SDK allows you to extract text:
- by full lines, i.e. all text lines that are fully or partially located in the specified region are extracted.
- strictly from the specified region, i.e. only the text that is located inside the specified region is extracted.
By default the text is extracted by full lines.
Here is an example that demonstrates how to extract all text from the whole PDF page:
/// <summary>
/// Returns the text content of the text region of the specified page of PDF document.
/// </summary>
/// <param name="document">PDF document that contains the page.</param>
/// <param name="pageIndex">Index of the page in the page collection of the document.</param>
/// <returns>The text content of the page text region.</returns>
public static string ExtractTextFromPdfPage(Vintasoft.Imaging.Pdf.PdfDocument document, int pageIndex)
{
    // the text region of a page represents the text of the whole page
    string pageText = document.Pages[pageIndex].TextRegion.TextContent;
    return pageText;
}
''' <summary>
''' Returns the text content of the text region of the specified page of PDF document.
''' </summary>
''' <param name="document">PDF document that contains the page.</param>
''' <param name="pageIndex">Index of the page in the page collection of the document.</param>
''' <returns>The text content of the page text region.</returns>
Public Shared Function ExtractTextFromPdfPage(document As Vintasoft.Imaging.Pdf.PdfDocument, pageIndex As Integer) As String
    ' the text region of a page represents the text of the whole page
    Dim pageText As String = document.Pages(pageIndex).TextRegion.TextContent
    Return pageText
End Function
In addition, the
TextRegion class allows you to extract text from a PDF page as a tree structure, i.e. it is possible to obtain
a region representing all text of whole page -
PdfPage.TextRegion, then all text lines -
TextRegion.Lines,
then all symbols of the text line -
TextRegionLine.Symbols.