public class PDFTextStripperByArea extends PDFTextStripper
Modifier and Type | Field and Description |
---|---|
private java.util.Map<java.lang.String,java.awt.geom.Rectangle2D> |
regionArea |
private java.util.Map<java.lang.String,java.util.ArrayList<java.util.List<TextPosition>>> |
regionCharacterList |
private java.util.List<java.lang.String> |
regions |
private java.util.Map<java.lang.String,java.io.StringWriter> |
regionText |
charactersByArticle, document, LINE_SEPARATOR, output
Constructor and Description |
---|
PDFTextStripperByArea()
Constructor.
|
Modifier and Type | Method and Description |
---|---|
void |
addRegion(java.lang.String regionName,
java.awt.geom.Rectangle2D rect)
Add a new region to group text by.
|
void |
extractRegions(PDPage page)
Process the page to extract the region text.
|
java.util.List<java.lang.String> |
getRegions()
Get the list of regions that have been set up.
|
java.lang.String |
getTextForRegion(java.lang.String regionName)
Get the text for the region; this should be called after extractRegions().
|
protected void |
processTextPosition(TextPosition text)
This will process a TextPosition object and add the text to the list of characters on a page.
|
void |
removeRegion(java.lang.String regionName)
Delete a region to group text by.
|
void |
setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
This method does nothing in this derived class, because beads and regions are incompatible.
|
protected void |
writePage()
This will print the processed page text to the output stream.
|
endArticle, endDocument, endPage, getAddMoreFormatting, getArticleEnd, getArticleStart, getAverageCharTolerance, getCharactersByArticle, getCurrentPageNo, getDropThreshold, getEndBookmark, getEndPage, getIndentThreshold, getLineSeparator, getListItemPatterns, getOutput, getPageEnd, getPageStart, getParagraphEnd, getParagraphStart, getSeparateByBeads, getSortByPosition, getSpacingTolerance, getStartBookmark, getStartPage, getSuppressDuplicateOverlappingText, getText, getWordSeparator, matchPattern, processPage, processPages, setAddMoreFormatting, setArticleEnd, setArticleStart, setAverageCharTolerance, setDropThreshold, setEndBookmark, setEndPage, setIndentThreshold, setLineSeparator, setListItemPatterns, setPageEnd, setPageStart, setParagraphEnd, setParagraphStart, setSortByPosition, setSpacingTolerance, setStartBookmark, setStartPage, setSuppressDuplicateOverlappingText, setWordSeparator, startArticle, startArticle, startDocument, startPage, writeCharacters, writeLineSeparator, writePageEnd, writePageStart, writeParagraphEnd, writeParagraphSeparator, writeParagraphStart, writeString, writeString, writeText, writeWordSeparator
showGlyph
addOperator, applyTextAdjustment, beginMarkedContentSequence, beginText, decreaseLevel, endMarkedContentSequence, endText, getAppearance, getCurrentPage, getGraphicsStackSize, getGraphicsState, getInitialMatrix, getLevel, getResources, getTextLineMatrix, getTextMatrix, increaseLevel, operatorException, processAnnotation, processChildStream, processOperator, processOperator, processSoftMask, processTilingPattern, processTilingPattern, processTransparencyGroup, processType3Stream, registerOperatorProcessor, restoreGraphicsStack, restoreGraphicsState, saveGraphicsStack, saveGraphicsState, setLineDashPattern, setTextLineMatrix, setTextMatrix, showAnnotation, showFontGlyph, showForm, showText, showTextString, showTextStrings, showTransparencyGroup, showType3Glyph, transformedPoint, transformWidth, unsupportedOperator
private final java.util.List<java.lang.String> regions
private final java.util.Map<java.lang.String,java.awt.geom.Rectangle2D> regionArea
private final java.util.Map<java.lang.String,java.util.ArrayList<java.util.List<TextPosition>>> regionCharacterList
private final java.util.Map<java.lang.String,java.io.StringWriter> regionText
public PDFTextStripperByArea() throws java.io.IOException
java.io.IOException
- If there is an error loading properties.
public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
setShouldSeparateByBeads
in class PDFTextStripper
aShouldSeparateByBeads
- The new grouping of beads.
public void addRegion(java.lang.String regionName, java.awt.geom.Rectangle2D rect)
regionName
- The name of the region.
rect
- The rectangle area to retrieve the text from. The y-coordinates are Java
coordinates (y == 0 is top), not PDF coordinates (y == 0 is bottom).
public void removeRegion(java.lang.String regionName)
regionName
- The name of the region to delete.
public java.util.List<java.lang.String> getRegions()
public java.lang.String getTextForRegion(java.lang.String regionName)
regionName
- The name of the region to get the text from.
public void extractRegions(PDPage page) throws java.io.IOException
page
- The page to extract the regions from.
java.io.IOException
- If there is an error while extracting text.
protected void processTextPosition(TextPosition text)
processTextPosition
in class PDFTextStripper
text
- The text to process.
protected void writePage() throws java.io.IOException
writePage
in class PDFTextStripper
java.io.IOException
- If there is an error writing the text.