public class COSParser extends BaseParser
PDFParser.parse() or FDFParser.parse() must be called before page objects
can be retrieved, e.g. via PDFParser.getPDDocument().
This class is a much enhanced version of QuickParser
presented in PDFBOX-1104 by Jeremy Villalobos.

Modifier and Type | Field and Description |
---|---|
private AccessPermission |
accessPermission |
private java.util.Map<COSObjectKey,java.lang.Long> |
bfSearchCOSObjectKeyOffsets
Contains all found objects of a brute force search.
|
private java.util.List<java.lang.Long> |
bfSearchXRefStreamsOffsets |
private java.util.List<java.lang.Long> |
bfSearchXRefTablesOffsets |
private static int |
DEFAULT_TRAIL_BYTECOUNT
How many trailing bytes to read for EOF marker.
|
private PDEncryption |
encryption |
private static byte[] |
ENDOBJ |
private static byte[] |
ENDSTREAM |
protected static char[] |
EOF_MARKER
EOF-marker.
|
private static java.lang.String |
FDF_DEFAULT_VERSION |
private static java.lang.String |
FDF_HEADER |
protected long |
fileLen
file length.
|
protected boolean |
initialParseDone |
private boolean |
isLenient
Whether the parser uses its auto-healing capability.
|
private java.lang.String |
keyAlias |
private java.io.InputStream |
keyStoreInputStream |
private java.lang.Long |
lastEOFMarker |
private static org.apache.commons.logging.Log |
LOG |
private static long |
MINIMUM_SEARCH_OFFSET |
protected static char[] |
OBJ_MARKER
obj-marker.
|
private static char[] |
OBJ_STREAM
ObjStream-marker.
|
private java.lang.String |
password |
private static java.lang.String |
PDF_DEFAULT_VERSION |
private static java.lang.String |
PDF_HEADER |
private int |
readTrailBytes
how many trailing bytes to read for EOF marker.
|
protected SecurityHandler |
securityHandler
The security handler.
|
protected RandomAccessRead |
source |
private static char[] |
STARTXREF |
private byte[] |
streamCopyBuf |
private static int |
STREAMCOPYBUFLEN |
private byte[] |
strmBuf |
private static int |
STRMBUFLEN |
static java.lang.String |
SYSPROP_EOFLOOKUPRANGE
The range within which the %%EOF marker will be searched.
|
static java.lang.String |
SYSPROP_PARSEMINIMAL
Only parse the PDF file minimally allowing access to basic information.
|
static java.lang.String |
TMP_FILE_PREFIX
The prefix for the temp file being used.
|
private static char[] |
TRAILER_MARKER
trailer-marker.
|
private long |
trailerOffset |
private boolean |
trailerWasRebuild |
private static int |
X |
private static char[] |
XREF_STREAM |
private static char[] |
XREF_TABLE |
protected XrefTrailerResolver |
xrefTrailerResolver
Collects all Xref/trailer objects and resolves them into single
object using startxref reference.
|
A, ASCII_CR, ASCII_LF, B, D, DEF, document, E, ENDOBJ_STRING, ENDSTREAM_STRING, J, M, MAX_LENGTH_LONG, N, O, R, S, seqSource, STREAM_STRING, T
Constructor and Description |
---|
COSParser(RandomAccessRead source)
Default constructor.
|
COSParser(RandomAccessRead source,
java.lang.String password,
java.io.InputStream keyStore,
java.lang.String keyAlias)
Constructor for encrypted pdfs.
|
Modifier and Type | Method and Description |
---|---|
private void |
addExcludedToList(COSName[] excludeObjects,
COSDictionary dict,
java.util.Set<java.lang.Long> parsedObjects) |
private void |
addNewToList(java.util.Queue<COSBase> toBeParsedList,
java.util.Collection<COSBase> newObjects,
java.util.Set<java.lang.Long> addedObjects)
Adds each element of newObjects to toBeParsedList if it is not a COSObject or
if that COSObject has not been added already (checked via addedObjects).
|
private void |
addNewToList(java.util.Queue<COSBase> toBeParsedList,
COSBase newObject,
java.util.Set<java.lang.Long> addedObjects)
Adds newObject to toBeParsedList if it is not a COSObject or if that COSObject
has not been added already (checked via addedObjects).
|
private void |
bfSearchForLastEOFMarker()
Brute force search for the last EOF marker.
|
private void |
bfSearchForObjects()
Brute force search for every object in the pdf.
|
private void |
bfSearchForObjStreams()
Brute force search for all object streams.
|
private boolean |
bfSearchForTrailer(COSDictionary trailer)
Brute force search for all trailer markers.
|
private long |
bfSearchForXRef(long xrefOffset,
boolean streamsOnly)
Search for the offset of the given xref table/stream among those found by a brute force search.
|
private void |
bfSearchForXRefStreams()
Brute force search for all /XRef entries (streams).
|
private void |
bfSearchForXRefTables()
Brute force search for all xref entries (tables).
|
private long |
calculateXRefFixedOffset(long objectOffset,
boolean streamsOnly)
Try to find a fixed offset for the given xref table/stream.
|
private boolean |
checkObjectKey(COSObjectKey objectKey,
long offset)
Check if the given object can be found at the given offset.
|
protected void |
checkPages(COSDictionary root)
Check if all entries of the pages dictionary are present.
|
private int |
checkPagesDictionary(COSDictionary pagesDict,
java.util.Set<COSObject> set) |
private long |
checkXRefOffset(long startXRefOffset)
Check if the cross reference table/stream can be found at the current offset.
|
private void |
checkXrefOffsets()
Check the XRef table by dereferencing all objects and fixing the offset if necessary.
|
private boolean |
checkXRefStreamOffset(long startXRefOffset)
Check if the cross reference stream can be found at the current offset.
|
AccessPermission |
getAccessPermission()
This will get the AccessPermission.
|
COSDocument |
getDocument()
This will get the document that was parsed.
|
PDEncryption |
getEncryption()
This will get the encryption dictionary.
|
private COSNumber |
getLength(COSBase lengthBaseObj,
COSName streamType)
Returns length value referred to or defined in given object.
|
private long |
getObjectId(COSObject obj)
Creates a unique object id using object number and object generation
number.
|
protected long |
getStartxrefOffset()
Looks for and parses startxref.
|
protected boolean |
isCatalog(COSDictionary dictionary)
Tell if the dictionary is a PDF catalog.
|
private boolean |
isInfo(COSDictionary dictionary)
Tell if the dictionary is an info dictionary.
|
boolean |
isLenient()
Return true if parser is lenient.
|
private boolean |
isString(byte[] string)
Checks if the given string can be found at the current offset.
|
private boolean |
isString(char[] string)
Checks if the given string can be found at the current offset.
|
protected int |
lastIndexOf(char[] pattern,
byte[] buf,
int endOff)
Searches last appearance of pattern within buffer.
|
protected COSStream |
parseCOSStream(COSDictionary dic)
This will read a COSStream from the input stream using length attribute within dictionary.
|
private void |
parseDictionaryRecursive(COSObject dictionaryObject)
Resolves all not already parsed objects of a dictionary recursively.
|
protected void |
parseDictObjects(COSDictionary dict,
COSName... excludeObjects)
Will parse every object necessary to load a single page from the pdf document.
|
protected boolean |
parseFDFHeader()
Parse the header of a fdf.
|
private void |
parseFileObject(java.lang.Long offsetOrObjstmObNr,
COSObjectKey objKey,
COSObject pdfObject) |
private boolean |
parseHeader(java.lang.String headerMarker,
java.lang.String defaultVersion) |
protected COSBase |
parseObjectDynamically(COSObject obj,
boolean requireExistingNotCompressedObj)
This will parse the next object from the stream and add it to the local state.
|
protected COSBase |
parseObjectDynamically(long objNr,
int objGenNr,
boolean requireExistingNotCompressedObj)
This will parse the next object from the stream and add it to the local state.
|
private void |
parseObjectStream(int objstmObjNr) |
protected boolean |
parsePDFHeader()
Parse the header of a pdf.
|
private long |
parseStartXref()
This will parse the startxref section from the stream.
|
private boolean |
parseTrailer()
This will parse the trailer from the stream and add it to the state.
|
protected COSBase |
parseTrailerValuesDynamically(COSDictionary trailer)
Parse the values of the trailer dictionary and return the root object.
|
protected COSDictionary |
parseXref(long startXRefOffset)
Parses cross reference tables.
|
private long |
parseXrefObjStream(long objByteOffset,
boolean isStandalone)
Parses an xref object stream starting with indirect object id.
|
private void |
parseXrefStream(COSStream stream,
long objByteOffset,
boolean isStandalone)
Fills XRefTrailerResolver with data of given stream.
|
protected boolean |
parseXrefTable(long startByteOffset)
This will parse the xref table from the stream and add it to the state.
The XrefTable contents are ignored.
|
private void |
prepareDecryption()
Prepare for decryption.
|
private void |
readUntilEndStream(java.io.OutputStream out)
This method will read through the current stream object until
we find the keyword "endstream" meaning we're at the end of this
object.
|
private void |
readValidStream(java.io.OutputStream out,
COSNumber streamLengthObj) |
protected COSDictionary |
rebuildTrailer()
Rebuild the trailer dictionary if startxref can't be found.
|
private COSDictionary |
retrieveCOSDictionary(COSObject object) |
private COSDictionary |
retrieveCOSDictionary(COSObjectKey key,
long offset) |
protected COSDictionary |
retrieveTrailer()
Read the trailer information and provide a COSDictionary containing the trailer information.
|
private boolean |
searchForTrailerItems(COSDictionary trailer)
Search for the different parts of the trailer dictionary.
|
private long |
searchNearestValue(java.util.List<java.lang.Long> values,
long offset) |
void |
setEOFLookupRange(int byteCount)
Sets how many trailing bytes of PDF file are searched for EOF marker and 'startxref' marker.
|
void |
setLenient(boolean lenient)
Change the parser leniency flag.
|
private boolean |
validateStreamLength(long streamLength) |
private boolean |
validateXrefOffsets(java.util.Map<COSObjectKey,java.lang.Long> xrefOffset) |
isClosing, isClosing, isDigit, isDigit, isEndOfName, isEOL, isEOL, isSpace, isSpace, isWhitespace, isWhitespace, parseBoolean, parseCOSArray, parseCOSDictionary, parseCOSName, parseCOSString, parseDirObject, readExpectedChar, readExpectedString, readExpectedString, readGenerationNumber, readInt, readLine, readLong, readObjectNumber, readString, readString, readStringNumber, skipSpaces, skipWhiteSpaces
private static final java.lang.String PDF_HEADER
private static final java.lang.String FDF_HEADER
private static final java.lang.String PDF_DEFAULT_VERSION
private static final java.lang.String FDF_DEFAULT_VERSION
private static final char[] XREF_TABLE
private static final char[] XREF_STREAM
private static final char[] STARTXREF
private static final byte[] ENDSTREAM
private static final byte[] ENDOBJ
private static final long MINIMUM_SEARCH_OFFSET
private static final int X
private static final int STRMBUFLEN
private final byte[] strmBuf
protected final RandomAccessRead source
private AccessPermission accessPermission
private java.io.InputStream keyStoreInputStream
private java.lang.String password
private java.lang.String keyAlias
public static final java.lang.String SYSPROP_PARSEMINIMAL
public static final java.lang.String SYSPROP_EOFLOOKUPRANGE
private static final int DEFAULT_TRAIL_BYTECOUNT
protected static final char[] EOF_MARKER
protected static final char[] OBJ_MARKER
private static final char[] TRAILER_MARKER
private static final char[] OBJ_STREAM
private long trailerOffset
protected long fileLen
private boolean isLenient
protected boolean initialParseDone
private boolean trailerWasRebuild
private java.util.Map<COSObjectKey,java.lang.Long> bfSearchCOSObjectKeyOffsets
private java.lang.Long lastEOFMarker
private java.util.List<java.lang.Long> bfSearchXRefTablesOffsets
private java.util.List<java.lang.Long> bfSearchXRefStreamsOffsets
private PDEncryption encryption
protected SecurityHandler securityHandler
private int readTrailBytes
private static final org.apache.commons.logging.Log LOG
protected XrefTrailerResolver xrefTrailerResolver
public static final java.lang.String TMP_FILE_PREFIX
private static final int STREAMCOPYBUFLEN
private final byte[] streamCopyBuf
public COSParser(RandomAccessRead source)
source
- input representing the pdf.
public COSParser(RandomAccessRead source, java.lang.String password, java.io.InputStream keyStore, java.lang.String keyAlias)
source
- input representing the pdf.
password
- password to be used for decryption.
keyStore
- key store to be used for decryption when using public key security
keyAlias
- alias to be used for decryption when using public key security
public void setEOFLookupRange(int byteCount)
Sets how many trailing bytes of PDF file are searched for EOF marker and 'startxref' marker.
If not set, the default value DEFAULT_TRAIL_BYTECOUNT is used.
We check that new value is at least 16. However for practical use cases this value should not be lower than 1000; even 2000 was found to not be enough in some cases where some trailing garbage like HTML snippets followed the EOF marker.
In case system property SYSPROP_EOFLOOKUPRANGE is defined this value will be set on initialization but
can be overwritten later.
byteCount
- number of trailing bytes
protected COSDictionary retrieveTrailer() throws java.io.IOException
java.io.IOException
- if something went wrongprotected COSDictionary parseXref(long startXRefOffset) throws java.io.IOException
startXRefOffset
- start offset of the first tablejava.io.IOException
- if something went wrongprivate long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws java.io.IOException
Returns -1
if no such item exists
java.io.IOException
protected final long getStartxrefOffset() throws java.io.IOException
Looks for and parses startxref. We first look for the last '%%EOF' marker (within the last DEFAULT_TRAIL_BYTECOUNT bytes, or the range set via setEOFLookupRange(int)) and go back to find startxref.
java.io.IOException
- If something went wrong.
protected int lastIndexOf(char[] pattern, byte[] buf, int endOff)
pattern
- pattern to search for
buf
- buffer to search pattern in
endOff
- offset (exclusive) where lookup starts at
Returns -1 if pattern could not be found
public boolean isLenient()
public void setLenient(boolean lenient)
lenient
- try to handle malformed PDFs.
private long getObjectId(COSObject obj)
private void addNewToList(java.util.Queue<COSBase> toBeParsedList, java.util.Collection<COSBase> newObjects, java.util.Set<java.lang.Long> addedObjects)
private void addNewToList(java.util.Queue<COSBase> toBeParsedList, COSBase newObject, java.util.Set<java.lang.Long> addedObjects)
protected void parseDictObjects(COSDictionary dict, COSName... excludeObjects) throws java.io.IOException
dict
- the COSObject from the parent pages.
excludeObjects
- dictionary object reference entries with these names will not be parsed
java.io.IOException
- if something went wrong
private void addExcludedToList(COSName[] excludeObjects, COSDictionary dict, java.util.Set<java.lang.Long> parsedObjects)
protected final COSBase parseObjectDynamically(COSObject obj, boolean requireExistingNotCompressedObj) throws java.io.IOException
obj
- object to be parsed (we only take object number and generation number for lookup start offset)
requireExistingNotCompressedObj
- if true, the object to be parsed must not be contained within a compressed stream
java.io.IOException
- If an IO error occurs.
protected COSBase parseObjectDynamically(long objNr, int objGenNr, boolean requireExistingNotCompressedObj) throws java.io.IOException
objNr
- object number of object to be parsed
objGenNr
- object generation number of object to be parsed
requireExistingNotCompressedObj
- if true, the object to be parsed must be defined in xref (comment: null objects may be missing from xref) and it must not be a compressed object within object stream (this is used to circumvent being stuck in a loop in a malicious PDF)
java.io.IOException
- If an IO error occurs.
private void parseFileObject(java.lang.Long offsetOrObjstmObNr, COSObjectKey objKey, COSObject pdfObject) throws java.io.IOException
java.io.IOException
private void parseObjectStream(int objstmObjNr) throws java.io.IOException
java.io.IOException
private COSNumber getLength(COSBase lengthBaseObj, COSName streamType) throws java.io.IOException
java.io.IOException
protected COSStream parseCOSStream(COSDictionary dic) throws java.io.IOException
dic
- dictionary that goes with this stream.java.io.IOException
- if an error occurred reading the stream, like problems with reading
length attribute, stream does not end with 'endstream' after data read, stream too short etc.private void readUntilEndStream(java.io.OutputStream out) throws java.io.IOException
out
- stream we write out to.java.io.IOException
- if something went wrongprivate void readValidStream(java.io.OutputStream out, COSNumber streamLengthObj) throws java.io.IOException
java.io.IOException
private boolean validateStreamLength(long streamLength) throws java.io.IOException
java.io.IOException
private long checkXRefOffset(long startXRefOffset) throws java.io.IOException
startXRefOffset
- java.io.IOException
private boolean checkXRefStreamOffset(long startXRefOffset) throws java.io.IOException
startXRefOffset
- the expected start offset of the XRef streamjava.io.IOException
- if something went wrongprivate long calculateXRefFixedOffset(long objectOffset, boolean streamsOnly) throws java.io.IOException
objectOffset
- the given offset where to look atstreamsOnly
- search for xref streams onlyjava.io.IOException
- if something went wrongprivate boolean validateXrefOffsets(java.util.Map<COSObjectKey,java.lang.Long> xrefOffset) throws java.io.IOException
java.io.IOException
private void checkXrefOffsets() throws java.io.IOException
java.io.IOException
- if something went wrong.private boolean checkObjectKey(COSObjectKey objectKey, long offset) throws java.io.IOException
objectKey
- the object we are looking foroffset
- the offset where to lookjava.io.IOException
- if something went wrongprivate void bfSearchForObjects() throws java.io.IOException
java.io.IOException
- if something went wrongprivate long bfSearchForXRef(long xrefOffset, boolean streamsOnly) throws java.io.IOException
streamsOnly
- search for xref streams onlyjava.io.IOException
- if something went wrongprivate long searchNearestValue(java.util.List<java.lang.Long> values, long offset)
private boolean bfSearchForTrailer(COSDictionary trailer) throws java.io.IOException
java.io.IOException
- if something went wrongprivate void bfSearchForLastEOFMarker() throws java.io.IOException
java.io.IOException
- if something went wrongprivate void bfSearchForObjStreams() throws java.io.IOException
java.io.IOException
- if something went wrongprivate void bfSearchForXRefTables() throws java.io.IOException
java.io.IOException
- if something went wrongprivate void bfSearchForXRefStreams() throws java.io.IOException
java.io.IOException
- if something went wrongprotected final COSDictionary rebuildTrailer() throws java.io.IOException
java.io.IOException
- if something went wrongprivate boolean searchForTrailerItems(COSDictionary trailer) throws java.io.IOException
trailer
- java.io.IOException
private COSDictionary retrieveCOSDictionary(COSObject object) throws java.io.IOException
java.io.IOException
private COSDictionary retrieveCOSDictionary(COSObjectKey key, long offset) throws java.io.IOException
java.io.IOException
protected void checkPages(COSDictionary root)
root
- the root dictionary of the pdfprivate int checkPagesDictionary(COSDictionary pagesDict, java.util.Set<COSObject> set)
protected boolean isCatalog(COSDictionary dictionary)
dictionary
- private boolean isInfo(COSDictionary dictionary)
dictionary
- private long parseStartXref() throws java.io.IOException
java.io.IOException
- If an IO error occurs.private boolean isString(byte[] string) throws java.io.IOException
string
- the bytes of the string to look forjava.io.IOException
- if something went wrongprivate boolean isString(char[] string) throws java.io.IOException
string
- the characters of the string to look for
java.io.IOException
- if something went wrongprivate boolean parseTrailer() throws java.io.IOException
java.io.IOException
- If an IO error occurs.protected boolean parsePDFHeader() throws java.io.IOException
java.io.IOException
- if something went wrongprotected boolean parseFDFHeader() throws java.io.IOException
java.io.IOException
- if something went wrongprivate boolean parseHeader(java.lang.String headerMarker, java.lang.String defaultVersion) throws java.io.IOException
java.io.IOException
protected boolean parseXrefTable(long startByteOffset) throws java.io.IOException
startByteOffset
- the offset to start at
java.io.IOException
- If an IO error occurs.
private void parseXrefStream(COSStream stream, long objByteOffset, boolean isStandalone) throws java.io.IOException
stream
- the stream to be read
objByteOffset
- the offset to start at
isStandalone
- should be set to true if the stream is not part of a hybrid xref table
java.io.IOException
- if there is an error parsing the stream
public COSDocument getDocument() throws java.io.IOException
java.io.IOException
- If there is an error getting the document.public PDEncryption getEncryption() throws java.io.IOException
java.io.IOException
- If there is an error getting the document.public AccessPermission getAccessPermission() throws java.io.IOException
java.io.IOException
- If there is an error getting the document.protected COSBase parseTrailerValuesDynamically(COSDictionary trailer) throws java.io.IOException
trailer
- The trailer dictionary.java.io.IOException
- If an IO error occurs or if the root object is missing in the trailer dictionary.private void prepareDecryption() throws java.io.IOException
InvalidPasswordException
- If the password is incorrect.
java.io.IOException
- if something went wrong
private void parseDictionaryRecursive(COSObject dictionaryObject) throws java.io.IOException
dictionaryObject
- dictionary to be parsed
java.io.IOException
- if something went wrong