1 /*
2    Source File : PDFParser.h
3 
4 
5    Copyright 2011 Gal Kahana PDFWriter
6 
7    Licensed under the Apache License, Version 2.0 (the "License");
8    you may not use this file except in compliance with the License.
9    You may obtain a copy of the License at
10 
11        http://www.apache.org/licenses/LICENSE-2.0
12 
13    Unless required by applicable law or agreed to in writing, software
14    distributed under the License is distributed on an "AS IS" BASIS,
15    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16    See the License for the specific language governing permissions and
17    limitations under the License.
18 
19 
20 */
21 #pragma once
22 
23 #include "EStatusCode.h"
24 #include "PDFObjectParser.h"
25 #include "IOBasicTypes.h"
26 #include "ObjectsBasicTypes.h"
27 #include "RefCountPtr.h"
28 #include "PDFDictionary.h"
29 #include "IByteReaderWithPosition.h"
30 #include "AdapterIByteReaderWithPositionToIReadPositionProvider.h"
31 
32 #include <map>
33 #include <utility>
34 
35 
36 class PDFArray;
37 class PDFStreamInput;
38 class PDFDictionary;
39 class PDFName;
40 class IPDFParserExtender;
41 
42 typedef std::pair<PDFHummus::EStatusCode,IByteReader*> EStatusCodeAndIByteReader;
43 
44 #define LINE_BUFFER_SIZE 1024
45 
// Kind tag stored in XrefEntryInput::mType.
// The numeric values are written out explicitly; they are the same values the
// compiler assigns implicitly.
enum EXrefEntryType
{
	eXrefEntryExisting     = 0,
	eXrefEntryDelete       = 1,
	eXrefEntryStreamObject = 2,
	eXrefEntryUndefined    = 3	// the type a default-constructed XrefEntryInput carries
};
53 
54 struct XrefEntryInput
55 {
XrefEntryInputXrefEntryInput56 	XrefEntryInput(){mObjectPosition = 0;mRivision=0;mType = eXrefEntryUndefined;}
57 
58 	// well...it's more like...the first number in a pair on an xref, and the second one. the names
59 	// are true only for "n" type of entries
60 	LongFilePositionType mObjectPosition;
61 	unsigned long mRivision;
62 	EXrefEntryType mType;
63 };
64 
65 struct ObjectStreamHeaderEntry
66 {
67 	ObjectIDType mObjectNumber;
68 	LongFilePositionType mObjectOffset;
69 };
70 
71 class ReadPositionProviderForStreamWithPosition : public IReadPositionProvider
72 {
73 public:
Assign(IByteReaderWithPosition * inStream)74 	void Assign(IByteReaderWithPosition* inStream)
75 	{
76 		mStream = inStream;
77 	}
78 
GetCurrentPosition()79 	virtual LongFilePositionType GetCurrentPosition()
80 	{
81 		return mStream->GetCurrentPosition();
82 	}
83 private:
84 	IByteReaderWithPosition* mStream;
85 
86 };
87 
88 typedef std::map<ObjectIDType,ObjectStreamHeaderEntry*> ObjectIDTypeToObjectStreamHeaderEntryMap;
89 
90 class PDFParser
91 {
92 public:
93 	PDFParser(void);
94 	virtual ~PDFParser(void);
95 
96 	// sets the stream to parse, then parses for enough information to be able
97 	// to parse objects later
98 	PDFHummus::EStatusCode StartPDFParsing(IByteReaderWithPosition* inSourceStream);
99 
100 	// get a parser that can parse objects
101 	PDFObjectParser& GetObjectParser();
102 
103 	// below become available after initial parsing [this level is from the header]
104 	double GetPDFLevel();
105 
106 	// GetTrailer, not calling AddRef
107 	PDFDictionary* GetTrailer();
108 
109 	// IMPORTANT! All non "Get" prefix methods below return an object after calling AddRef (or at least make sure reference is added)
110 	// to handle refcount use the RefCountPtr object, or just make sure to call Release when you are done.
111 
112 	// Creates a new object, use smart pointers to control ownership
113 	PDFObject* ParseNewObject(ObjectIDType inObjectId);
114 	ObjectIDType GetObjectsCount();
115 
116 	// Query a dictinary object, if indirect, go and fetch the indirect object and return it instead
117 	// [if you want the direct dictionary value, use PDFDictionary::QueryDirectObject [will AddRef automatically]
118 	PDFObject* QueryDictionaryObject(PDFDictionary* inDictionary,const std::string& inName);
119 
120 	// Query an array object, if indirect, go and fetch the indirect object and return it instead
121 	// [if you want the direct array value, use the PDFArray direct access to the vector [and use AddRef, cause it won't]
122 	PDFObject* QueryArrayObject(PDFArray* inArray,unsigned long inIndex);
123 
124 	unsigned long GetPagesCount();
125 	// don't be confused - pass number of pages here. returns the dictionary, and verifies that it's actually a page (via type)
126 	PDFDictionary* ParsePage(unsigned long inPageIndex);
127 	// get page object ID for an input index
128 	ObjectIDType GetPageObjectID(unsigned long inPageIndex);
129 
130 	// Create a reader that will be able to read the stream. when filters are included
131     // in the stream definition it will add them. delete the returned object when done.
132     // Note that it DOES NOT setup the reading position of the file for reading the stream,
133     // so if you want to read it, you have to also move the strem position, or use StartReadingFromStream instead
134 	IByteReader* CreateInputStreamReader(PDFStreamInput* inStream);
135 
136     // prepare parser so that you can read from the input stream object.
137     // create filters and move the stream to the beginning of the stream position.
138     // delete the result when done
139     IByteReader* StartReadingFromStream(PDFStreamInput* inStream);
140 
141 	// use this to explictly free used objects. quite obviously this means that you'll have to parse the file again
142 	void ResetParser();
143 
144 	// using PDFParser also for state information reading. this is a specialized version of the StartParsing for reading state
145 	PDFHummus::EStatusCode StartStateFileParsing(IByteReaderWithPosition* inSourceStream);
146 
147 	// check if this file is encrypted. considering that the library can't really handle these files, this shoud be handy.
148 	bool IsEncrypted();
149 	// encryption is supported if there's an extender that supports it in the parser
150 	bool IsEncryptionSupported();
151 
152 	// set extender for parser, to enhance parsing capabilities
153 	void SetParserExtender(IPDFParserExtender* inParserExtender);
154 
155     // advanced, direct xref access
156     ObjectIDType GetXrefSize();
157     XrefEntryInput* GetXrefEntry(ObjectIDType inObjectID);
158     LongFilePositionType GetXrefPosition();
159 
160     IByteReaderWithPosition* GetParserStream();
161 
162 private:
163 	PDFObjectParser mObjectParser;
164 	IByteReaderWithPosition* mStream;
165 	AdapterIByteReaderWithPositionToIReadPositionProvider mCurrentPositionProvider;
166 
167 	// we'll use this items for bacwkards reading. might turns this into a proper stream object
168 	IOBasicTypes::Byte mLinesBuffer[LINE_BUFFER_SIZE];
169 	IOBasicTypes::Byte* mCurrentBufferIndex;
170 	IOBasicTypes::Byte* mLastAvailableIndex;
171 	LongBufferSizeType mLastReadPositionFromEnd;
172 	bool mEncounteredFileStart;
173 	ObjectIDTypeToObjectStreamHeaderEntryMap mObjectStreamsCache;
174 
175 	double mPDFLevel;
176 	LongFilePositionType mLastXrefPosition;
177 	RefCountPtr<PDFDictionary> mTrailer;
178 	ObjectIDType mXrefSize;
179 	XrefEntryInput* mXrefTable;
180 	unsigned long mPagesCount;
181 	ObjectIDType* mPagesObjectIDs;
182 	IPDFParserExtender* mParserExtender;
183     bool mAllowExtendingSegments;
184 
185 	PDFHummus::EStatusCode ParseHeaderLine();
186 	PDFHummus::EStatusCode ParseEOFLine();
187 	PDFHummus::EStatusCode ParseLastXrefPosition();
188 	PDFHummus::EStatusCode ParseTrailerDictionary();
189 	PDFHummus::EStatusCode BuildXrefTableFromTable();
190 	PDFHummus::EStatusCode DetermineXrefSize();
191 	PDFHummus::EStatusCode InitializeXref();
192 	PDFHummus::EStatusCode ParseXrefFromXrefTable(XrefEntryInput* inXrefTable,
193                                                   ObjectIDType inXrefSize,
194                                                   LongFilePositionType inXrefPosition,
195                                                   XrefEntryInput** outExtendedTable,
196                                                   ObjectIDType* outExtendedTableSize);
197     XrefEntryInput* ExtendXrefTableToSize(XrefEntryInput* inXrefTable,ObjectIDType inOldSize,ObjectIDType inNewSize);
198 	PDFObject*  ParseExistingInDirectObject(ObjectIDType inObjectID);
199 	PDFHummus::EStatusCode ParsePagesObjectIDs();
200 	PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID);
201 	PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex);
202 	PDFHummus::EStatusCode ParsePreviousXrefs(PDFDictionary* inTrailer);
203 	void MergeXrefWithMainXref(XrefEntryInput* inTableToMerge,ObjectIDType inMergedTableSize);
204 	PDFHummus::EStatusCode ParseFileDirectory();
205 	PDFHummus::EStatusCode BuildXrefTableAndTrailerFromXrefStream(long long inXrefStreamObjectID);
206 	// an overload for cases where the xref stream object is already parsed
207 	PDFHummus::EStatusCode ParseXrefFromXrefStream(XrefEntryInput* inXrefTable,
208                                                    ObjectIDType inXrefSize,
209                                                    PDFStreamInput* inXrefStream,
210                                                    XrefEntryInput** outExtendedTable,
211                                                    ObjectIDType* outExtendedTableSize);
212 	// an overload for cases where the position should hold a stream object, and it should be parsed
213 	PDFHummus::EStatusCode ParseXrefFromXrefStream(XrefEntryInput* inXrefTable,
214                                                    ObjectIDType inXrefSize,
215                                                    LongFilePositionType inXrefPosition,
216                                                    XrefEntryInput** outExtendedTable,
217                                                    ObjectIDType* outExtendedTableSize);
218 	PDFHummus::EStatusCode ReadXrefStreamSegment(XrefEntryInput* inXrefTable,
219 									 ObjectIDType inSegmentStartObject,
220 									 ObjectIDType inSegmentCount,
221 									 IByteReader* inReadFrom,
222 									 int* inEntryWidths,
223 									 unsigned long inEntryWidthsSize);
224 	PDFHummus::EStatusCode ReadXrefSegmentValue(IByteReader* inSource,int inEntrySize,long long& outValue);
225 	PDFHummus::EStatusCode ReadXrefSegmentValue(IByteReader* inSource,int inEntrySize,ObjectIDType& outValue);
226 	PDFHummus::EStatusCode ParseDirectory(LongFilePositionType inXrefPosition,
227                                           XrefEntryInput* inXrefTable,
228                                           ObjectIDType inXrefSize,
229                                           PDFDictionary** outTrailer,
230                                           XrefEntryInput** outExtendedTable,
231                                           ObjectIDType* outExtendedTableSize);
232 	PDFObject* ParseExistingInDirectStreamObject(ObjectIDType inObjectId);
233 	PDFHummus::EStatusCode ParseObjectStreamHeader(ObjectStreamHeaderEntry* inHeaderInfo,ObjectIDType inObjectsCount);
234 	void MovePositionInStream(LongFilePositionType inPosition);
235 	EStatusCodeAndIByteReader CreateFilterForStream(IByteReader* inStream,PDFName* inFilterName,PDFDictionary* inDecodeParams);
236 
237 	// Backward reading
238 	bool ReadNextBufferFromEnd();
239 	LongBufferSizeType GetCurrentPositionFromEnd();
240 	bool ReadBack(IOBasicTypes::Byte& outValue);
241 	bool IsBeginOfFile();
242 
243 	bool GoBackTillToken();
244 	bool GoBackTillNonToken();
245 	void GoBackTillLineStart();
246 	bool IsPDFWhiteSpace(IOBasicTypes::Byte inCharacter);
247 
248 };
249