1 /* 2 Source File : PDFParser.h 3 4 5 Copyright 2011 Gal Kahana PDFWriter 6 7 Licensed under the Apache License, Version 2.0 (the "License"); 8 you may not use this file except in compliance with the License. 9 You may obtain a copy of the License at 10 11 http://www.apache.org/licenses/LICENSE-2.0 12 13 Unless required by applicable law or agreed to in writing, software 14 distributed under the License is distributed on an "AS IS" BASIS, 15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 See the License for the specific language governing permissions and 17 limitations under the License. 18 19 20 */ 21 #pragma once 22 23 #include "EStatusCode.h" 24 #include "PDFObjectParser.h" 25 #include "IOBasicTypes.h" 26 #include "ObjectsBasicTypes.h" 27 #include "RefCountPtr.h" 28 #include "PDFDictionary.h" 29 #include "IByteReaderWithPosition.h" 30 #include "AdapterIByteReaderWithPositionToIReadPositionProvider.h" 31 32 #include <map> 33 #include <utility> 34 35 36 class PDFArray; 37 class PDFStreamInput; 38 class PDFDictionary; 39 class PDFName; 40 class IPDFParserExtender; 41 42 typedef std::pair<PDFHummus::EStatusCode,IByteReader*> EStatusCodeAndIByteReader; 43 44 #define LINE_BUFFER_SIZE 1024 45 46 enum EXrefEntryType 47 { 48 eXrefEntryExisting, 49 eXrefEntryDelete, 50 eXrefEntryStreamObject, 51 eXrefEntryUndefined 52 }; 53 54 struct XrefEntryInput 55 { XrefEntryInputXrefEntryInput56 XrefEntryInput(){mObjectPosition = 0;mRivision=0;mType = eXrefEntryUndefined;} 57 58 // well...it's more like...the first number in a pair on an xref, and the second one. the names 59 // are true only for "n" type of entries 60 LongFilePositionType mObjectPosition; 61 unsigned long mRivision; 62 EXrefEntryType mType; 63 }; 64 65 struct ObjectStreamHeaderEntry 66 { 67 ObjectIDType mObjectNumber; 68 LongFilePositionType mObjectOffset; 69 }; 70 71 class ReadPositionProviderForStreamWithPosition : public IReadPositionProvider 72 { 73 public: Assign(IByteReaderWithPosition * inStream)74 void Assign(IByteReaderWithPosition* inStream) 75 { 76 mStream = inStream; 77 } 78 GetCurrentPosition()79 virtual LongFilePositionType GetCurrentPosition() 80 { 81 return mStream->GetCurrentPosition(); 82 } 83 private: 84 IByteReaderWithPosition* mStream; 85 86 }; 87 88 typedef std::map<ObjectIDType,ObjectStreamHeaderEntry*> ObjectIDTypeToObjectStreamHeaderEntryMap; 89 90 class PDFParser 91 { 92 public: 93 PDFParser(void); 94 virtual ~PDFParser(void); 95 96 // sets the stream to parse, then parses for enough information to be able 97 // to parse objects later 98 PDFHummus::EStatusCode StartPDFParsing(IByteReaderWithPosition* inSourceStream); 99 100 // get a parser that can parse objects 101 PDFObjectParser& GetObjectParser(); 102 103 // below become available after initial parsing [this level is from the header] 104 double GetPDFLevel(); 105 106 // GetTrailer, not calling AddRef 107 PDFDictionary* GetTrailer(); 108 109 // IMPORTANT! All non "Get" prefix methods below return an object after calling AddRef (or at least make sure reference is added) 110 // to handle refcount use the RefCountPtr object, or just make sure to call Release when you are done. 111 112 // Creates a new object, use smart pointers to control ownership 113 PDFObject* ParseNewObject(ObjectIDType inObjectId); 114 ObjectIDType GetObjectsCount(); 115 116 // Query a dictinary object, if indirect, go and fetch the indirect object and return it instead 117 // [if you want the direct dictionary value, use PDFDictionary::QueryDirectObject [will AddRef automatically] 118 PDFObject* QueryDictionaryObject(PDFDictionary* inDictionary,const std::string& inName); 119 120 // Query an array object, if indirect, go and fetch the indirect object and return it instead 121 // [if you want the direct array value, use the PDFArray direct access to the vector [and use AddRef, cause it won't] 122 PDFObject* QueryArrayObject(PDFArray* inArray,unsigned long inIndex); 123 124 unsigned long GetPagesCount(); 125 // don't be confused - pass number of pages here. returns the dictionary, and verifies that it's actually a page (via type) 126 PDFDictionary* ParsePage(unsigned long inPageIndex); 127 // get page object ID for an input index 128 ObjectIDType GetPageObjectID(unsigned long inPageIndex); 129 130 // Create a reader that will be able to read the stream. when filters are included 131 // in the stream definition it will add them. delete the returned object when done. 132 // Note that it DOES NOT setup the reading position of the file for reading the stream, 133 // so if you want to read it, you have to also move the strem position, or use StartReadingFromStream instead 134 IByteReader* CreateInputStreamReader(PDFStreamInput* inStream); 135 136 // prepare parser so that you can read from the input stream object. 137 // create filters and move the stream to the beginning of the stream position. 138 // delete the result when done 139 IByteReader* StartReadingFromStream(PDFStreamInput* inStream); 140 141 // use this to explictly free used objects. quite obviously this means that you'll have to parse the file again 142 void ResetParser(); 143 144 // using PDFParser also for state information reading. this is a specialized version of the StartParsing for reading state 145 PDFHummus::EStatusCode StartStateFileParsing(IByteReaderWithPosition* inSourceStream); 146 147 // check if this file is encrypted. considering that the library can't really handle these files, this shoud be handy. 148 bool IsEncrypted(); 149 // encryption is supported if there's an extender that supports it in the parser 150 bool IsEncryptionSupported(); 151 152 // set extender for parser, to enhance parsing capabilities 153 void SetParserExtender(IPDFParserExtender* inParserExtender); 154 155 // advanced, direct xref access 156 ObjectIDType GetXrefSize(); 157 XrefEntryInput* GetXrefEntry(ObjectIDType inObjectID); 158 LongFilePositionType GetXrefPosition(); 159 160 IByteReaderWithPosition* GetParserStream(); 161 162 private: 163 PDFObjectParser mObjectParser; 164 IByteReaderWithPosition* mStream; 165 AdapterIByteReaderWithPositionToIReadPositionProvider mCurrentPositionProvider; 166 167 // we'll use this items for bacwkards reading. might turns this into a proper stream object 168 IOBasicTypes::Byte mLinesBuffer[LINE_BUFFER_SIZE]; 169 IOBasicTypes::Byte* mCurrentBufferIndex; 170 IOBasicTypes::Byte* mLastAvailableIndex; 171 LongBufferSizeType mLastReadPositionFromEnd; 172 bool mEncounteredFileStart; 173 ObjectIDTypeToObjectStreamHeaderEntryMap mObjectStreamsCache; 174 175 double mPDFLevel; 176 LongFilePositionType mLastXrefPosition; 177 RefCountPtr<PDFDictionary> mTrailer; 178 ObjectIDType mXrefSize; 179 XrefEntryInput* mXrefTable; 180 unsigned long mPagesCount; 181 ObjectIDType* mPagesObjectIDs; 182 IPDFParserExtender* mParserExtender; 183 bool mAllowExtendingSegments; 184 185 PDFHummus::EStatusCode ParseHeaderLine(); 186 PDFHummus::EStatusCode ParseEOFLine(); 187 PDFHummus::EStatusCode ParseLastXrefPosition(); 188 PDFHummus::EStatusCode ParseTrailerDictionary(); 189 PDFHummus::EStatusCode BuildXrefTableFromTable(); 190 PDFHummus::EStatusCode DetermineXrefSize(); 191 PDFHummus::EStatusCode InitializeXref(); 192 PDFHummus::EStatusCode ParseXrefFromXrefTable(XrefEntryInput* inXrefTable, 193 ObjectIDType inXrefSize, 194 LongFilePositionType inXrefPosition, 195 XrefEntryInput** outExtendedTable, 196 ObjectIDType* outExtendedTableSize); 197 XrefEntryInput* ExtendXrefTableToSize(XrefEntryInput* inXrefTable,ObjectIDType inOldSize,ObjectIDType inNewSize); 198 PDFObject* ParseExistingInDirectObject(ObjectIDType inObjectID); 199 PDFHummus::EStatusCode ParsePagesObjectIDs(); 200 PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID); 201 PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex); 202 PDFHummus::EStatusCode ParsePreviousXrefs(PDFDictionary* inTrailer); 203 void MergeXrefWithMainXref(XrefEntryInput* inTableToMerge,ObjectIDType inMergedTableSize); 204 PDFHummus::EStatusCode ParseFileDirectory(); 205 PDFHummus::EStatusCode BuildXrefTableAndTrailerFromXrefStream(long long inXrefStreamObjectID); 206 // an overload for cases where the xref stream object is already parsed 207 PDFHummus::EStatusCode ParseXrefFromXrefStream(XrefEntryInput* inXrefTable, 208 ObjectIDType inXrefSize, 209 PDFStreamInput* inXrefStream, 210 XrefEntryInput** outExtendedTable, 211 ObjectIDType* outExtendedTableSize); 212 // an overload for cases where the position should hold a stream object, and it should be parsed 213 PDFHummus::EStatusCode ParseXrefFromXrefStream(XrefEntryInput* inXrefTable, 214 ObjectIDType inXrefSize, 215 LongFilePositionType inXrefPosition, 216 XrefEntryInput** outExtendedTable, 217 ObjectIDType* outExtendedTableSize); 218 PDFHummus::EStatusCode ReadXrefStreamSegment(XrefEntryInput* inXrefTable, 219 ObjectIDType inSegmentStartObject, 220 ObjectIDType inSegmentCount, 221 IByteReader* inReadFrom, 222 int* inEntryWidths, 223 unsigned long inEntryWidthsSize); 224 PDFHummus::EStatusCode ReadXrefSegmentValue(IByteReader* inSource,int inEntrySize,long long& outValue); 225 PDFHummus::EStatusCode ReadXrefSegmentValue(IByteReader* inSource,int inEntrySize,ObjectIDType& outValue); 226 PDFHummus::EStatusCode ParseDirectory(LongFilePositionType inXrefPosition, 227 XrefEntryInput* inXrefTable, 228 ObjectIDType inXrefSize, 229 PDFDictionary** outTrailer, 230 XrefEntryInput** outExtendedTable, 231 ObjectIDType* outExtendedTableSize); 232 PDFObject* ParseExistingInDirectStreamObject(ObjectIDType inObjectId); 233 PDFHummus::EStatusCode ParseObjectStreamHeader(ObjectStreamHeaderEntry* inHeaderInfo,ObjectIDType inObjectsCount); 234 void MovePositionInStream(LongFilePositionType inPosition); 235 EStatusCodeAndIByteReader CreateFilterForStream(IByteReader* inStream,PDFName* inFilterName,PDFDictionary* inDecodeParams); 236 237 // Backward reading 238 bool ReadNextBufferFromEnd(); 239 LongBufferSizeType GetCurrentPositionFromEnd(); 240 bool ReadBack(IOBasicTypes::Byte& outValue); 241 bool IsBeginOfFile(); 242 243 bool GoBackTillToken(); 244 bool GoBackTillNonToken(); 245 void GoBackTillLineStart(); 246 bool IsPDFWhiteSpace(IOBasicTypes::Byte inCharacter); 247 248 }; 249