1 /////////////////////////////////////////////////////////////////////////////// 2 // Name: pdfparser.h 3 // Purpose: 4 // Author: Ulrich Telle 5 // Created: 2006-05-15 6 // Copyright: (c) Ulrich Telle 7 // Licence: wxWindows licence 8 /////////////////////////////////////////////////////////////////////////////// 9 10 /// \file pdfparser.h Interface of the wxPdfParser classes 11 12 #ifndef _PDF_PARSER_H_ 13 #define _PDF_PARSER_H_ 14 15 // wxWidgets headers 16 #include <wx/dynarray.h> 17 #include <wx/filesys.h> 18 #include <wx/mstream.h> 19 #include <wx/string.h> 20 21 // wxPdfDocument headers 22 #include "wx/pdfdocdef.h" 23 #include "wx/pdfarraydouble.h" 24 #include "wx/pdfobjects.h" 25 26 class WXDLLIMPEXP_FWD_PDFDOC wxPdfEncrypt; 27 class WXDLLIMPEXP_FWD_PDFDOC wxPdfInfo; 28 29 /// Permissions required for import of a document 30 // Permission bit 3: Print 31 // Permission bit 5: Copy or extract text and graphics 32 // Permission bit 10: Extract text and graphics 33 // THIS MUST NOT BE CHANGED! 34 #define REQUIRED_PERMISSIONS 0x0214 35 36 /// Token types 37 #define TOKEN_COMMENT 1 38 #define TOKEN_BOOLEAN 2 39 #define TOKEN_NUMBER 3 40 #define TOKEN_STRING 4 41 #define TOKEN_NAME 5 42 #define TOKEN_START_ARRAY 6 43 #define TOKEN_END_ARRAY 7 44 #define TOKEN_START_DICTIONARY 8 45 #define TOKEN_END_DICTIONARY 9 46 #define TOKEN_REFERENCE 10 47 #define TOKEN_NULL 12 48 #define TOKEN_OTHER 13 49 50 /// Class representing a tokenizer for parsing PDF documenst. 51 class WXDLLIMPEXP_PDFDOC wxPdfTokenizer 52 { 53 public: 54 /// Constructor 55 wxPdfTokenizer(wxInputStream* inputStream); 56 57 /// Destructor 58 virtual ~wxPdfTokenizer(); 59 60 /// Set current offset position in stream 61 off_t Seek(off_t pos); 62 63 /// Get current offset position in stream 64 off_t Tell(); 65 66 /// Go back one position in the stream 67 void BackOnePosition(int ch); 68 69 /// Get length of stream 70 off_t GetLength(); 71 72 /// Read one byte from stream 73 int ReadChar(); 74 75 /// Read size bytes from stream 76 wxMemoryOutputStream* ReadBuffer(size_t size); 77 78 /// Find the offset of the startxref tag 79 off_t GetStartXRef(); 80 81 /// Read a string 82 wxString ReadString(int size); 83 84 /// Check the header of the document stream 85 wxString CheckPdfHeader(); 86 87 /// Get the next token 88 bool NextToken(); 89 90 /// Get the next valid token 91 void NextValidToken(); 92 93 /// Get the type of the last token 94 int GetTokenType(); 95 96 /// Get the token value as a string 97 wxString GetStringValue(); 98 99 /// Get the token value as an integer 100 int GetIntValue(); 101 102 /// Check whether the token is a hexadecimal string IsHexString()103 bool IsHexString() { return m_hexString; } 104 105 /// Get object reference 106 int GetReference(); 107 108 /// Get object generation 109 int GetGeneration(); 110 111 /// Check byte whether it represents a white space character 112 static bool IsWhitespace(int ch); 113 114 /// Check byte whether it is a delimiter 115 static bool IsDelimiter(int ch); 116 117 /// Check byte whether it is a delimiter or a whitespace character 118 static bool IsDelimiterOrWhitespace(int ch); 119 120 /// Get hexadecimal character 121 static int GetHex(int v); 122 123 private: 124 wxInputStream* m_inputStream; ///< Stream of document data 125 int m_type; ///< Type of last token 126 wxString m_stringValue; ///< Value of last token 127 int m_reference; ///< Reference number of object 128 int m_generation; ///< Generation number of object 129 bool m_hexString; ///< Flag for hexadeciaml strings 130 131 }; 132 133 /// Class representing an XRef entry (for internal use only) 134 class WXDLLIMPEXP_PDFDOC wxPdfXRefEntry 135 { 136 public: 137 /// Constructor 138 wxPdfXRefEntry(); 139 140 /// Destructor 141 virtual ~wxPdfXRefEntry(); 142 143 int m_type; ///< Type of XRef entry 144 int m_ofs_idx; ///< Offset or index of object 145 int m_gen_ref; ///< Generation of object or reference of object stream containing the object 146 }; 147 148 WX_DECLARE_USER_EXPORTED_OBJARRAY(wxPdfXRefEntry, wxPdfXRef, WXDLLIMPEXP_PDFDOC); 149 150 /// Class representing a PDF parser. (For internal use only) 151 class WXDLLIMPEXP_PDFDOC wxPdfParser 152 { 153 public: 154 /// Constructor 155 wxPdfParser(const wxString& filename, 156 const wxString& password = wxEmptyString); 157 158 /// Destructor 159 virtual ~wxPdfParser(); 160 161 /// Check whether the PDF document to be parsed is valid 162 bool IsOk(); 163 164 /// Get PDF version of parsed document GetPdfVersion()165 wxString GetPdfVersion() { return m_pdfVersion; } 166 167 /// Get number of pages in the parsed document 168 unsigned int GetPageCount(); 169 170 /// Get the document information dictionary 171 bool GetSourceInfo(wxPdfInfo& info); 172 173 /// Get the queue of referenced objects GetObjectQueue()174 wxPdfObjectQueue* GetObjectQueue() { return m_objectQueue; } 175 176 /// Get the map of referenced objects GetObjectMap()177 wxPdfObjectMap* GetObjectMap() { return m_objectMap; } 178 179 /// Append a referenced object to the queue 180 void AppendObject(int originalObjectId, int actualObjectId, wxPdfObject* obj); 181 182 /// Get the resources of a specific page 183 wxPdfObject* GetPageResources(unsigned int pageno); 184 185 /// Get the content stream collection of a specific page 186 void GetContent(unsigned int pageno, wxArrayPtrVoid& contents); 187 188 /// Get the media box of a specific page 189 wxPdfArrayDouble* GetPageMediaBox(unsigned int pageno); 190 191 /// Get the crop box of a specific page 192 wxPdfArrayDouble* GetPageCropBox(unsigned int pageno); 193 194 /// Get the bleed box of a specific page 195 wxPdfArrayDouble* GetPageBleedBox(unsigned int pageno); 196 197 /// Get the trim box of a specific page 198 wxPdfArrayDouble* GetPageTrimBox(unsigned int pageno); 199 200 /// Get the art box of a specific page 201 wxPdfArrayDouble* GetPageArtBox(unsigned int pageno); 202 203 /// Get the rotation of a specific page 204 int GetPageRotation (unsigned int pageno); 205 206 /// Resolve an object 207 wxPdfObject* ResolveObject(wxPdfObject* obj); 208 209 /// Set flag whether a stream should be decoded or not SetUseRawStream(bool useRawStream)210 void SetUseRawStream(bool useRawStream) { m_useRawStream = useRawStream; } 211 212 /// Get flag whether a stream should be decoded or not GetUseRawStream()213 bool GetUseRawStream() { return m_useRawStream; } 214 215 protected: 216 /// Get the resources of a specific page identified by a page object 217 wxPdfObject* GetPageResources(wxPdfObject* page); 218 219 /// Get the content stream collection of a specific page 220 void GetPageContent(wxPdfObject* contentRef, wxArrayPtrVoid& contents); 221 222 /// Get a page box 223 wxPdfArrayDouble* GetPageBox(wxPdfDictionary* page, const wxString& boxIndex); 224 225 /// Get a page rotation 226 int GetPageRotation (wxPdfDictionary* page); 227 228 /// Parse PDF document 229 bool ParseDocument(); 230 231 /// Setup a decryptor 232 bool SetupDecryptor(); 233 234 /// Parse the cross reference 235 bool ParseXRef(); 236 237 /// Parse the page tree of the PDF document 238 bool ParsePageTree(wxPdfDictionary* pages); 239 240 /// Parse a cross reference section 241 wxPdfDictionary* ParseXRefSection(); 242 243 /// Parse a cross reference stream 244 bool ParseXRefStream(int ptr, bool setTrailer); 245 246 /// Parse an object 247 wxPdfObject* ParseObject(); 248 249 /// Parse a dictionary 250 wxPdfDictionary* ParseDictionary(); 251 252 /// Parse an array 253 wxPdfArray* ParseArray(); 254 255 /// Parse a specific object 256 wxPdfObject* ParseSpecificObject(int idx); 257 258 /// Parse a direct object 259 wxPdfObject* ParseDirectObject(int k); 260 261 /// Parse an object from an object stream 262 wxPdfObject* ParseObjectStream(wxPdfStream* stream, int idx); 263 264 /// Parse the content of a stream object 265 void GetStreamBytes(wxPdfStream* stream); 266 267 /// Parse the raw content of a stream object 268 void GetStreamBytesRaw(wxPdfStream* stream); 269 270 /// Decode a stream predictor 271 wxMemoryOutputStream* DecodePredictor(wxMemoryOutputStream* in, wxPdfObject* dicPar); 272 273 /// Decode a stream that has the FlateDecode filter. 274 /** 275 * \param osIn the input data 276 * \return the decoded data 277 */ 278 wxMemoryOutputStream* FlateDecode(wxMemoryOutputStream* osIn); 279 280 /// Decode a stream that has the ASCIIHexDecode filter. 281 /** 282 * \param osIn the input data 283 * \return the decoded data 284 */ 285 wxMemoryOutputStream* ASCIIHexDecode(wxMemoryOutputStream* osIn); 286 287 /// Decode a stream that has the ASCII85Decode filter. 288 /** 289 * \param osIn the input data 290 * \return the decoded data 291 */ 292 wxMemoryOutputStream* ASCII85Decode(wxMemoryOutputStream* osIn); 293 294 /// Decode a stream that has the ASCII85Decode filter. 295 /** 296 * \param osIn the input data 297 * \return the decoded data 298 */ 299 wxMemoryOutputStream* LZWDecode(wxMemoryOutputStream* osIn); 300 301 /// Get wxWidgets file system 302 static wxFileSystem* GetFileSystem(); 303 304 private: 305 /// Reserve at least count cross reference entries 306 void ReserveXRef(size_t count); 307 308 bool m_initialized; ///< Flag whether parser is properly initialized 309 int m_fileSize; ///< File size 310 wxString m_filename; ///< File name of PDF document 311 wxString m_password; ///< 312 wxString m_pdfVersion; ///< Version of PDF document 313 wxFSFile* m_pdfFile; ///< File system file object of PDF document 314 wxPdfTokenizer* m_tokens; ///< Tokenizer 315 wxPdfDictionary* m_trailer; ///< Trailer dictionary 316 wxPdfDictionary* m_root; ///< Root object 317 wxArrayPtrVoid m_pages; ///< Array of page objects 318 unsigned int m_currentPage; ///< Number of current page 319 bool m_useRawStream; ///< Flag whether to use raw stream data (without decoding) 320 321 bool m_encrypted; ///< Flag whether the document is encrypted 322 wxPdfEncrypt* m_decryptor; ///< decryptor instance 323 wxPdfDictionary* m_encryption; ///< Encryption dictionary 324 325 wxPdfObjectQueue* m_objectQueue; ///< Queue of referenced objects 326 wxPdfObjectQueue* m_objectQueueLast; ///< Pointer to last queue element 327 wxPdfObjectMap* m_objectMap; ///< Map for object queue elements 328 wxPdfObjStmMap* m_objStmCache; ///< Cache for object streams 329 bool m_cacheObjects; ///< Flag whether object streams should be cached 330 331 int m_objNum; ///< Number of current object 332 int m_objGen; ///< Generation of current object 333 334 wxPdfXRef m_xref; ///< Cross reference 335 336 static wxFileSystem* ms_fileSystem; ///< wxWidgets file system 337 }; 338 339 #define WXPDF_LZW_STRINGTABLE_SIZE 8192 340 341 /// Class representing an LZW decoder. (For internal use only) 342 class WXDLLIMPEXP_PDFDOC wxPdfLzwDecoder 343 { 344 public: 345 /// Constructor 346 wxPdfLzwDecoder(); 347 348 /// Destructor 349 virtual ~wxPdfLzwDecoder(); 350 351 /// Get next code 352 int GetNextCode(); 353 354 /// Decode a byte stream 355 bool Decode(wxMemoryInputStream* dataIn, wxMemoryOutputStream* dataOut); 356 357 /// Initialize the string table 358 void InitializeStringTable(); 359 360 /// Write decoded string into output buffer 361 void WriteString(int code); 362 363 /// Add string to string table 364 void AddStringToTable(int oldCode, char newString); 365 366 private: 367 wxMemoryInputStream* m_dataIn; ///< Encoded data stream 368 wxMemoryOutputStream* m_dataOut; ///< Decoded data stream 369 size_t m_dataSize; ///< Length of encoded data stream 370 wxArrayInt m_stringTable[WXPDF_LZW_STRINGTABLE_SIZE]; ///< Table of decoded strings 371 372 int m_tableIndex; ///< Current string table index 373 int m_bitsToGet; ///< Number of bits to get from stream 374 int m_bytePointer; ///< Offset in encoded data stream 375 int m_bitPointer; ///< Bit offset 376 int m_nextData; ///< Next data item 377 int m_nextBits; ///< Next bits 378 379 static int ms_andTable[4]; ///< Static array of string table offsets 380 }; 381 382 #endif 383