///////////////////////////////////////////////////////////////////////////////
// Name:        pdfparser.h
// Purpose:     Interface of the wxPdfParser classes
// Author:      Ulrich Telle
// Created:     2006-05-15
// Copyright:   (c) Ulrich Telle
// Licence:     wxWindows licence
///////////////////////////////////////////////////////////////////////////////

/// \file pdfparser.h Interface of the wxPdfParser classes
11 
12 #ifndef _PDF_PARSER_H_
13 #define _PDF_PARSER_H_
14 
15 // wxWidgets headers
16 #include <wx/dynarray.h>
17 #include <wx/filesys.h>
18 #include <wx/mstream.h>
19 #include <wx/string.h>
20 
21 // wxPdfDocument headers
22 #include "wx/pdfdocdef.h"
23 #include "wx/pdfarraydouble.h"
24 #include "wx/pdfobjects.h"
25 
26 class WXDLLIMPEXP_FWD_PDFDOC wxPdfEncrypt;
27 class WXDLLIMPEXP_FWD_PDFDOC wxPdfInfo;
28 
/// Permissions required for import of a document
// The value is a bit mask; permission bits are counted from 1, starting at
// the low-order bit:
//   Permission bit  3 (0x0004): Print
//   Permission bit  5 (0x0010): Copy or extract text and graphics
//   Permission bit 10 (0x0200): Extract text and graphics
//   0x0004 | 0x0010 | 0x0200 == 0x0214
// THIS MUST NOT BE CHANGED!
#define REQUIRED_PERMISSIONS 0x0214
35 
/// Token types
// Numeric identifiers for the kinds of token a tokenizer can report.
// Note: the value 11 is not assigned to any token type.
#define TOKEN_COMMENT           1
#define TOKEN_BOOLEAN           2
#define TOKEN_NUMBER            3
#define TOKEN_STRING            4
#define TOKEN_NAME              5
#define TOKEN_START_ARRAY       6
#define TOKEN_END_ARRAY         7
#define TOKEN_START_DICTIONARY  8
#define TOKEN_END_DICTIONARY    9
#define TOKEN_REFERENCE        10
#define TOKEN_NULL             12
#define TOKEN_OTHER            13
49 
50 /// Class representing a tokenizer for parsing PDF documenst.
51 class WXDLLIMPEXP_PDFDOC wxPdfTokenizer
52 {
53 public:
54   /// Constructor
55   wxPdfTokenizer(wxInputStream* inputStream);
56 
57   /// Destructor
58   virtual ~wxPdfTokenizer();
59 
60   /// Set current offset position in stream
61   off_t Seek(off_t pos);
62 
63   /// Get current offset position in stream
64   off_t Tell();
65 
66   /// Go back one position in the stream
67   void BackOnePosition(int ch);
68 
69   /// Get length of stream
70   off_t GetLength();
71 
72   /// Read one byte from stream
73   int ReadChar();
74 
75   /// Read size bytes from stream
76   wxMemoryOutputStream* ReadBuffer(size_t size);
77 
78   /// Find the offset of the startxref tag
79   off_t GetStartXRef();
80 
81   /// Read a string
82   wxString ReadString(int size);
83 
84   /// Check the header of the document stream
85   wxString CheckPdfHeader();
86 
87   /// Get the next token
88   bool NextToken();
89 
90   /// Get the next valid token
91   void NextValidToken();
92 
93   /// Get the type of the last token
94   int GetTokenType();
95 
96   /// Get the token value as a string
97   wxString GetStringValue();
98 
99   /// Get the token value as an integer
100   int GetIntValue();
101 
102   /// Check whether the token is a hexadecimal string
IsHexString()103   bool IsHexString() { return m_hexString; }
104 
105   /// Get object reference
106   int GetReference();
107 
108   /// Get object generation
109   int GetGeneration();
110 
111   /// Check byte whether it represents a white space character
112   static bool IsWhitespace(int ch);
113 
114   /// Check byte whether it is a delimiter
115   static bool IsDelimiter(int ch);
116 
117   /// Check byte whether it is a delimiter or a whitespace character
118   static bool IsDelimiterOrWhitespace(int ch);
119 
120   /// Get hexadecimal character
121   static int GetHex(int v);
122 
123 private:
124   wxInputStream* m_inputStream; ///< Stream of document data
125   int            m_type;        ///< Type of last token
126   wxString       m_stringValue; ///< Value of last token
127   int            m_reference;   ///< Reference number of object
128   int            m_generation;  ///< Generation number of object
129   bool           m_hexString;   ///< Flag for hexadeciaml strings
130 
131 };
132 
133 /// Class representing an XRef entry (for internal use only)
134 class WXDLLIMPEXP_PDFDOC wxPdfXRefEntry
135 {
136 public:
137   /// Constructor
138   wxPdfXRefEntry();
139 
140   /// Destructor
141   virtual ~wxPdfXRefEntry();
142 
143   int m_type;    ///< Type of XRef entry
144   int m_ofs_idx; ///< Offset or index of object
145   int m_gen_ref; ///< Generation of object or reference of object stream containing the object
146 };
147 
148 WX_DECLARE_USER_EXPORTED_OBJARRAY(wxPdfXRefEntry, wxPdfXRef, WXDLLIMPEXP_PDFDOC);
149 
150 /// Class representing a PDF parser. (For internal use only)
151 class WXDLLIMPEXP_PDFDOC wxPdfParser
152 {
153 public:
154   /// Constructor
155   wxPdfParser(const wxString& filename,
156               const wxString& password = wxEmptyString);
157 
158   /// Destructor
159   virtual ~wxPdfParser();
160 
161   /// Check whether the PDF document to be parsed is valid
162   bool IsOk();
163 
164   /// Get PDF version of parsed document
GetPdfVersion()165   wxString GetPdfVersion() { return m_pdfVersion; }
166 
167   /// Get number of pages in the parsed document
168   unsigned int GetPageCount();
169 
170   /// Get the document information dictionary
171   bool GetSourceInfo(wxPdfInfo& info);
172 
173   /// Get the queue of referenced objects
GetObjectQueue()174   wxPdfObjectQueue* GetObjectQueue() { return m_objectQueue; }
175 
176   /// Get the map of referenced objects
GetObjectMap()177   wxPdfObjectMap* GetObjectMap() { return m_objectMap; }
178 
179   /// Append a referenced object to the queue
180   void AppendObject(int originalObjectId, int actualObjectId, wxPdfObject* obj);
181 
182   /// Get the resources of a specific page
183   wxPdfObject* GetPageResources(unsigned int pageno);
184 
185   /// Get the content stream collection of a specific page
186   void GetContent(unsigned int pageno, wxArrayPtrVoid& contents);
187 
188   /// Get the media box of a specific page
189   wxPdfArrayDouble* GetPageMediaBox(unsigned int pageno);
190 
191   /// Get the crop box of a specific page
192   wxPdfArrayDouble* GetPageCropBox(unsigned int pageno);
193 
194   /// Get the bleed box of a specific page
195   wxPdfArrayDouble* GetPageBleedBox(unsigned int pageno);
196 
197   /// Get the trim box of a specific page
198   wxPdfArrayDouble* GetPageTrimBox(unsigned int pageno);
199 
200   /// Get the art box of a specific page
201   wxPdfArrayDouble* GetPageArtBox(unsigned int pageno);
202 
203   /// Get the rotation of a specific page
204   int GetPageRotation (unsigned int pageno);
205 
206   /// Resolve an object
207   wxPdfObject* ResolveObject(wxPdfObject* obj);
208 
209   /// Set flag whether a stream should be decoded or not
SetUseRawStream(bool useRawStream)210   void SetUseRawStream(bool useRawStream) { m_useRawStream = useRawStream; }
211 
212   /// Get flag whether a stream should be decoded or not
GetUseRawStream()213   bool GetUseRawStream() { return m_useRawStream; }
214 
215 protected:
216   /// Get the resources of a specific page identified by a page object
217   wxPdfObject* GetPageResources(wxPdfObject* page);
218 
219   /// Get the content stream collection of a specific page
220   void GetPageContent(wxPdfObject* contentRef, wxArrayPtrVoid& contents);
221 
222   /// Get a page box
223   wxPdfArrayDouble* GetPageBox(wxPdfDictionary* page, const wxString& boxIndex);
224 
225   /// Get a page rotation
226   int GetPageRotation (wxPdfDictionary* page);
227 
228   /// Parse PDF document
229   bool ParseDocument();
230 
231   /// Setup a decryptor
232   bool SetupDecryptor();
233 
234   /// Parse the cross reference
235   bool ParseXRef();
236 
237   /// Parse the page tree of the PDF document
238   bool ParsePageTree(wxPdfDictionary* pages);
239 
240   /// Parse a cross reference section
241   wxPdfDictionary* ParseXRefSection();
242 
243   /// Parse a cross reference stream
244   bool ParseXRefStream(int ptr, bool setTrailer);
245 
246   /// Parse an object
247   wxPdfObject* ParseObject();
248 
249   /// Parse a dictionary
250   wxPdfDictionary* ParseDictionary();
251 
252   /// Parse an array
253   wxPdfArray* ParseArray();
254 
255   /// Parse a specific object
256   wxPdfObject* ParseSpecificObject(int idx);
257 
258   /// Parse a direct object
259   wxPdfObject* ParseDirectObject(int k);
260 
261   /// Parse an object from an object stream
262   wxPdfObject* ParseObjectStream(wxPdfStream* stream, int idx);
263 
264   /// Parse the content of a stream object
265   void GetStreamBytes(wxPdfStream* stream);
266 
267   /// Parse the raw content of a stream object
268   void GetStreamBytesRaw(wxPdfStream* stream);
269 
270   /// Decode a stream predictor
271   wxMemoryOutputStream* DecodePredictor(wxMemoryOutputStream* in, wxPdfObject* dicPar);
272 
273   /// Decode a stream that has the FlateDecode filter.
274   /**
275    * \param osIn the input data
276    * \return the decoded data
277    */
278   wxMemoryOutputStream* FlateDecode(wxMemoryOutputStream* osIn);
279 
280   /// Decode a stream that has the ASCIIHexDecode filter.
281   /**
282    * \param osIn the input data
283    * \return the decoded data
284    */
285   wxMemoryOutputStream* ASCIIHexDecode(wxMemoryOutputStream* osIn);
286 
287   /// Decode a stream that has the ASCII85Decode filter.
288   /**
289    * \param osIn the input data
290    * \return the decoded data
291    */
292   wxMemoryOutputStream* ASCII85Decode(wxMemoryOutputStream* osIn);
293 
294   /// Decode a stream that has the ASCII85Decode filter.
295   /**
296    * \param osIn the input data
297    * \return the decoded data
298    */
299   wxMemoryOutputStream* LZWDecode(wxMemoryOutputStream* osIn);
300 
301   /// Get wxWidgets file system
302   static wxFileSystem* GetFileSystem();
303 
304 private:
305   /// Reserve at least count cross reference entries
306   void ReserveXRef(size_t count);
307 
308   bool              m_initialized;     ///< Flag whether parser is properly initialized
309   int               m_fileSize;        ///< File size
310   wxString          m_filename;        ///< File name of PDF document
311   wxString          m_password;        ///<
312   wxString          m_pdfVersion;      ///< Version of PDF document
313   wxFSFile*         m_pdfFile;         ///< File system file object of PDF document
314   wxPdfTokenizer*   m_tokens;          ///< Tokenizer
315   wxPdfDictionary*  m_trailer;         ///< Trailer dictionary
316   wxPdfDictionary*  m_root;            ///< Root object
317   wxArrayPtrVoid    m_pages;           ///< Array of page objects
318   unsigned int      m_currentPage;     ///< Number of current page
319   bool              m_useRawStream;    ///< Flag whether to use raw stream data (without decoding)
320 
321   bool              m_encrypted;       ///< Flag whether the document is encrypted
322   wxPdfEncrypt*     m_decryptor;       ///< decryptor instance
323   wxPdfDictionary*  m_encryption;      ///< Encryption dictionary
324 
325   wxPdfObjectQueue* m_objectQueue;     ///< Queue of referenced objects
326   wxPdfObjectQueue* m_objectQueueLast; ///< Pointer to last queue element
327   wxPdfObjectMap*   m_objectMap;       ///< Map for object queue elements
328   wxPdfObjStmMap*   m_objStmCache;     ///< Cache for object streams
329   bool              m_cacheObjects;    ///< Flag whether object streams should be cached
330 
331   int               m_objNum;          ///< Number of current object
332   int               m_objGen;          ///< Generation of current object
333 
334   wxPdfXRef         m_xref;            ///< Cross reference
335 
336   static wxFileSystem* ms_fileSystem; ///< wxWidgets file system
337 };
338 
/// Number of entries in the string table of the LZW decoder
#define WXPDF_LZW_STRINGTABLE_SIZE 8192
340 
341 /// Class representing an LZW decoder. (For internal use only)
342 class WXDLLIMPEXP_PDFDOC wxPdfLzwDecoder
343 {
344 public:
345   /// Constructor
346   wxPdfLzwDecoder();
347 
348   /// Destructor
349   virtual ~wxPdfLzwDecoder();
350 
351   /// Get next code
352   int GetNextCode();
353 
354   /// Decode a byte stream
355   bool Decode(wxMemoryInputStream* dataIn, wxMemoryOutputStream* dataOut);
356 
357   /// Initialize the string table
358   void InitializeStringTable();
359 
360   /// Write decoded string into output buffer
361   void WriteString(int code);
362 
363   /// Add string to string table
364   void AddStringToTable(int oldCode, char newString);
365 
366 private:
367   wxMemoryInputStream*  m_dataIn;       ///< Encoded data stream
368   wxMemoryOutputStream* m_dataOut;      ///< Decoded data stream
369   size_t                m_dataSize;     ///< Length of encoded data stream
370   wxArrayInt            m_stringTable[WXPDF_LZW_STRINGTABLE_SIZE]; ///< Table of decoded strings
371 
372   int                   m_tableIndex;   ///< Current string table index
373   int                   m_bitsToGet;    ///< Number of bits to get from stream
374   int                   m_bytePointer;  ///< Offset in encoded data stream
375   int                   m_bitPointer;   ///< Bit offset
376   int                   m_nextData;     ///< Next data item
377   int                   m_nextBits;     ///< Next bits
378 
379   static int            ms_andTable[4]; ///< Static array of string table offsets
380 };
381 
382 #endif
383