1 /***************************************************************************
2  *   Copyright (C) 2006 by Dominik Seichter                                *
3  *   domseichter@web.de                                                    *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU Library General Public License as       *
7  *   published by the Free Software Foundation; either version 2 of the    *
8  *   License, or (at your option) any later version.                       *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU Library General Public     *
16  *   License along with this program; if not, write to the                 *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  *                                                                         *
20  *   In addition, as a special exception, the copyright holders give       *
21  *   permission to link the code of portions of this program with the      *
22  *   OpenSSL library under certain conditions as described in each         *
23  *   individual source file, and distribute linked combinations            *
24  *   including the two.                                                    *
25  *   You must obey the GNU General Public License in all respects          *
26  *   for all of the code used other than OpenSSL.  If you modify           *
27  *   file(s) with this exception, you may extend this exception to your    *
28  *   version of the file(s), but you are not obligated to do so.  If you   *
29  *   do not wish to do so, delete this exception statement from your       *
30  *   version.  If you delete this exception statement from all source      *
31  *   files in the program, then also delete it here.                       *
32  ***************************************************************************/
33 
34 #include "PdfTokenizer.h"
35 
36 #include "PdfArray.h"
37 #include "PdfDictionary.h"
38 #include "PdfEncrypt.h"
39 #include "PdfInputDevice.h"
40 #include "PdfName.h"
41 #include "PdfString.h"
42 #include "PdfReference.h"
43 #include "PdfVariant.h"
44 #include "PdfDefinesPrivate.h"
45 
46 #include <limits>
47 #include <sstream>
48 #include <memory>
49 
50 #include <stdlib.h>
51 #include <string.h>
52 
// Size handed to the token buffer by the constructors that create their own buffer.
static const size_t PDF_BUFFER      = 4096;

// Number of characters compared when matching the fixed tokens "<<" / ">>",
// "null", "true" and "false".
static const size_t DICT_SEP_LENGTH = 2;
static const size_t NULL_LENGTH     = 4;
static const size_t TRUE_LENGTH     = 4;
static const size_t FALSE_LENGTH    = 5;
59 
60 namespace PoDoFo {
61 
62 namespace PdfTokenizerNameSpace{
63 
// Number of entries in each lookup table: one slot for every possible byte value.
static const int g_MapAllocLen = 256;

// Backing storage for the character lookup tables. Each array is filled in by the
// matching gen*Map() function in this namespace.
static char g_DelMap[g_MapAllocLen] = { 0 };   // delimiter flags
static char g_WsMap[g_MapAllocLen]  = { 0 };   // whitespace flags
static char g_EscMap[g_MapAllocLen] = { 0 };   // escape letter -> escaped character
static char g_hexMap[g_MapAllocLen] = { 0 };   // hex digit -> numeric value
69 
70 // Generate the delimiter character map at runtime
71 // so that it can be derived from the more easily
72 // maintainable structures in PdfDefines.h
genDelMap()73 const char * genDelMap()
74 {
75     char* map = static_cast<char*>(g_DelMap);
76     memset( map, 0, sizeof(char) * g_MapAllocLen );
77     for (int i = 0; i < PoDoFo::s_nNumDelimiters; ++i)
78     {
79         map[static_cast<int>(PoDoFo::s_cDelimiters[i])] = 1;
80     }
81 
82     return map;
83 }
84 
85 // Generate the whitespace character map at runtime
86 // so that it can be derived from the more easily
87 // maintainable structures in PdfDefines.h
genWsMap()88 const char * genWsMap()
89 {
90     char* map = static_cast<char*>(g_WsMap);
91     memset( map, 0, sizeof(char) * g_MapAllocLen );
92     for (int i = 0; i < PoDoFo::s_nNumWhiteSpaces; ++i)
93     {
94         map[static_cast<int>(PoDoFo::s_cWhiteSpaces[i])] = 1;
95     }
96     return map;
97 }
98 
99 // Generate the escape character map at runtime
genEscMap()100 const char* genEscMap()
101 {
102     char* map = static_cast<char*>(g_EscMap);
103     memset( map, 0, sizeof(char) * g_MapAllocLen );
104 
105     map[static_cast<unsigned char>('n')] = '\n'; // Line feed (LF)
106     map[static_cast<unsigned char>('r')] = '\r'; // Carriage return (CR)
107     map[static_cast<unsigned char>('t')] = '\t'; // Horizontal tab (HT)
108     map[static_cast<unsigned char>('b')] = '\b'; // Backspace (BS)
109     map[static_cast<unsigned char>('f')] = '\f'; // Form feed (FF)
110     map[static_cast<unsigned char>(')')] = ')';
111     map[static_cast<unsigned char>('(')] = '(';
112     map[static_cast<unsigned char>('\\')] = '\\';
113 
114     return map;
115 }
116 
117 // Generate the hex character map at runtime
genHexMap()118 const char* genHexMap()
119 {
120     char* map = static_cast<char*>(g_hexMap);
121     memset( map, PdfTokenizer::HEX_NOT_FOUND, sizeof(char) * g_MapAllocLen );
122 
123     map[static_cast<unsigned char>('0')] = 0x0;
124     map[static_cast<unsigned char>('1')] = 0x1;
125     map[static_cast<unsigned char>('2')] = 0x2;
126     map[static_cast<unsigned char>('3')] = 0x3;
127     map[static_cast<unsigned char>('4')] = 0x4;
128     map[static_cast<unsigned char>('5')] = 0x5;
129     map[static_cast<unsigned char>('6')] = 0x6;
130     map[static_cast<unsigned char>('7')] = 0x7;
131     map[static_cast<unsigned char>('8')] = 0x8;
132     map[static_cast<unsigned char>('9')] = 0x9;
133     map[static_cast<unsigned char>('a')] = 0xA;
134     map[static_cast<unsigned char>('b')] = 0xB;
135     map[static_cast<unsigned char>('c')] = 0xC;
136     map[static_cast<unsigned char>('d')] = 0xD;
137     map[static_cast<unsigned char>('e')] = 0xE;
138     map[static_cast<unsigned char>('f')] = 0xF;
139     map[static_cast<unsigned char>('A')] = 0xA;
140     map[static_cast<unsigned char>('B')] = 0xB;
141     map[static_cast<unsigned char>('C')] = 0xC;
142     map[static_cast<unsigned char>('D')] = 0xD;
143     map[static_cast<unsigned char>('E')] = 0xE;
144     map[static_cast<unsigned char>('F')] = 0xF;
145 
146     return map;
147 }
148 
149 };
150 
151 const unsigned int PdfTokenizer::HEX_NOT_FOUND   = std::numeric_limits<unsigned int>::max();
152 const char * const PdfTokenizer::s_delimiterMap  = PdfTokenizerNameSpace::genDelMap();
153 const char * const PdfTokenizer::s_whitespaceMap = PdfTokenizerNameSpace::genWsMap();
154 const char * const PdfTokenizer::s_escMap        = PdfTokenizerNameSpace::genEscMap();
155 const char * const PdfTokenizer::s_hexMap        = PdfTokenizerNameSpace::genHexMap();
156 
157 const char PdfTokenizer::s_octMap[]        = {
158     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
159     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
160     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
161     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
162     0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
163     1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
164     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
165     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
166     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
167     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
168     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
170     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
173     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
174     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
175     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
182     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
183     0, 0, 0, 0, 0, 0
184 };
185 
PdfTokenizer()186 PdfTokenizer::PdfTokenizer()
187     : m_buffer( PDF_BUFFER )
188 {
189     PdfLocaleImbue(m_doubleParser);
190 }
191 
PdfTokenizer(const char * pBuffer,size_t lLen)192 PdfTokenizer::PdfTokenizer( const char* pBuffer, size_t lLen )
193     : m_device( pBuffer, lLen ), m_buffer( PDF_BUFFER )
194 {
195     PdfLocaleImbue(m_doubleParser);
196 }
197 
PdfTokenizer(const PdfRefCountedInputDevice & rDevice,const PdfRefCountedBuffer & rBuffer)198 PdfTokenizer::PdfTokenizer( const PdfRefCountedInputDevice & rDevice, const PdfRefCountedBuffer & rBuffer )
199     : m_device( rDevice ), m_buffer( rBuffer )
200 {
201     PdfLocaleImbue(m_doubleParser);
202 }
203 
~PdfTokenizer()204 PdfTokenizer::~PdfTokenizer()
205 {
206 }
207 
GetNextToken(const char * & pszToken,EPdfTokenType * peType)208 bool PdfTokenizer::GetNextToken( const char*& pszToken , EPdfTokenType* peType )
209 {
210     int  c;
211     pdf_int64  counter  = 0;
212 
213     // check first if there are queued tokens and return them first
214     if( m_deqQueque.size() )
215     {
216         TTokenizerPair pair = m_deqQueque.front();
217         m_deqQueque.pop_front();
218 
219         if( peType )
220             *peType = pair.second;
221 
222         if ( !m_buffer.GetBuffer() || m_buffer.GetSize() == 0)
223         {
224             PODOFO_RAISE_ERROR(ePdfError_InvalidHandle);
225         }
226 
227         // make sure buffer is \0 terminated
228         strncpy(m_buffer.GetBuffer(), pair.first.c_str(), m_buffer.GetSize());
229         m_buffer.GetBuffer()[m_buffer.GetSize() - 1] = 0;
230         pszToken = m_buffer.GetBuffer();
231         return true;
232     }
233 
234     if( !m_device.Device() )
235     {
236         PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
237     }
238 
239     if( peType )
240         *peType = ePdfTokenType_Token;
241 
242     while( (c = m_device.Device()->Look()) != EOF
243            && counter + 1 < static_cast<pdf_int64>(m_buffer.GetSize()) )
244     {
245         // ignore leading whitespaces
246         if( !counter && IsWhitespace( c ) )
247         {
248             // Consume the whitespace character
249             c = m_device.Device()->GetChar();
250             continue;
251         }
252         // ignore comments
253         else if( c == '%' )
254         {
255             // Consume all characters before the next line break
256 			// 2011-04-19 Ulrich Arnold: accept 0x0D, 0x0A and oX0D 0x0A as one EOL
257             do {
258                 c = m_device.Device()->GetChar();
259             } while( c != EOF && c != 0x0D  && c != 0x0A );
260 
261             if ( c == 0x0D )
262 			{
263                 if ( m_device.Device()->Look() == 0x0A )
264 	                c = m_device.Device()->GetChar();
265 			}
266             // If we've already read one or more chars of a token, return them, since
267             // comments are treated as token-delimiting whitespace. Otherwise keep reading
268             // at the start of the next line.
269             if (counter)
270                 break;
271         }
272         // special handling for << and >> tokens
273         else if( !counter && (c == '<' || c == '>' ) )
274         {
275             if( peType )
276                 *peType = ePdfTokenType_Delimiter;
277 
278             // retrieve c really from stream
279             c = m_device.Device()->GetChar();
280             m_buffer.GetBuffer()[counter] = c;
281             ++counter;
282 
283             char n = m_device.Device()->Look();
284             // Is n another < or > , ie are we opening/closing a dictionary?
285             // If so, consume that character too.
286             if( n == c )
287             {
288                 n = m_device.Device()->GetChar();
289                 m_buffer.GetBuffer()[counter] = n;
290                 ++counter;
291             }
292             // `m_buffer' contains one of < , > , << or >> ; we're done .
293             break;
294         }
295         else if( counter && (IsWhitespace( c ) || IsDelimiter( c )) )
296         {
297             // Next (unconsumed) character is a token-terminating char, so
298             // we have a complete token and can return it.
299             break;
300         }
301         else
302         {
303             // Consume the next character and add it to the token we're building.
304             c = m_device.Device()->GetChar();
305             m_buffer.GetBuffer()[counter] = c;
306             ++counter;
307 
308             if( IsDelimiter( c ) )
309             {
310                 // All delimeters except << and >> (handled above) are
311                 // one-character tokens, so if we hit one we can just return it
312                 // immediately.
313                 if( peType )
314                     *peType = ePdfTokenType_Delimiter;
315                 break;
316             }
317         }
318     }
319 
320     m_buffer.GetBuffer()[counter] = '\0';
321 
322     if( c == EOF && !counter )
323     {
324         // No characters were read before EOF, so we're out of data.
325         // Ensure the buffer points to NULL in case someone fails to check the return value.
326         pszToken = 0;
327         return false;
328     }
329 
330     pszToken = m_buffer.GetBuffer();
331     return true;
332 }
333 
IsNextToken(const char * pszToken)334 bool PdfTokenizer::IsNextToken( const char* pszToken )
335 {
336     if( !pszToken )
337     {
338         PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
339     }
340 
341     const char* pszRead;
342     bool gotToken = this->GetNextToken( pszRead, NULL );
343 
344     if (!gotToken)
345     {
346         PODOFO_RAISE_ERROR( ePdfError_UnexpectedEOF );
347     }
348 
349     return (strcmp( pszToken, pszRead ) == 0);
350 }
351 
GetNextNumber()352 pdf_long PdfTokenizer::GetNextNumber()
353 {
354     EPdfTokenType eType;
355     const char* pszRead;
356     bool gotToken = this->GetNextToken( pszRead, &eType );
357 
358     if( !gotToken )
359     {
360         PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected number" );
361     }
362 
363     char* end;
364 #ifdef _WIN64
365     pdf_long l = _strtoui64( pszRead, &end, 10 );
366 #else
367     pdf_long l = strtol( pszRead, &end, 10 );
368 #endif
369     if( end == pszRead )
370     {
371         // Don't consume the token
372         this->QuequeToken( pszRead, eType );
373         PODOFO_RAISE_ERROR_INFO( ePdfError_NoNumber, pszRead );
374     }
375 
376     return l;
377 }
378 
GetNextVariant(PdfVariant & rVariant,PdfEncrypt * pEncrypt)379 void PdfTokenizer::GetNextVariant( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
380 {
381    EPdfTokenType eTokenType;
382    const char*   pszRead;
383    bool gotToken = this->GetNextToken( pszRead, &eTokenType );
384 
385    if (!gotToken)
386    {
387        PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected variant." );
388    }
389 
390    this->GetNextVariant( pszRead, eTokenType, rVariant, pEncrypt );
391 }
392 
GetNextVariant(const char * pszToken,EPdfTokenType eType,PdfVariant & rVariant,PdfEncrypt * pEncrypt)393 void PdfTokenizer::GetNextVariant( const char* pszToken, EPdfTokenType eType, PdfVariant& rVariant, PdfEncrypt* pEncrypt )
394 {
395     EPdfDataType eDataType = this->DetermineDataType( pszToken, eType, rVariant );
396 
397     if( eDataType == ePdfDataType_Null ||
398         eDataType == ePdfDataType_Bool ||
399         eDataType == ePdfDataType_Number ||
400         eDataType == ePdfDataType_Real ||
401         eDataType == ePdfDataType_Reference )
402     {
403         // the data was already read into rVariant by the DetermineDataType function
404         return;
405     }
406 
407     this->ReadDataType( eDataType, rVariant, pEncrypt );
408 }
409 
DetermineDataType(const char * pszToken,EPdfTokenType eTokenType,PdfVariant & rVariant)410 EPdfDataType PdfTokenizer::DetermineDataType( const char* pszToken, EPdfTokenType eTokenType, PdfVariant& rVariant )
411 {
412     if( eTokenType == ePdfTokenType_Token )
413     {
414         // check for the two special datatypes
415         // null and boolean.
416         // check for numbers
417         if( strncmp( "null", pszToken, NULL_LENGTH ) == 0 )
418         {
419             rVariant = PdfVariant();
420             return ePdfDataType_Null;
421         }
422         else if( strncmp( "true", pszToken, TRUE_LENGTH ) == 0 )
423         {
424             rVariant = PdfVariant( true );
425             return ePdfDataType_Bool;
426         }
427         else if( strncmp( "false", pszToken, FALSE_LENGTH ) == 0 )
428         {
429             rVariant = PdfVariant( false );
430             return ePdfDataType_Bool;
431         }
432 
433         EPdfDataType eDataType = ePdfDataType_Number;
434         const char*  pszStart  = pszToken;
435 
436         while( *pszStart )
437         {
438             if( *pszStart == '.' )
439                 eDataType = ePdfDataType_Real;
440             else if( !(isdigit( static_cast<const unsigned char>(*pszStart) ) || *pszStart == '-' || *pszStart == '+' ) )
441             {
442                 eDataType = ePdfDataType_Unknown;
443                 break;
444             }
445 
446             ++pszStart;
447         }
448 
449         if( eDataType == ePdfDataType_Real )
450         {
451             // DOM: strtod is locale dependend,
452             //      do not use it
453             //double dVal = strtod( pszToken, NULL );
454             double dVal;
455 
456             m_doubleParser.clear(); // clear error state
457             m_doubleParser.str( pszToken );
458             if( !(m_doubleParser >> dVal) )
459             {
460                 m_doubleParser.clear(); // clear error state
461                 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, pszToken );
462             }
463 
464             rVariant = PdfVariant( dVal );
465             return ePdfDataType_Real;
466         }
467         else if( eDataType == ePdfDataType_Number )
468         {
469 #ifdef _WIN64
470             rVariant = PdfVariant( static_cast<pdf_int64>(_strtoui64( pszToken, NULL, 10 )) );
471 #else
472             rVariant = PdfVariant( static_cast<pdf_int64>(strtol( pszToken, NULL, 10 )) );
473 #endif
474             // read another two tokens to see if it is a reference
475             // we cannot be sure that there is another token
476             // on the input device, so if we hit EOF just return
477             // ePdfDataType_Number .
478             EPdfTokenType eSecondTokenType;
479             bool gotToken = this->GetNextToken( pszToken, &eSecondTokenType );
480             if (!gotToken)
481                 // No next token, so it can't be a reference
482                 return eDataType;
483             if( eSecondTokenType != ePdfTokenType_Token )
484             {
485                 this->QuequeToken( pszToken, eSecondTokenType );
486                 return eDataType;
487             }
488 
489 
490             pszStart = pszToken;
491 #ifdef _WIN64
492             pdf_long  l   = _strtoui64( pszStart, const_cast<char**>(&pszToken), 10 );
493 #else
494             long  l   = strtol( pszStart, const_cast<char**>(&pszToken), 10 );
495 #endif
496             if( pszToken == pszStart )
497             {
498                 this->QuequeToken( pszStart, eSecondTokenType );
499                 return eDataType;
500             }
501 
502             std::string backup( pszStart );
503             EPdfTokenType eThirdTokenType;
504             gotToken = this->GetNextToken( pszToken, &eThirdTokenType );
505             if (!gotToken)
506                 // No third token, so it can't be a reference
507                 return eDataType;
508             if( eThirdTokenType == ePdfTokenType_Token &&
509                 pszToken[0] == 'R' && pszToken[1] == '\0' )
510             {
511                 rVariant = PdfReference( static_cast<unsigned int>(rVariant.GetNumber()),
512                                          static_cast<const pdf_uint16>(l) );
513                 return ePdfDataType_Reference;
514             }
515             else
516             {
517                 this->QuequeToken( backup.c_str(), eSecondTokenType );
518                 this->QuequeToken( pszToken, eThirdTokenType );
519                 return eDataType;
520             }
521         }
522     }
523     else if( eTokenType == ePdfTokenType_Delimiter )
524     {
525         if( strncmp( "<<", pszToken, DICT_SEP_LENGTH ) == 0 )
526             return ePdfDataType_Dictionary;
527         else if( pszToken[0] == '[' )
528             return ePdfDataType_Array;
529         else if( pszToken[0] == '(' )
530             return ePdfDataType_String;
531         else if( pszToken[0] == '<' )
532             return ePdfDataType_HexString;
533         else if( pszToken[0] == '/' )
534             return ePdfDataType_Name;
535     }
536 
537     if( false )
538     {
539         std::ostringstream ss;
540 #if defined(_MSC_VER)  &&  _MSC_VER <= 1200
541         ss << "Got unexpected PDF data in" << __FILE__ << ", line " << __LINE__
542 #else
543         ss << "Got unexpected PDF data in" << PODOFO__FUNCTION__
544 #endif
545            << ": \""
546            << pszToken
547            << "\". Current read offset is "
548            << m_device.Device()->Tell()
549            << " which should be around the problem.\n";
550         PdfError::DebugMessage(ss.str().c_str());
551     }
552 
553     return ePdfDataType_Unknown;
554 }
555 
ReadDataType(EPdfDataType eDataType,PdfVariant & rVariant,PdfEncrypt * pEncrypt)556 void PdfTokenizer::ReadDataType( EPdfDataType eDataType, PdfVariant& rVariant, PdfEncrypt* pEncrypt )
557 {
558     switch( eDataType )
559     {
560         case ePdfDataType_Dictionary:
561             this->ReadDictionary( rVariant, pEncrypt );
562             break;
563         case ePdfDataType_Array:
564             this->ReadArray( rVariant, pEncrypt );
565             break;
566         case ePdfDataType_String:
567             this->ReadString( rVariant, pEncrypt );
568             break;
569         case ePdfDataType_HexString:
570             this->ReadHexString( rVariant, pEncrypt );
571             break;
572         case ePdfDataType_Name:
573             this->ReadName( rVariant );
574             break;
575 
576         // The following datatypes are not handled by read datatype
577         // but are already parsed by DetermineDatatype
578         case ePdfDataType_Null:
579         case ePdfDataType_Bool:
580         case ePdfDataType_Number:
581         case ePdfDataType_Real:
582         case ePdfDataType_Reference:
583         case ePdfDataType_Unknown:
584         case ePdfDataType_RawData:
585 
586         default:
587         {
588             PdfError::LogMessage( eLogSeverity_Debug, "Got Datatype: %i\n", eDataType );
589             PODOFO_RAISE_ERROR( ePdfError_InvalidDataType );
590         }
591     }
592 }
593 
ReadDictionary(PdfVariant & rVariant,PdfEncrypt * pEncrypt)594 void PdfTokenizer::ReadDictionary( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
595 {
596     PdfVariant    val;
597     PdfName       key;
598     PdfDictionary dict;
599     EPdfTokenType eType;
600     const char *  pszToken;
601     PODOFO_UNIQUEU_PTR<std::vector<char> > contentsHexBuffer;
602 
603     for( ;; )
604     {
605         bool gotToken = this->GetNextToken( pszToken, &eType );
606         if (!gotToken)
607         {
608             PODOFO_RAISE_ERROR_INFO(ePdfError_UnexpectedEOF, "Expected dictionary key name or >> delim.");
609         }
610         if( eType == ePdfTokenType_Delimiter && strncmp( ">>", pszToken, DICT_SEP_LENGTH ) == 0 )
611             break;
612 
613         this->GetNextVariant( pszToken, eType, val, pEncrypt );
614         // Convert the read variant to a name; throws InvalidDataType if not a name.
615         key = val.GetName();
616 
617         // Try to get the next variant
618         gotToken = this->GetNextToken( pszToken, &eType );
619         if ( !gotToken )
620         {
621             PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected variant." );
622         }
623 
624         EPdfDataType eDataType = this->DetermineDataType( pszToken, eType, val );
625         if ( key == "Contents" && eDataType == ePdfDataType_HexString )
626         {
627             // 'Contents' key in signature dictionaries is an unencrypted Hex string:
628             // save the string buffer for later check if it needed decryption
629             contentsHexBuffer = PODOFO_UNIQUEU_PTR<std::vector<char> >( new std::vector<char>() );
630             ReadHexString( *contentsHexBuffer );
631             continue;
632         }
633 
634         switch ( eDataType )
635         {
636             case ePdfDataType_Null:
637             case ePdfDataType_Bool:
638             case ePdfDataType_Number:
639             case ePdfDataType_Real:
640             case ePdfDataType_Reference:
641             {
642                 // the data was already read into rVariant by the DetermineDataType function
643                 break;
644             }
645             case ePdfDataType_Name:
646             case ePdfDataType_String:
647             case ePdfDataType_HexString:
648             case ePdfDataType_Array:
649             case ePdfDataType_Dictionary:
650             {
651                 this->ReadDataType( eDataType, val, pEncrypt );
652                 break;
653             }
654             case ePdfDataType_RawData:
655             case ePdfDataType_Unknown:
656             default:
657             {
658                 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "Unexpected data type" );
659             }
660         }
661 
662         dict.AddKey( key, val );
663     }
664 
665     if ( contentsHexBuffer.get() != NULL )
666     {
667         PdfObject *type = dict.GetKey( "Type" );
668         // "Contents" is unencrypted in /Type/Sig and /Type/DocTimeStamp dictionaries
669         // https://issues.apache.org/jira/browse/PDFBOX-3173
670         bool contentsUnencrypted = type != NULL && type->GetDataType() == ePdfDataType_Name &&
671             (type->GetName() == PdfName( "Sig" ) || type->GetName() == PdfName( "DocTimeStamp" ));
672 
673         PdfEncrypt *encrypt = NULL;
674         if ( !contentsUnencrypted )
675             encrypt = pEncrypt;
676 
677         PdfString string;
678         string.SetHexData( contentsHexBuffer->size() ? &(*contentsHexBuffer)[0] : "", contentsHexBuffer->size(), encrypt );
679 
680         val = string;
681         dict.AddKey( "Contents", val );
682     }
683 
684     rVariant = dict;
685 }
686 
ReadArray(PdfVariant & rVariant,PdfEncrypt * pEncrypt)687 void PdfTokenizer::ReadArray( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
688 {
689     const char*   pszToken;
690     EPdfTokenType eType;
691     PdfVariant    var;
692     PdfArray      array;
693 
694     for( ;; )
695     {
696         bool gotToken = this->GetNextToken( pszToken, &eType );
697         if (!gotToken)
698         {
699             PODOFO_RAISE_ERROR_INFO(ePdfError_UnexpectedEOF, "Expected array item or ] delim.");
700         }
701         if( eType == ePdfTokenType_Delimiter && pszToken[0] == ']' )
702             break;
703 
704         this->GetNextVariant( pszToken, eType, var, pEncrypt );
705         array.push_back( var );
706     }
707 
708     rVariant = array;
709 }
710 
ReadString(PdfVariant & rVariant,PdfEncrypt * pEncrypt)711 void PdfTokenizer::ReadString( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
712 {
713     int               c;
714 
715     bool              bEscape       = false;
716     bool              bOctEscape    = false;
717     int               nOctCount     = 0;
718     char              cOctValue     = 0;
719     int               nBalanceCount = 0; // Balanced parathesis do not have to be escaped in strings
720 
721     m_vecBuffer.clear();
722 
723     while( (c = m_device.Device()->Look()) != EOF )
724     {
725         // end of stream reached
726         if( !bEscape )
727         {
728             // Handle raw characters
729             c = m_device.Device()->GetChar();
730             if( !nBalanceCount && c == ')' )
731                 break;
732 
733             if( c == '(' )
734                 ++nBalanceCount;
735             else if( c == ')' )
736                 --nBalanceCount;
737 
738             bEscape = (c == '\\');
739             if( !bEscape )
740                 m_vecBuffer.push_back( static_cast<char>(c) );
741         }
742         else
743         {
744             // Handle escape sequences
745             if( bOctEscape || s_octMap[c & 0xff] )
746                 // The last character we have read was a '\\',
747                 // so we check now for a digit to find stuff like \005
748                 bOctEscape = true;
749 
750             if( bOctEscape )
751             {
752                 // Handle octal escape sequences
753                 ++nOctCount;
754 
755                 if( !s_octMap[c & 0xff] )
756                 {
757                     // No octal character anymore,
758                     // so the octal sequence must be ended
759                     // and the character has to be treated as normal character!
760                     m_vecBuffer.push_back ( cOctValue );
761                     bEscape    = false;
762                     bOctEscape = false;
763                     nOctCount  = 0;
764                     cOctValue  = 0;
765                     continue;
766                 }
767 
768                 c = m_device.Device()->GetChar();
769                 cOctValue <<= 3;
770                 cOctValue  |= ((c-'0') & 0x07);
771 
772                 if( nOctCount > 2 )
773                 {
774                     m_vecBuffer.push_back ( cOctValue );
775                     bEscape    = false;
776                     bOctEscape = false;
777                     nOctCount  = 0;
778                     cOctValue  = 0;
779                 }
780             }
781             else
782             {
783                 // Handle plain escape sequences
784                 const char & code = s_escMap[m_device.Device()->GetChar() & 0xff];
785                 if( code )
786                     m_vecBuffer.push_back( code );
787 
788                 bEscape = false;
789             }
790         }
791     }
792 
793     // In case the string ends with a octal escape sequence
794     if( bOctEscape )
795         m_vecBuffer.push_back ( cOctValue );
796 
797     if( m_vecBuffer.size() )
798     {
799         if( pEncrypt )
800         {
801             pdf_long outLen = m_vecBuffer.size() - pEncrypt->CalculateStreamOffset();
802             char * outBuffer = new char[outLen + 16 - (outLen % 16)];
803             pEncrypt->Decrypt( reinterpret_cast<unsigned char*>(&(m_vecBuffer[0])),
804                               static_cast<unsigned int>(m_vecBuffer.size()),
805                               reinterpret_cast<unsigned char*>(outBuffer), outLen);
806 
807             rVariant = PdfString( outBuffer, outLen );
808 
809             delete[] outBuffer;
810         }
811         else
812         {
813             rVariant = PdfString( &(m_vecBuffer[0]), m_vecBuffer.size() );
814         }
815     }
816     else
817     {
818         rVariant = PdfString("");
819     }
820 }
821 
ReadHexString(PdfVariant & rVariant,PdfEncrypt * pEncrypt)822 void PdfTokenizer::ReadHexString( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
823 {
824     ReadHexString( m_vecBuffer );
825 
826     PdfString string;
827     string.SetHexData( m_vecBuffer.size() ? &(m_vecBuffer[0]) : "", m_vecBuffer.size(), pEncrypt );
828 
829     rVariant = string;
830 }
831 
ReadHexString(std::vector<char> & rVecBuffer)832 void PdfTokenizer::ReadHexString( std::vector<char>& rVecBuffer)
833 {
834     rVecBuffer.clear();
835     int        c;
836 
837     while( (c = m_device.Device()->GetChar()) != EOF )
838     {
839         // end of stream reached
840         if( c == '>' )
841             break;
842 
843         // only a hex digits
844         if( isdigit( c ) ||
845             ( c >= 'A' && c <= 'F') ||
846             ( c >= 'a' && c <= 'f'))
847             rVecBuffer.push_back( c );
848     }
849 
850     // pad to an even length if necessary
851     if(rVecBuffer.size() % 2 )
852         rVecBuffer.push_back( '0' );
853 }
854 
ReadName(PdfVariant & rVariant)855 void PdfTokenizer::ReadName( PdfVariant& rVariant )
856 {
857     EPdfTokenType eType;
858     const char*   pszToken;
859 
860     // Do special checking for empty names
861     // as GetNextToken will ignore white spaces
862     // and we have to take care for stuff like:
863     // 10 0 obj / endobj
864     // which stupid but legal PDF
865     int c = m_device.Device()->Look();
866     if( IsWhitespace( c ) ) // Delimeters are handled correctly by GetNextToken
867     {
868         // We are an empty PdfName
869         rVariant = PdfName();
870         return;
871     }
872 
873     bool gotToken = this->GetNextToken( pszToken, &eType );
874     if( !gotToken || eType != ePdfTokenType_Token )
875     {
876         // We got an empty name which is legal according to the PDF specification
877         // Some weird PDFs even use them.
878         rVariant = PdfName();
879 
880         // Enqueue the token again
881         if( gotToken )
882             QuequeToken( pszToken, eType );
883     }
884     else
885         rVariant = PdfName::FromEscaped( pszToken );
886 }
887 
QuequeToken(const char * pszToken,EPdfTokenType eType)888 void PdfTokenizer::QuequeToken( const char* pszToken, EPdfTokenType eType )
889 {
890     m_deqQueque.push_back( TTokenizerPair( std::string( pszToken ), eType ) );
891 }
892 
893 };
894