1 /***************************************************************************
2 * Copyright (C) 2006 by Dominik Seichter *
3 * domseichter@web.de *
4 * *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Library General Public License as *
7 * published by the Free Software Foundation; either version 2 of the *
8 * License, or (at your option) any later version. *
9 * *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU Library General Public *
16 * License along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 * *
20 * In addition, as a special exception, the copyright holders give *
21 * permission to link the code of portions of this program with the *
22 * OpenSSL library under certain conditions as described in each *
23 * individual source file, and distribute linked combinations *
24 * including the two. *
25 * You must obey the GNU General Public License in all respects *
26 * for all of the code used other than OpenSSL. If you modify *
27 * file(s) with this exception, you may extend this exception to your *
28 * version of the file(s), but you are not obligated to do so. If you *
29 * do not wish to do so, delete this exception statement from your *
30 * version. If you delete this exception statement from all source *
31 * files in the program, then also delete it here. *
32 ***************************************************************************/
33
34 #include "PdfTokenizer.h"
35
36 #include "PdfArray.h"
37 #include "PdfDictionary.h"
38 #include "PdfEncrypt.h"
39 #include "PdfInputDevice.h"
40 #include "PdfName.h"
41 #include "PdfString.h"
42 #include "PdfReference.h"
43 #include "PdfVariant.h"
44 #include "PdfDefinesPrivate.h"
45
46 #include <limits>
47 #include <sstream>
48 #include <memory>
49
50 #include <stdlib.h>
51 #include <string.h>
52
53 #define PDF_BUFFER 4096
54
55 #define DICT_SEP_LENGTH 2
56 #define NULL_LENGTH 4
57 #define TRUE_LENGTH 4
58 #define FALSE_LENGTH 5
59
60 namespace PoDoFo {
61
62 namespace PdfTokenizerNameSpace{
63
64 static const int g_MapAllocLen = 256;
65 static char g_DelMap[g_MapAllocLen] = { 0 };
66 static char g_WsMap[g_MapAllocLen] = { 0 };
67 static char g_EscMap[g_MapAllocLen] = { 0 };
68 static char g_hexMap[g_MapAllocLen] = { 0 };
69
70 // Generate the delimiter character map at runtime
71 // so that it can be derived from the more easily
72 // maintainable structures in PdfDefines.h
genDelMap()73 const char * genDelMap()
74 {
75 char* map = static_cast<char*>(g_DelMap);
76 memset( map, 0, sizeof(char) * g_MapAllocLen );
77 for (int i = 0; i < PoDoFo::s_nNumDelimiters; ++i)
78 {
79 map[static_cast<int>(PoDoFo::s_cDelimiters[i])] = 1;
80 }
81
82 return map;
83 }
84
85 // Generate the whitespace character map at runtime
86 // so that it can be derived from the more easily
87 // maintainable structures in PdfDefines.h
genWsMap()88 const char * genWsMap()
89 {
90 char* map = static_cast<char*>(g_WsMap);
91 memset( map, 0, sizeof(char) * g_MapAllocLen );
92 for (int i = 0; i < PoDoFo::s_nNumWhiteSpaces; ++i)
93 {
94 map[static_cast<int>(PoDoFo::s_cWhiteSpaces[i])] = 1;
95 }
96 return map;
97 }
98
99 // Generate the escape character map at runtime
genEscMap()100 const char* genEscMap()
101 {
102 char* map = static_cast<char*>(g_EscMap);
103 memset( map, 0, sizeof(char) * g_MapAllocLen );
104
105 map[static_cast<unsigned char>('n')] = '\n'; // Line feed (LF)
106 map[static_cast<unsigned char>('r')] = '\r'; // Carriage return (CR)
107 map[static_cast<unsigned char>('t')] = '\t'; // Horizontal tab (HT)
108 map[static_cast<unsigned char>('b')] = '\b'; // Backspace (BS)
109 map[static_cast<unsigned char>('f')] = '\f'; // Form feed (FF)
110 map[static_cast<unsigned char>(')')] = ')';
111 map[static_cast<unsigned char>('(')] = '(';
112 map[static_cast<unsigned char>('\\')] = '\\';
113
114 return map;
115 }
116
117 // Generate the hex character map at runtime
genHexMap()118 const char* genHexMap()
119 {
120 char* map = static_cast<char*>(g_hexMap);
121 memset( map, PdfTokenizer::HEX_NOT_FOUND, sizeof(char) * g_MapAllocLen );
122
123 map[static_cast<unsigned char>('0')] = 0x0;
124 map[static_cast<unsigned char>('1')] = 0x1;
125 map[static_cast<unsigned char>('2')] = 0x2;
126 map[static_cast<unsigned char>('3')] = 0x3;
127 map[static_cast<unsigned char>('4')] = 0x4;
128 map[static_cast<unsigned char>('5')] = 0x5;
129 map[static_cast<unsigned char>('6')] = 0x6;
130 map[static_cast<unsigned char>('7')] = 0x7;
131 map[static_cast<unsigned char>('8')] = 0x8;
132 map[static_cast<unsigned char>('9')] = 0x9;
133 map[static_cast<unsigned char>('a')] = 0xA;
134 map[static_cast<unsigned char>('b')] = 0xB;
135 map[static_cast<unsigned char>('c')] = 0xC;
136 map[static_cast<unsigned char>('d')] = 0xD;
137 map[static_cast<unsigned char>('e')] = 0xE;
138 map[static_cast<unsigned char>('f')] = 0xF;
139 map[static_cast<unsigned char>('A')] = 0xA;
140 map[static_cast<unsigned char>('B')] = 0xB;
141 map[static_cast<unsigned char>('C')] = 0xC;
142 map[static_cast<unsigned char>('D')] = 0xD;
143 map[static_cast<unsigned char>('E')] = 0xE;
144 map[static_cast<unsigned char>('F')] = 0xF;
145
146 return map;
147 }
148
149 };
150
151 const unsigned int PdfTokenizer::HEX_NOT_FOUND = std::numeric_limits<unsigned int>::max();
152 const char * const PdfTokenizer::s_delimiterMap = PdfTokenizerNameSpace::genDelMap();
153 const char * const PdfTokenizer::s_whitespaceMap = PdfTokenizerNameSpace::genWsMap();
154 const char * const PdfTokenizer::s_escMap = PdfTokenizerNameSpace::genEscMap();
155 const char * const PdfTokenizer::s_hexMap = PdfTokenizerNameSpace::genHexMap();
156
157 const char PdfTokenizer::s_octMap[] = {
158 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
163 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
182 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
183 0, 0, 0, 0, 0, 0
184 };
185
PdfTokenizer()186 PdfTokenizer::PdfTokenizer()
187 : m_buffer( PDF_BUFFER )
188 {
189 PdfLocaleImbue(m_doubleParser);
190 }
191
PdfTokenizer(const char * pBuffer,size_t lLen)192 PdfTokenizer::PdfTokenizer( const char* pBuffer, size_t lLen )
193 : m_device( pBuffer, lLen ), m_buffer( PDF_BUFFER )
194 {
195 PdfLocaleImbue(m_doubleParser);
196 }
197
PdfTokenizer(const PdfRefCountedInputDevice & rDevice,const PdfRefCountedBuffer & rBuffer)198 PdfTokenizer::PdfTokenizer( const PdfRefCountedInputDevice & rDevice, const PdfRefCountedBuffer & rBuffer )
199 : m_device( rDevice ), m_buffer( rBuffer )
200 {
201 PdfLocaleImbue(m_doubleParser);
202 }
203
~PdfTokenizer()204 PdfTokenizer::~PdfTokenizer()
205 {
206 }
207
GetNextToken(const char * & pszToken,EPdfTokenType * peType)208 bool PdfTokenizer::GetNextToken( const char*& pszToken , EPdfTokenType* peType )
209 {
210 int c;
211 pdf_int64 counter = 0;
212
213 // check first if there are queued tokens and return them first
214 if( m_deqQueque.size() )
215 {
216 TTokenizerPair pair = m_deqQueque.front();
217 m_deqQueque.pop_front();
218
219 if( peType )
220 *peType = pair.second;
221
222 if ( !m_buffer.GetBuffer() || m_buffer.GetSize() == 0)
223 {
224 PODOFO_RAISE_ERROR(ePdfError_InvalidHandle);
225 }
226
227 // make sure buffer is \0 terminated
228 strncpy(m_buffer.GetBuffer(), pair.first.c_str(), m_buffer.GetSize());
229 m_buffer.GetBuffer()[m_buffer.GetSize() - 1] = 0;
230 pszToken = m_buffer.GetBuffer();
231 return true;
232 }
233
234 if( !m_device.Device() )
235 {
236 PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
237 }
238
239 if( peType )
240 *peType = ePdfTokenType_Token;
241
242 while( (c = m_device.Device()->Look()) != EOF
243 && counter + 1 < static_cast<pdf_int64>(m_buffer.GetSize()) )
244 {
245 // ignore leading whitespaces
246 if( !counter && IsWhitespace( c ) )
247 {
248 // Consume the whitespace character
249 c = m_device.Device()->GetChar();
250 continue;
251 }
252 // ignore comments
253 else if( c == '%' )
254 {
255 // Consume all characters before the next line break
256 // 2011-04-19 Ulrich Arnold: accept 0x0D, 0x0A and oX0D 0x0A as one EOL
257 do {
258 c = m_device.Device()->GetChar();
259 } while( c != EOF && c != 0x0D && c != 0x0A );
260
261 if ( c == 0x0D )
262 {
263 if ( m_device.Device()->Look() == 0x0A )
264 c = m_device.Device()->GetChar();
265 }
266 // If we've already read one or more chars of a token, return them, since
267 // comments are treated as token-delimiting whitespace. Otherwise keep reading
268 // at the start of the next line.
269 if (counter)
270 break;
271 }
272 // special handling for << and >> tokens
273 else if( !counter && (c == '<' || c == '>' ) )
274 {
275 if( peType )
276 *peType = ePdfTokenType_Delimiter;
277
278 // retrieve c really from stream
279 c = m_device.Device()->GetChar();
280 m_buffer.GetBuffer()[counter] = c;
281 ++counter;
282
283 char n = m_device.Device()->Look();
284 // Is n another < or > , ie are we opening/closing a dictionary?
285 // If so, consume that character too.
286 if( n == c )
287 {
288 n = m_device.Device()->GetChar();
289 m_buffer.GetBuffer()[counter] = n;
290 ++counter;
291 }
292 // `m_buffer' contains one of < , > , << or >> ; we're done .
293 break;
294 }
295 else if( counter && (IsWhitespace( c ) || IsDelimiter( c )) )
296 {
297 // Next (unconsumed) character is a token-terminating char, so
298 // we have a complete token and can return it.
299 break;
300 }
301 else
302 {
303 // Consume the next character and add it to the token we're building.
304 c = m_device.Device()->GetChar();
305 m_buffer.GetBuffer()[counter] = c;
306 ++counter;
307
308 if( IsDelimiter( c ) )
309 {
310 // All delimeters except << and >> (handled above) are
311 // one-character tokens, so if we hit one we can just return it
312 // immediately.
313 if( peType )
314 *peType = ePdfTokenType_Delimiter;
315 break;
316 }
317 }
318 }
319
320 m_buffer.GetBuffer()[counter] = '\0';
321
322 if( c == EOF && !counter )
323 {
324 // No characters were read before EOF, so we're out of data.
325 // Ensure the buffer points to NULL in case someone fails to check the return value.
326 pszToken = 0;
327 return false;
328 }
329
330 pszToken = m_buffer.GetBuffer();
331 return true;
332 }
333
IsNextToken(const char * pszToken)334 bool PdfTokenizer::IsNextToken( const char* pszToken )
335 {
336 if( !pszToken )
337 {
338 PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
339 }
340
341 const char* pszRead;
342 bool gotToken = this->GetNextToken( pszRead, NULL );
343
344 if (!gotToken)
345 {
346 PODOFO_RAISE_ERROR( ePdfError_UnexpectedEOF );
347 }
348
349 return (strcmp( pszToken, pszRead ) == 0);
350 }
351
GetNextNumber()352 pdf_long PdfTokenizer::GetNextNumber()
353 {
354 EPdfTokenType eType;
355 const char* pszRead;
356 bool gotToken = this->GetNextToken( pszRead, &eType );
357
358 if( !gotToken )
359 {
360 PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected number" );
361 }
362
363 char* end;
364 #ifdef _WIN64
365 pdf_long l = _strtoui64( pszRead, &end, 10 );
366 #else
367 pdf_long l = strtol( pszRead, &end, 10 );
368 #endif
369 if( end == pszRead )
370 {
371 // Don't consume the token
372 this->QuequeToken( pszRead, eType );
373 PODOFO_RAISE_ERROR_INFO( ePdfError_NoNumber, pszRead );
374 }
375
376 return l;
377 }
378
GetNextVariant(PdfVariant & rVariant,PdfEncrypt * pEncrypt)379 void PdfTokenizer::GetNextVariant( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
380 {
381 EPdfTokenType eTokenType;
382 const char* pszRead;
383 bool gotToken = this->GetNextToken( pszRead, &eTokenType );
384
385 if (!gotToken)
386 {
387 PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected variant." );
388 }
389
390 this->GetNextVariant( pszRead, eTokenType, rVariant, pEncrypt );
391 }
392
GetNextVariant(const char * pszToken,EPdfTokenType eType,PdfVariant & rVariant,PdfEncrypt * pEncrypt)393 void PdfTokenizer::GetNextVariant( const char* pszToken, EPdfTokenType eType, PdfVariant& rVariant, PdfEncrypt* pEncrypt )
394 {
395 EPdfDataType eDataType = this->DetermineDataType( pszToken, eType, rVariant );
396
397 if( eDataType == ePdfDataType_Null ||
398 eDataType == ePdfDataType_Bool ||
399 eDataType == ePdfDataType_Number ||
400 eDataType == ePdfDataType_Real ||
401 eDataType == ePdfDataType_Reference )
402 {
403 // the data was already read into rVariant by the DetermineDataType function
404 return;
405 }
406
407 this->ReadDataType( eDataType, rVariant, pEncrypt );
408 }
409
DetermineDataType(const char * pszToken,EPdfTokenType eTokenType,PdfVariant & rVariant)410 EPdfDataType PdfTokenizer::DetermineDataType( const char* pszToken, EPdfTokenType eTokenType, PdfVariant& rVariant )
411 {
412 if( eTokenType == ePdfTokenType_Token )
413 {
414 // check for the two special datatypes
415 // null and boolean.
416 // check for numbers
417 if( strncmp( "null", pszToken, NULL_LENGTH ) == 0 )
418 {
419 rVariant = PdfVariant();
420 return ePdfDataType_Null;
421 }
422 else if( strncmp( "true", pszToken, TRUE_LENGTH ) == 0 )
423 {
424 rVariant = PdfVariant( true );
425 return ePdfDataType_Bool;
426 }
427 else if( strncmp( "false", pszToken, FALSE_LENGTH ) == 0 )
428 {
429 rVariant = PdfVariant( false );
430 return ePdfDataType_Bool;
431 }
432
433 EPdfDataType eDataType = ePdfDataType_Number;
434 const char* pszStart = pszToken;
435
436 while( *pszStart )
437 {
438 if( *pszStart == '.' )
439 eDataType = ePdfDataType_Real;
440 else if( !(isdigit( static_cast<const unsigned char>(*pszStart) ) || *pszStart == '-' || *pszStart == '+' ) )
441 {
442 eDataType = ePdfDataType_Unknown;
443 break;
444 }
445
446 ++pszStart;
447 }
448
449 if( eDataType == ePdfDataType_Real )
450 {
451 // DOM: strtod is locale dependend,
452 // do not use it
453 //double dVal = strtod( pszToken, NULL );
454 double dVal;
455
456 m_doubleParser.clear(); // clear error state
457 m_doubleParser.str( pszToken );
458 if( !(m_doubleParser >> dVal) )
459 {
460 m_doubleParser.clear(); // clear error state
461 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, pszToken );
462 }
463
464 rVariant = PdfVariant( dVal );
465 return ePdfDataType_Real;
466 }
467 else if( eDataType == ePdfDataType_Number )
468 {
469 #ifdef _WIN64
470 rVariant = PdfVariant( static_cast<pdf_int64>(_strtoui64( pszToken, NULL, 10 )) );
471 #else
472 rVariant = PdfVariant( static_cast<pdf_int64>(strtol( pszToken, NULL, 10 )) );
473 #endif
474 // read another two tokens to see if it is a reference
475 // we cannot be sure that there is another token
476 // on the input device, so if we hit EOF just return
477 // ePdfDataType_Number .
478 EPdfTokenType eSecondTokenType;
479 bool gotToken = this->GetNextToken( pszToken, &eSecondTokenType );
480 if (!gotToken)
481 // No next token, so it can't be a reference
482 return eDataType;
483 if( eSecondTokenType != ePdfTokenType_Token )
484 {
485 this->QuequeToken( pszToken, eSecondTokenType );
486 return eDataType;
487 }
488
489
490 pszStart = pszToken;
491 #ifdef _WIN64
492 pdf_long l = _strtoui64( pszStart, const_cast<char**>(&pszToken), 10 );
493 #else
494 long l = strtol( pszStart, const_cast<char**>(&pszToken), 10 );
495 #endif
496 if( pszToken == pszStart )
497 {
498 this->QuequeToken( pszStart, eSecondTokenType );
499 return eDataType;
500 }
501
502 std::string backup( pszStart );
503 EPdfTokenType eThirdTokenType;
504 gotToken = this->GetNextToken( pszToken, &eThirdTokenType );
505 if (!gotToken)
506 // No third token, so it can't be a reference
507 return eDataType;
508 if( eThirdTokenType == ePdfTokenType_Token &&
509 pszToken[0] == 'R' && pszToken[1] == '\0' )
510 {
511 rVariant = PdfReference( static_cast<unsigned int>(rVariant.GetNumber()),
512 static_cast<const pdf_uint16>(l) );
513 return ePdfDataType_Reference;
514 }
515 else
516 {
517 this->QuequeToken( backup.c_str(), eSecondTokenType );
518 this->QuequeToken( pszToken, eThirdTokenType );
519 return eDataType;
520 }
521 }
522 }
523 else if( eTokenType == ePdfTokenType_Delimiter )
524 {
525 if( strncmp( "<<", pszToken, DICT_SEP_LENGTH ) == 0 )
526 return ePdfDataType_Dictionary;
527 else if( pszToken[0] == '[' )
528 return ePdfDataType_Array;
529 else if( pszToken[0] == '(' )
530 return ePdfDataType_String;
531 else if( pszToken[0] == '<' )
532 return ePdfDataType_HexString;
533 else if( pszToken[0] == '/' )
534 return ePdfDataType_Name;
535 }
536
537 if( false )
538 {
539 std::ostringstream ss;
540 #if defined(_MSC_VER) && _MSC_VER <= 1200
541 ss << "Got unexpected PDF data in" << __FILE__ << ", line " << __LINE__
542 #else
543 ss << "Got unexpected PDF data in" << PODOFO__FUNCTION__
544 #endif
545 << ": \""
546 << pszToken
547 << "\". Current read offset is "
548 << m_device.Device()->Tell()
549 << " which should be around the problem.\n";
550 PdfError::DebugMessage(ss.str().c_str());
551 }
552
553 return ePdfDataType_Unknown;
554 }
555
ReadDataType(EPdfDataType eDataType,PdfVariant & rVariant,PdfEncrypt * pEncrypt)556 void PdfTokenizer::ReadDataType( EPdfDataType eDataType, PdfVariant& rVariant, PdfEncrypt* pEncrypt )
557 {
558 switch( eDataType )
559 {
560 case ePdfDataType_Dictionary:
561 this->ReadDictionary( rVariant, pEncrypt );
562 break;
563 case ePdfDataType_Array:
564 this->ReadArray( rVariant, pEncrypt );
565 break;
566 case ePdfDataType_String:
567 this->ReadString( rVariant, pEncrypt );
568 break;
569 case ePdfDataType_HexString:
570 this->ReadHexString( rVariant, pEncrypt );
571 break;
572 case ePdfDataType_Name:
573 this->ReadName( rVariant );
574 break;
575
576 // The following datatypes are not handled by read datatype
577 // but are already parsed by DetermineDatatype
578 case ePdfDataType_Null:
579 case ePdfDataType_Bool:
580 case ePdfDataType_Number:
581 case ePdfDataType_Real:
582 case ePdfDataType_Reference:
583 case ePdfDataType_Unknown:
584 case ePdfDataType_RawData:
585
586 default:
587 {
588 PdfError::LogMessage( eLogSeverity_Debug, "Got Datatype: %i\n", eDataType );
589 PODOFO_RAISE_ERROR( ePdfError_InvalidDataType );
590 }
591 }
592 }
593
ReadDictionary(PdfVariant & rVariant,PdfEncrypt * pEncrypt)594 void PdfTokenizer::ReadDictionary( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
595 {
596 PdfVariant val;
597 PdfName key;
598 PdfDictionary dict;
599 EPdfTokenType eType;
600 const char * pszToken;
601 PODOFO_UNIQUEU_PTR<std::vector<char> > contentsHexBuffer;
602
603 for( ;; )
604 {
605 bool gotToken = this->GetNextToken( pszToken, &eType );
606 if (!gotToken)
607 {
608 PODOFO_RAISE_ERROR_INFO(ePdfError_UnexpectedEOF, "Expected dictionary key name or >> delim.");
609 }
610 if( eType == ePdfTokenType_Delimiter && strncmp( ">>", pszToken, DICT_SEP_LENGTH ) == 0 )
611 break;
612
613 this->GetNextVariant( pszToken, eType, val, pEncrypt );
614 // Convert the read variant to a name; throws InvalidDataType if not a name.
615 key = val.GetName();
616
617 // Try to get the next variant
618 gotToken = this->GetNextToken( pszToken, &eType );
619 if ( !gotToken )
620 {
621 PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected variant." );
622 }
623
624 EPdfDataType eDataType = this->DetermineDataType( pszToken, eType, val );
625 if ( key == "Contents" && eDataType == ePdfDataType_HexString )
626 {
627 // 'Contents' key in signature dictionaries is an unencrypted Hex string:
628 // save the string buffer for later check if it needed decryption
629 contentsHexBuffer = PODOFO_UNIQUEU_PTR<std::vector<char> >( new std::vector<char>() );
630 ReadHexString( *contentsHexBuffer );
631 continue;
632 }
633
634 switch ( eDataType )
635 {
636 case ePdfDataType_Null:
637 case ePdfDataType_Bool:
638 case ePdfDataType_Number:
639 case ePdfDataType_Real:
640 case ePdfDataType_Reference:
641 {
642 // the data was already read into rVariant by the DetermineDataType function
643 break;
644 }
645 case ePdfDataType_Name:
646 case ePdfDataType_String:
647 case ePdfDataType_HexString:
648 case ePdfDataType_Array:
649 case ePdfDataType_Dictionary:
650 {
651 this->ReadDataType( eDataType, val, pEncrypt );
652 break;
653 }
654 case ePdfDataType_RawData:
655 case ePdfDataType_Unknown:
656 default:
657 {
658 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "Unexpected data type" );
659 }
660 }
661
662 dict.AddKey( key, val );
663 }
664
665 if ( contentsHexBuffer.get() != NULL )
666 {
667 PdfObject *type = dict.GetKey( "Type" );
668 // "Contents" is unencrypted in /Type/Sig and /Type/DocTimeStamp dictionaries
669 // https://issues.apache.org/jira/browse/PDFBOX-3173
670 bool contentsUnencrypted = type != NULL && type->GetDataType() == ePdfDataType_Name &&
671 (type->GetName() == PdfName( "Sig" ) || type->GetName() == PdfName( "DocTimeStamp" ));
672
673 PdfEncrypt *encrypt = NULL;
674 if ( !contentsUnencrypted )
675 encrypt = pEncrypt;
676
677 PdfString string;
678 string.SetHexData( contentsHexBuffer->size() ? &(*contentsHexBuffer)[0] : "", contentsHexBuffer->size(), encrypt );
679
680 val = string;
681 dict.AddKey( "Contents", val );
682 }
683
684 rVariant = dict;
685 }
686
ReadArray(PdfVariant & rVariant,PdfEncrypt * pEncrypt)687 void PdfTokenizer::ReadArray( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
688 {
689 const char* pszToken;
690 EPdfTokenType eType;
691 PdfVariant var;
692 PdfArray array;
693
694 for( ;; )
695 {
696 bool gotToken = this->GetNextToken( pszToken, &eType );
697 if (!gotToken)
698 {
699 PODOFO_RAISE_ERROR_INFO(ePdfError_UnexpectedEOF, "Expected array item or ] delim.");
700 }
701 if( eType == ePdfTokenType_Delimiter && pszToken[0] == ']' )
702 break;
703
704 this->GetNextVariant( pszToken, eType, var, pEncrypt );
705 array.push_back( var );
706 }
707
708 rVariant = array;
709 }
710
ReadString(PdfVariant & rVariant,PdfEncrypt * pEncrypt)711 void PdfTokenizer::ReadString( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
712 {
713 int c;
714
715 bool bEscape = false;
716 bool bOctEscape = false;
717 int nOctCount = 0;
718 char cOctValue = 0;
719 int nBalanceCount = 0; // Balanced parathesis do not have to be escaped in strings
720
721 m_vecBuffer.clear();
722
723 while( (c = m_device.Device()->Look()) != EOF )
724 {
725 // end of stream reached
726 if( !bEscape )
727 {
728 // Handle raw characters
729 c = m_device.Device()->GetChar();
730 if( !nBalanceCount && c == ')' )
731 break;
732
733 if( c == '(' )
734 ++nBalanceCount;
735 else if( c == ')' )
736 --nBalanceCount;
737
738 bEscape = (c == '\\');
739 if( !bEscape )
740 m_vecBuffer.push_back( static_cast<char>(c) );
741 }
742 else
743 {
744 // Handle escape sequences
745 if( bOctEscape || s_octMap[c & 0xff] )
746 // The last character we have read was a '\\',
747 // so we check now for a digit to find stuff like \005
748 bOctEscape = true;
749
750 if( bOctEscape )
751 {
752 // Handle octal escape sequences
753 ++nOctCount;
754
755 if( !s_octMap[c & 0xff] )
756 {
757 // No octal character anymore,
758 // so the octal sequence must be ended
759 // and the character has to be treated as normal character!
760 m_vecBuffer.push_back ( cOctValue );
761 bEscape = false;
762 bOctEscape = false;
763 nOctCount = 0;
764 cOctValue = 0;
765 continue;
766 }
767
768 c = m_device.Device()->GetChar();
769 cOctValue <<= 3;
770 cOctValue |= ((c-'0') & 0x07);
771
772 if( nOctCount > 2 )
773 {
774 m_vecBuffer.push_back ( cOctValue );
775 bEscape = false;
776 bOctEscape = false;
777 nOctCount = 0;
778 cOctValue = 0;
779 }
780 }
781 else
782 {
783 // Handle plain escape sequences
784 const char & code = s_escMap[m_device.Device()->GetChar() & 0xff];
785 if( code )
786 m_vecBuffer.push_back( code );
787
788 bEscape = false;
789 }
790 }
791 }
792
793 // In case the string ends with a octal escape sequence
794 if( bOctEscape )
795 m_vecBuffer.push_back ( cOctValue );
796
797 if( m_vecBuffer.size() )
798 {
799 if( pEncrypt )
800 {
801 pdf_long outLen = m_vecBuffer.size() - pEncrypt->CalculateStreamOffset();
802 char * outBuffer = new char[outLen + 16 - (outLen % 16)];
803 pEncrypt->Decrypt( reinterpret_cast<unsigned char*>(&(m_vecBuffer[0])),
804 static_cast<unsigned int>(m_vecBuffer.size()),
805 reinterpret_cast<unsigned char*>(outBuffer), outLen);
806
807 rVariant = PdfString( outBuffer, outLen );
808
809 delete[] outBuffer;
810 }
811 else
812 {
813 rVariant = PdfString( &(m_vecBuffer[0]), m_vecBuffer.size() );
814 }
815 }
816 else
817 {
818 rVariant = PdfString("");
819 }
820 }
821
ReadHexString(PdfVariant & rVariant,PdfEncrypt * pEncrypt)822 void PdfTokenizer::ReadHexString( PdfVariant& rVariant, PdfEncrypt* pEncrypt )
823 {
824 ReadHexString( m_vecBuffer );
825
826 PdfString string;
827 string.SetHexData( m_vecBuffer.size() ? &(m_vecBuffer[0]) : "", m_vecBuffer.size(), pEncrypt );
828
829 rVariant = string;
830 }
831
ReadHexString(std::vector<char> & rVecBuffer)832 void PdfTokenizer::ReadHexString( std::vector<char>& rVecBuffer)
833 {
834 rVecBuffer.clear();
835 int c;
836
837 while( (c = m_device.Device()->GetChar()) != EOF )
838 {
839 // end of stream reached
840 if( c == '>' )
841 break;
842
843 // only a hex digits
844 if( isdigit( c ) ||
845 ( c >= 'A' && c <= 'F') ||
846 ( c >= 'a' && c <= 'f'))
847 rVecBuffer.push_back( c );
848 }
849
850 // pad to an even length if necessary
851 if(rVecBuffer.size() % 2 )
852 rVecBuffer.push_back( '0' );
853 }
854
ReadName(PdfVariant & rVariant)855 void PdfTokenizer::ReadName( PdfVariant& rVariant )
856 {
857 EPdfTokenType eType;
858 const char* pszToken;
859
860 // Do special checking for empty names
861 // as GetNextToken will ignore white spaces
862 // and we have to take care for stuff like:
863 // 10 0 obj / endobj
864 // which stupid but legal PDF
865 int c = m_device.Device()->Look();
866 if( IsWhitespace( c ) ) // Delimeters are handled correctly by GetNextToken
867 {
868 // We are an empty PdfName
869 rVariant = PdfName();
870 return;
871 }
872
873 bool gotToken = this->GetNextToken( pszToken, &eType );
874 if( !gotToken || eType != ePdfTokenType_Token )
875 {
876 // We got an empty name which is legal according to the PDF specification
877 // Some weird PDFs even use them.
878 rVariant = PdfName();
879
880 // Enqueue the token again
881 if( gotToken )
882 QuequeToken( pszToken, eType );
883 }
884 else
885 rVariant = PdfName::FromEscaped( pszToken );
886 }
887
QuequeToken(const char * pszToken,EPdfTokenType eType)888 void PdfTokenizer::QuequeToken( const char* pszToken, EPdfTokenType eType )
889 {
890 m_deqQueque.push_back( TTokenizerPair( std::string( pszToken ), eType ) );
891 }
892
893 };
894