1 //========================================================================
2 //
3 // Lexer.cc
4 //
5 // Copyright 1996-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2006-2010, 2012-2014 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
18 // Copyright (C) 2010 Carlos Garcia Campos <carlosgc@gnome.org>
19 // Copyright (C) 2012, 2013 Adrian Johnson <ajohnson@redneon.com>
20 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
21 //
22 // To see a description of the changes please see the Changelog file that
23 // came with your tarball or type make ChangeLog if you are building from git
24 //
25 //========================================================================
26 
27 #include <config.h>
28 
29 #ifdef USE_GCC_PRAGMAS
30 #pragma implementation
31 #endif
32 
33 #include <stdlib.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include <limits.h>
37 #include <ctype.h>
38 #include "Lexer.h"
39 #include "Error.h"
40 #include "XRef.h"
41 
42 //------------------------------------------------------------------------
43 
// Character classes, indexed by byte value:
//   0 - regular character
//   1 - white space (NUL, HT, LF, FF, CR, SP)
//   2 - delimiter:  % ( ) / < > [ ] { }
// Any non-zero class terminates a name or command token.
static const char specialChars[256] = {
  // 0x00 - 0x0f: NUL, HT, LF, FF, CR
  1, 0, 0, 0, 0, 0, 0, 0,
  0, 1, 1, 0, 1, 1, 0, 0,
  // 0x10 - 0x1f
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x2f: SP % ( ) /
  1, 0, 0, 0, 0, 2, 0, 0,
  2, 2, 0, 0, 0, 0, 0, 2,
  // 0x30 - 0x3f: < >
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 2, 0, 2, 0,
  // 0x40 - 0x4f
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0x50 - 0x5f: [ ]
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 2, 0, 2, 0, 0,
  // 0x60 - 0x6f
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0x70 - 0x7f: { }
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 2, 0, 2, 0, 0,
  // 0x80 - 0x8f
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0x90 - 0x9f
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0xa0 - 0xaf
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0xb0 - 0xbf
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0xc0 - 0xcf
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0xd0 - 0xdf
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0xe0 - 0xef
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0xf0 - 0xff
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0
};

// Largest accumulator values for which "value * 10 + digit" is
// guaranteed not to overflow for any decimal digit; above these the
// number parser has to do an exact check before appending a digit.
static const int IntegerSafeLimit = (INT_MAX - 9) / 10;
static const long long LongLongSafeLimit = (LLONG_MAX - 9) / 10;
67 
68 //------------------------------------------------------------------------
69 // Lexer
70 //------------------------------------------------------------------------
71 
Lexer(XRef * xrefA,Stream * str)72 Lexer::Lexer(XRef *xrefA, Stream *str) {
73   Object obj;
74 
75   lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
76   xref = xrefA;
77 
78   curStr.initStream(str);
79   streams = new Array(xref);
80   streams->add(curStr.copy(&obj));
81   strPtr = 0;
82   freeArray = gTrue;
83   curStr.streamReset();
84 }
85 
Lexer(XRef * xrefA,Object * obj)86 Lexer::Lexer(XRef *xrefA, Object *obj) {
87   Object obj2;
88 
89   lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
90   xref = xrefA;
91 
92   if (obj->isStream()) {
93     streams = new Array(xref);
94     freeArray = gTrue;
95     streams->add(obj->copy(&obj2));
96   } else {
97     streams = obj->getArray();
98     freeArray = gFalse;
99   }
100   strPtr = 0;
101   if (streams->getLength() > 0) {
102     streams->get(strPtr, &curStr);
103     curStr.streamReset();
104   }
105 }
106 
~Lexer()107 Lexer::~Lexer() {
108   if (!curStr.isNone()) {
109     curStr.streamClose();
110     curStr.free();
111   }
112   if (freeArray) {
113     delete streams;
114   }
115 }
116 
getChar(GBool comesFromLook)117 int Lexer::getChar(GBool comesFromLook) {
118   int c;
119 
120   if (LOOK_VALUE_NOT_CACHED != lookCharLastValueCached) {
121     c = lookCharLastValueCached;
122     lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
123     return c;
124   }
125 
126   c = EOF;
127   while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
128     if (comesFromLook == gTrue) {
129       return EOF;
130     } else {
131       curStr.streamClose();
132       curStr.free();
133       ++strPtr;
134       if (strPtr < streams->getLength()) {
135         streams->get(strPtr, &curStr);
136         curStr.streamReset();
137       }
138     }
139   }
140   return c;
141 }
142 
lookChar()143 int Lexer::lookChar() {
144 
145   if (LOOK_VALUE_NOT_CACHED != lookCharLastValueCached) {
146     return lookCharLastValueCached;
147   }
148   lookCharLastValueCached = getChar(gTrue);
149   if (lookCharLastValueCached == EOF) {
150     lookCharLastValueCached = LOOK_VALUE_NOT_CACHED;
151     return EOF;
152   } else {
153     return lookCharLastValueCached;
154   }
155 }
156 
// Read the next token from the input and store it in obj, which is also
// the return value.  Leading white space and '%' comments are skipped.
// Depending on the first character the result is an EOF object (end of
// input), an int / int64 / real number, a string (literal or hex), a
// name, a cmd (punctuation or keyword), a bool, null, or an error object.
//
// objNum is consulted only while reading a literal string: when it is
// greater than 0 and xref is non-NULL, every time the token buffer fills
// up the object number that xref reports for the current stream position
// is compared with it; on a mismatch the string is reported as
// unterminated and obj becomes an EOF object.
getObj(Object * obj,int objNum)157 Object *Lexer::getObj(Object *obj, int objNum) {
158   char *p;
159   int c, c2;
160   GBool comment, neg, done, overflownInteger, overflownLongLong;
161   int numParen;
162   int xi;
163   long long xll = 0;
164   double xf = 0, scale;
165   GooString *s;
166   int n, m;
167 
168   // skip white space (class 1 in specialChars) and comments ('%' up to CR or LF)
169   comment = gFalse;
170   while (1) {
171     if ((c = getChar()) == EOF) {
172       return obj->initEOF();
173     }
174     if (comment) {
175       if (c == '\r' || c == '\n')
176 	comment = gFalse;
177     } else if (c == '%') {
178       comment = gTrue;
179     } else if (specialChars[c] != 1) {
180       break;
181     }
182   }
183 
184   // start reading token; c is its first character
185   switch (c) {
186 
187   // number: digits are accumulated in xi, then in xll, then in xf, moving on each time the narrower type would overflow
188   case '0': case '1': case '2': case '3': case '4':
189   case '5': case '6': case '7': case '8': case '9':
190   case '+': case '-': case '.':
191     overflownInteger = gFalse;
192     overflownLongLong = gFalse;
193     neg = gFalse;
194     xi = 0;
195     if (c == '-') {
196       neg = gTrue;
197     } else if (c == '.') {
198       goto doReal; // no integer part: xi is 0
199     } else if (c != '+') {
200       xi = c - '0';
201     }
202     while (1) {
203       c = lookChar();
204       if (isdigit(c)) {
205 	getChar();
206 	if (unlikely(overflownLongLong)) {
207 	  xf = xf * 10.0 + (c - '0');
208 	} else if (unlikely (overflownInteger)) {
209 	  if (unlikely(xll > LongLongSafeLimit) && // cheap test first, exact test on the next line
210 	      (xll > (LLONG_MAX - (c - '0')) / 10.0)) {
211 	    overflownLongLong = gTrue;
212 	    xf = xll * 10.0 + (c - '0');
213 	  } else {
214 	    xll = xll * 10 + (c - '0');
215 	  }
216 	} else {
217 	  if (unlikely(xi > IntegerSafeLimit) && // cheap test first, exact test on the next line
218 	      (xi > (INT_MAX - (c - '0')) / 10.0)) {
219 	    overflownInteger = gTrue;
220 	    xll = xi * 10LL + (c - '0');
221 	  } else {
222 	    xi = xi * 10 + (c - '0');
223 	  }
224 	}
225       } else if (c == '.') {
226 	getChar();
227 	goto doReal;
228       } else {
229 	break;
230       }
231     }
232     if (neg) {
233       xi = -xi;
234       xll = -xll;
235       xf = -xf;
236     }
237     if (unlikely(overflownInteger)) {
238       if (overflownLongLong) {
239         obj->initReal(xf);
240       } else {
241         if (unlikely(xll == INT_MIN)) { // magnitude overflowed a positive int, but the negated value still fits in an int
242           obj->initInt(INT_MIN);
243         } else {
244           obj->initInt64(xll);
245         }
246       }
247     } else {
248       obj->initInt(xi);
249     }
250     break;
251   doReal: // fractional part; reached once the '.' has been consumed
252     if (likely(!overflownInteger)) {
253       xf = xi;
254     } else if (!overflownLongLong) {
255       xf = xll;
256     }
257     scale = 0.1; // weight of the next fractional digit
258     while (1) {
259       c = lookChar();
260       if (c == '-') {
261 	// ignore minus signs in the middle of numbers to match
262 	// Adobe's behavior
263 	error(errSyntaxWarning, getPos(), "Badly formatted number");
264 	getChar();
265 	continue;
266       }
267       if (!isdigit(c)) {
268 	break;
269       }
270       getChar();
271       xf = xf + scale * (c - '0');
272       scale *= 0.1;
273     }
274     if (neg) {
275       xf = -xf;
276     }
277     obj->initReal(xf);
278     break;
279 
280   // literal string: collected in tokBuf and flushed to s whenever tokBuf fills up
281   case '(':
282     p = tokBuf;
283     n = 0;
284     numParen = 1; // nesting depth of unescaped parentheses
285     done = gFalse;
286     s = NULL;
287     do {
288       c2 = EOF; // c2 is the character to append; EOF means append nothing
289       switch (c = getChar()) {
290 
291       case EOF:
292 #if 0
293       // This breaks some PDF files, e.g., ones from Photoshop.
294       case '\r':
295       case '\n':
296 #endif
297 	error(errSyntaxError, getPos(), "Unterminated string");
298 	done = gTrue;
299 	break;
300 
301       case '(':
302 	++numParen;
303 	c2 = c;
304 	break;
305 
306       case ')':
307 	if (--numParen == 0) {
308 	  done = gTrue;
309 	} else {
310 	  c2 = c;
311 	}
312 	break;
313 
314       case '\\':
315 	switch (c = getChar()) {
316 	case 'n':
317 	  c2 = '\n';
318 	  break;
319 	case 'r':
320 	  c2 = '\r';
321 	  break;
322 	case 't':
323 	  c2 = '\t';
324 	  break;
325 	case 'b':
326 	  c2 = '\b';
327 	  break;
328 	case 'f':
329 	  c2 = '\f';
330 	  break;
331 	case '\\':
332 	case '(':
333 	case ')':
334 	  c2 = c;
335 	  break;
336 	case '0': case '1': case '2': case '3':
337 	case '4': case '5': case '6': case '7':
338 	  c2 = c - '0'; // octal escape: one to three octal digits
339 	  c = lookChar();
340 	  if (c >= '0' && c <= '7') {
341 	    getChar();
342 	    c2 = (c2 << 3) + (c - '0');
343 	    c = lookChar();
344 	    if (c >= '0' && c <= '7') {
345 	      getChar();
346 	      c2 = (c2 << 3) + (c - '0');
347 	    }
348 	  }
349 	  break;
350 	case '\r': // backslash followed by CR or CR LF: nothing is appended
351 	  c = lookChar();
352 	  if (c == '\n') {
353 	    getChar();
354 	  }
355 	  break;
356 	case '\n': // backslash followed by LF: nothing is appended
357 	  break;
358 	case EOF:
359 	  error(errSyntaxError, getPos(), "Unterminated string");
360 	  done = gTrue;
361 	  break;
362 	default: // unknown escape: the backslash is dropped, the character is kept
363 	  c2 = c;
364 	  break;
365 	}
366 	break;
367 
368       default:
369 	c2 = c;
370 	break;
371       }
372 
373       if (c2 != EOF) {
374 	if (n == tokBufSize) { // buffer full: move its contents to s
375 	  if (!s)
376 	    s = new GooString(tokBuf, tokBufSize);
377 	  else
378 	    s->append(tokBuf, tokBufSize);
379 	  p = tokBuf;
380 	  n = 0;
381 
382 	  // the string keeps growing: check that the current position still maps to object objNum; if it does not, the document is presumably malformed and the string is abandoned
383 	  if (objNum > 0 && xref != NULL)
384 	  {
385 	    int newObjNum = xref->getNumEntry(curStr.streamGetPos());
386 	    if (newObjNum != objNum)
387 	    {
388 	      error(errSyntaxError, getPos(), "Unterminated string");
389 	      done = gTrue;
390 	      delete s;
391 	      n = -2; // becomes -1 after the ++n below; a negative n marks the string as discarded
392 	    }
393 	  }
394 	}
395 	*p++ = (char)c2;
396 	++n;
397       }
398     } while (!done);
399     if (n >= 0) {
400       if (!s)
401         s = new GooString(tokBuf, n);
402       else
403         s->append(tokBuf, n);
404       obj->initString(s);
405     } else {
406       obj->initEOF();
407     }
408     break;
409 
410   // name: runs until EOF or any character with a non-zero class in specialChars
411   case '/':
412     p = tokBuf;
413     n = 0;
414     s = NULL;
415     while ((c = lookChar()) != EOF && !specialChars[c]) {
416       getChar();
417       if (c == '#') { // '#' followed by two hex digits encodes one character
418 	c2 = lookChar();
419 	if (c2 >= '0' && c2 <= '9') {
420 	  c = c2 - '0';
421 	} else if (c2 >= 'A' && c2 <= 'F') {
422 	  c = c2 - 'A' + 10;
423 	} else if (c2 >= 'a' && c2 <= 'f') {
424 	  c = c2 - 'a' + 10;
425 	} else {
426 	  goto notEscChar; // not an escape: the '#' itself is kept
427 	}
428 	getChar();
429 	c <<= 4;
430 	c2 = getChar();
431 	if (c2 >= '0' && c2 <= '9') {
432 	  c += c2 - '0';
433 	} else if (c2 >= 'A' && c2 <= 'F') {
434 	  c += c2 - 'A' + 10;
435 	} else if (c2 >= 'a' && c2 <= 'f') {
436 	  c += c2 - 'a' + 10;
437 	} else {
438 	  error(errSyntaxError, getPos(), "Illegal digit in hex char in name");
439 	}
440       }
441      notEscChar:
442       // the PDF spec claims that names are limited to 127 chars, but
443       // Distiller 8 will produce longer names, and Acrobat 8 will
444       // accept longer names
445       ++n;
446       if (n < tokBufSize) {
447 	*p++ = c;
448       } else if (n == tokBufSize) { // tokBuf is now full: continue in a GooString
449 	error(errSyntaxError, getPos(), "Warning: name token is longer than what the specification says it can be");
450 	*p = c;
451 	s = new GooString(tokBuf, n);
452       } else {
453 	s->append((char)c);
454       }
455     }
456     if (n < tokBufSize) {
457       *p = '\0';
458       obj->initName(tokBuf);
459     } else {
460       obj->initName(s->getCString());
461       delete s;
462     }
463     break;
464 
465   // array punctuation
466   case '[':
467   case ']':
468     tokBuf[0] = c;
469     tokBuf[1] = '\0';
470     obj->initCmd(tokBuf);
471     break;
472 
473   // hex string or dict punctuation
474   case '<':
475     c = lookChar();
476 
477     // dict punctuation
478     if (c == '<') {
479       getChar();
480       tokBuf[0] = tokBuf[1] = '<';
481       tokBuf[2] = '\0';
482       obj->initCmd(tokBuf);
483 
484     // hex string
485     } else {
486       p = tokBuf;
487       m = n = 0; // m counts the hex digits of the byte being assembled in c2
488       c2 = 0;
489       s = NULL;
490       while (1) {
491 	c = getChar();
492 	if (c == '>') {
493 	  break;
494 	} else if (c == EOF) {
495 	  error(errSyntaxError, getPos(), "Unterminated hex string");
496 	  break;
497 	} else if (specialChars[c] != 1) { // white space inside a hex string is skipped
498 	  c2 = c2 << 4;
499 	  if (c >= '0' && c <= '9')
500 	    c2 += c - '0';
501 	  else if (c >= 'A' && c <= 'F')
502 	    c2 += c - 'A' + 10;
503 	  else if (c >= 'a' && c <= 'f')
504 	    c2 += c - 'a' + 10;
505 	  else
506 	    error(errSyntaxError, getPos(), "Illegal character <{0:02x}> in hex string", c);
507 	  if (++m == 2) {
508 	    if (n == tokBufSize) {
509 		if (!s)
510 		s = new GooString(tokBuf, tokBufSize);
511 	      else
512 		s->append(tokBuf, tokBufSize);
513 	      p = tokBuf;
514 	      n = 0;
515 	    }
516 	    *p++ = (char)c2;
517 	    ++n;
518 	    c2 = 0;
519 	    m = 0;
520 	  }
521 	}
522       }
523       if (!s)
524 	s = new GooString(tokBuf, n);
525       else
526 	s->append(tokBuf, n);
527       if (m == 1) // odd number of digits: the last one becomes the high nibble of a final byte
528 	s->append((char)(c2 << 4));
529       obj->initString(s);
530     }
531     break;
532 
533   // dict punctuation
534   case '>':
535     c = lookChar();
536     if (c == '>') {
537       getChar();
538       tokBuf[0] = tokBuf[1] = '>';
539       tokBuf[2] = '\0';
540       obj->initCmd(tokBuf);
541     } else {
542       error(errSyntaxError, getPos(), "Illegal character '>'");
543       obj->initError();
544     }
545     break;
546 
547   // error
548   case ')':
549   case '{':
550   case '}':
551     error(errSyntaxError, getPos(), "Illegal character '{0:c}'", c);
552     obj->initError();
553     break;
554 
555   // command: "true", "false" and "null" become bool / null objects, anything else a cmd
556   default:
557     p = tokBuf;
558     *p++ = c;
559     n = 1;
560     while ((c = lookChar()) != EOF && !specialChars[c]) {
561       getChar();
562       if (++n == tokBufSize) { // the token is cut off at tokBufSize - 1 characters
563 	error(errSyntaxError, getPos(), "Command token too long");
564 	break;
565       }
566       *p++ = c;
567     }
568     *p = '\0';
569     if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
570       obj->initBool(gTrue);
571     } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
572       obj->initBool(gFalse);
573     } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
574       obj->initNull();
575     } else {
576       obj->initCmd(tokBuf);
577     }
578     break;
579   }
580 
581   return obj;
582 }
583 
getObj(Object * obj,const char * cmdA,int objNum)584 Object *Lexer::getObj(Object *obj, const char *cmdA, int objNum) {
585   char *p;
586   int c;
587   GBool comment;
588   int n;
589 
590   // skip whitespace and comments
591   comment = gFalse;
592   const char *cmd1 = tokBuf;
593   *tokBuf = 0;
594   while (strcmp(cmdA, cmd1) && (objNum < 0 || (xref && xref->getNumEntry(getPos()) == objNum))) {
595     while (1) {
596       if ((c = getChar()) == EOF) {
597         return obj->initEOF();
598       }
599       if (comment) {
600         if (c == '\r' || c == '\n') {
601           comment = gFalse;
602         }
603       } else if (c == '%') {
604         comment = gTrue;
605       } else if (specialChars[c] != 1) {
606         break;
607       }
608     }
609     p = tokBuf;
610     *p++ = c;
611     n = 1;
612     while ((c = lookChar()) != EOF && specialChars[c] == 0) {
613       getChar();
614       if (++n == tokBufSize) {
615         break;
616       }
617       *p++ = c;
618     }
619     *p = '\0';
620   }
621   obj->initCmd(tokBuf);
622 
623   return obj;
624 }
625 
skipToNextLine()626 void Lexer::skipToNextLine() {
627   int c;
628 
629   while (1) {
630     c = getChar();
631     if (c == EOF || c == '\n') {
632       return;
633     }
634     if (c == '\r') {
635       if ((c = lookChar()) == '\n') {
636 	getChar();
637       }
638       return;
639     }
640   }
641 }
642 
isSpace(int c)643 GBool Lexer::isSpace(int c) {
644   return c >= 0 && c <= 0xff && specialChars[c] == 1;
645 }
646