1 //========================================================================
2 //
3 // Lexer.cc
4 //
5 // Copyright 1996-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 #include <aconf.h>
10
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
13 #endif
14
#include <limits.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <ctype.h>
#include "Lexer.h"
#include "Error.h"
21
22 //------------------------------------------------------------------------
23
// Character classification table, indexed by byte value (0x00-0xff).
//   1 = white space: NUL, TAB, LF, FF, CR and SPACE
//   2 = delimiter:   % ( ) / < > [ ] { }
//   0 = ordinary character
// Any non-zero entry ends a name or command token.  The table is only
// ever read, so it is declared const.
static const char specialChars[256] = {
  1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
  1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
};
44
45 //------------------------------------------------------------------------
46 // Lexer
47 //------------------------------------------------------------------------
48
Lexer(XRef * xref,Stream * str)49 Lexer::Lexer(XRef *xref, Stream *str) {
50 Object obj;
51
52 curStr.initStream(str);
53 streams = new Array(xref);
54 streams->add(curStr.copy(&obj));
55 strPtr = 0;
56 freeArray = gTrue;
57 curStr.streamReset();
58 }
59
Lexer(XRef * xref,Object * obj)60 Lexer::Lexer(XRef *xref, Object *obj) {
61 Object obj2;
62
63 if (obj->isStream()) {
64 streams = new Array(xref);
65 freeArray = gTrue;
66 streams->add(obj->copy(&obj2));
67 } else {
68 streams = obj->getArray();
69 freeArray = gFalse;
70 }
71 strPtr = 0;
72 if (streams->getLength() > 0) {
73 streams->get(strPtr, &curStr);
74 curStr.streamReset();
75 }
76 }
77
~Lexer()78 Lexer::~Lexer() {
79 if (!curStr.isNone()) {
80 curStr.streamClose();
81 curStr.free();
82 }
83 if (freeArray) {
84 delete streams;
85 }
86 }
87
getChar()88 int Lexer::getChar() {
89 int c;
90
91 c = EOF;
92 while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
93 curStr.streamClose();
94 curStr.free();
95 ++strPtr;
96 if (strPtr < streams->getLength()) {
97 streams->get(strPtr, &curStr);
98 curStr.streamReset();
99 }
100 }
101 return c;
102 }
103
lookChar()104 int Lexer::lookChar() {
105 if (curStr.isNone()) {
106 return EOF;
107 }
108 return curStr.streamLookChar();
109 }
110
getObj(Object * obj)111 Object *Lexer::getObj(Object *obj) {
112 char *p;
113 int c, c2;
114 GBool comment, neg, done;
115 int numParen;
116 int xi;
117 double xf, scale;
118 GString *s;
119 int n, m;
120
121 // skip whitespace and comments
122 comment = gFalse;
123 while (1) {
124 if ((c = getChar()) == EOF) {
125 return obj->initEOF();
126 }
127 if (comment) {
128 if (c == '\r' || c == '\n')
129 comment = gFalse;
130 } else if (c == '%') {
131 comment = gTrue;
132 } else if (specialChars[c] != 1) {
133 break;
134 }
135 }
136
137 // start reading token
138 switch (c) {
139
140 // number
141 case '0': case '1': case '2': case '3': case '4':
142 case '5': case '6': case '7': case '8': case '9':
143 case '-': case '.':
144 neg = gFalse;
145 xf = xi = 0;
146 if (c == '-') {
147 neg = gTrue;
148 } else if (c == '.') {
149 goto doReal;
150 } else {
151 xf = xi = c - '0';
152 }
153 while (1) {
154 c = lookChar();
155 if (isdigit(c)) {
156 getChar();
157 xi = xi * 10 + (c - '0');
158 xf = xf * 10 + (c - '0');
159 } else if (c == '.') {
160 getChar();
161 goto doReal;
162 } else {
163 break;
164 }
165 }
166 if (neg) {
167 xi = -xi;
168 }
169 obj->initInt(xi);
170 break;
171 doReal:
172 scale = 0.1;
173 while (1) {
174 c = lookChar();
175 if (c == '-') {
176 // ignore minus signs in the middle of numbers to match
177 // Adobe's behavior
178 error(errSyntaxWarning, getPos(), "Badly formatted number");
179 getChar();
180 continue;
181 }
182 if (!isdigit(c)) {
183 break;
184 }
185 getChar();
186 xf = xf + scale * (c - '0');
187 scale *= 0.1;
188 }
189 if (neg) {
190 xf = -xf;
191 }
192 obj->initReal(xf);
193 break;
194
195 // string
196 case '(':
197 p = tokBuf;
198 n = 0;
199 numParen = 1;
200 done = gFalse;
201 s = NULL;
202 do {
203 c2 = EOF;
204 switch (c = getChar()) {
205
206 case EOF:
207 #if 0
208 // This breaks some PDF files, e.g., ones from Photoshop.
209 case '\r':
210 case '\n':
211 #endif
212 error(errSyntaxError, getPos(), "Unterminated string");
213 done = gTrue;
214 break;
215
216 case '(':
217 ++numParen;
218 c2 = c;
219 break;
220
221 case ')':
222 if (--numParen == 0) {
223 done = gTrue;
224 } else {
225 c2 = c;
226 }
227 break;
228
229 case '\\':
230 switch (c = getChar()) {
231 case 'n':
232 c2 = '\n';
233 break;
234 case 'r':
235 c2 = '\r';
236 break;
237 case 't':
238 c2 = '\t';
239 break;
240 case 'b':
241 c2 = '\b';
242 break;
243 case 'f':
244 c2 = '\f';
245 break;
246 case '\\':
247 case '(':
248 case ')':
249 c2 = c;
250 break;
251 case '0': case '1': case '2': case '3':
252 case '4': case '5': case '6': case '7':
253 c2 = c - '0';
254 c = lookChar();
255 if (c >= '0' && c <= '7') {
256 getChar();
257 c2 = (c2 << 3) + (c - '0');
258 c = lookChar();
259 if (c >= '0' && c <= '7') {
260 getChar();
261 c2 = (c2 << 3) + (c - '0');
262 }
263 }
264 break;
265 case '\r':
266 c = lookChar();
267 if (c == '\n') {
268 getChar();
269 }
270 break;
271 case '\n':
272 break;
273 case EOF:
274 error(errSyntaxError, getPos(), "Unterminated string");
275 done = gTrue;
276 break;
277 default:
278 c2 = c;
279 break;
280 }
281 break;
282
283 default:
284 c2 = c;
285 break;
286 }
287
288 if (c2 != EOF) {
289 if (n == tokBufSize) {
290 if (!s)
291 s = new GString(tokBuf, tokBufSize);
292 else
293 s->append(tokBuf, tokBufSize);
294 p = tokBuf;
295 n = 0;
296 }
297 *p++ = (char)c2;
298 ++n;
299 }
300 } while (!done);
301 if (!s)
302 s = new GString(tokBuf, n);
303 else
304 s->append(tokBuf, n);
305 obj->initString(s);
306 break;
307
308 // name
309 case '/':
310 p = tokBuf;
311 n = 0;
312 s = NULL;
313 while ((c = lookChar()) != EOF && !specialChars[c]) {
314 getChar();
315 if (c == '#') {
316 c2 = lookChar();
317 if (c2 >= '0' && c2 <= '9') {
318 c = c2 - '0';
319 } else if (c2 >= 'A' && c2 <= 'F') {
320 c = c2 - 'A' + 10;
321 } else if (c2 >= 'a' && c2 <= 'f') {
322 c = c2 - 'a' + 10;
323 } else {
324 goto notEscChar;
325 }
326 getChar();
327 c <<= 4;
328 c2 = getChar();
329 if (c2 >= '0' && c2 <= '9') {
330 c += c2 - '0';
331 } else if (c2 >= 'A' && c2 <= 'F') {
332 c += c2 - 'A' + 10;
333 } else if (c2 >= 'a' && c2 <= 'f') {
334 c += c2 - 'a' + 10;
335 } else {
336 error(errSyntaxError, getPos(), "Illegal digit in hex char in name");
337 }
338 }
339 notEscChar:
340 // the PDF spec claims that names are limited to 127 chars, but
341 // Distiller 8 will produce longer names, and Acrobat 8 will
342 // accept longer names
343 ++n;
344 if (n < tokBufSize) {
345 *p++ = c;
346 } else if (n == tokBufSize) {
347 *p = c;
348 s = new GString(tokBuf, n);
349 } else {
350 s->append((char)c);
351 }
352 }
353 if (n < tokBufSize) {
354 *p = '\0';
355 obj->initName(tokBuf);
356 } else {
357 obj->initName(s->getCString());
358 delete s;
359 }
360 break;
361
362 // array punctuation
363 case '[':
364 case ']':
365 tokBuf[0] = c;
366 tokBuf[1] = '\0';
367 obj->initCmd(tokBuf);
368 break;
369
370 // hex string or dict punctuation
371 case '<':
372 c = lookChar();
373
374 // dict punctuation
375 if (c == '<') {
376 getChar();
377 tokBuf[0] = tokBuf[1] = '<';
378 tokBuf[2] = '\0';
379 obj->initCmd(tokBuf);
380
381 // hex string
382 } else {
383 p = tokBuf;
384 m = n = 0;
385 c2 = 0;
386 s = NULL;
387 while (1) {
388 c = getChar();
389 if (c == '>') {
390 break;
391 } else if (c == EOF) {
392 error(errSyntaxError, getPos(), "Unterminated hex string");
393 break;
394 } else if (specialChars[c] != 1) {
395 c2 = c2 << 4;
396 if (c >= '0' && c <= '9')
397 c2 += c - '0';
398 else if (c >= 'A' && c <= 'F')
399 c2 += c - 'A' + 10;
400 else if (c >= 'a' && c <= 'f')
401 c2 += c - 'a' + 10;
402 else
403 error(errSyntaxError, getPos(),
404 "Illegal character <{0:02x}> in hex string", c);
405 if (++m == 2) {
406 if (n == tokBufSize) {
407 if (!s)
408 s = new GString(tokBuf, tokBufSize);
409 else
410 s->append(tokBuf, tokBufSize);
411 p = tokBuf;
412 n = 0;
413 }
414 *p++ = (char)c2;
415 ++n;
416 c2 = 0;
417 m = 0;
418 }
419 }
420 }
421 if (!s)
422 s = new GString(tokBuf, n);
423 else
424 s->append(tokBuf, n);
425 if (m == 1)
426 s->append((char)(c2 << 4));
427 obj->initString(s);
428 }
429 break;
430
431 // dict punctuation
432 case '>':
433 c = lookChar();
434 if (c == '>') {
435 getChar();
436 tokBuf[0] = tokBuf[1] = '>';
437 tokBuf[2] = '\0';
438 obj->initCmd(tokBuf);
439 } else {
440 error(errSyntaxError, getPos(), "Illegal character '>'");
441 obj->initError();
442 }
443 break;
444
445 // error
446 case ')':
447 case '{':
448 case '}':
449 error(errSyntaxError, getPos(), "Illegal character '{0:c}'", c);
450 obj->initError();
451 break;
452
453 // command
454 default:
455 p = tokBuf;
456 *p++ = c;
457 n = 1;
458 while ((c = lookChar()) != EOF && !specialChars[c]) {
459 getChar();
460 if (++n == tokBufSize) {
461 error(errSyntaxError, getPos(), "Command token too long");
462 break;
463 }
464 *p++ = c;
465 }
466 *p = '\0';
467 if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
468 obj->initBool(gTrue);
469 } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
470 obj->initBool(gFalse);
471 } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
472 obj->initNull();
473 } else {
474 obj->initCmd(tokBuf);
475 }
476 break;
477 }
478
479 return obj;
480 }
481
skipToNextLine()482 void Lexer::skipToNextLine() {
483 int c;
484
485 while (1) {
486 c = getChar();
487 if (c == EOF || c == '\n') {
488 return;
489 }
490 if (c == '\r') {
491 if ((c = lookChar()) == '\n') {
492 getChar();
493 }
494 return;
495 }
496 }
497 }
498
isSpace(int c)499 GBool Lexer::isSpace(int c) {
500 return c >= 0 && c <= 0xff && specialChars[c] == 1;
501 }
502