1 #include "mupdf/fitz.h"
2 #include "mupdf/pdf.h"
3
4 #include <string.h>
5 #include <time.h>
6
7 #ifdef _WIN32
8 #define timegm _mkgmtime
9 #endif
10
/*
	ASCII decimal digit test that compares directly against '0' and '9'.
	The argument is parenthesized so that expressions using
	low-precedence operators (e.g. `x | 0`) expand correctly.
	The argument is evaluated twice, so it must not have side effects.
*/
#define isdigit(c) ((c) >= '0' && (c) <= '9')
12
13 fz_rect
pdf_to_rect(fz_context * ctx,pdf_obj * array)14 pdf_to_rect(fz_context *ctx, pdf_obj *array)
15 {
16 if (!pdf_is_array(ctx, array))
17 return fz_empty_rect;
18 else
19 {
20 float a = pdf_array_get_real(ctx, array, 0);
21 float b = pdf_array_get_real(ctx, array, 1);
22 float c = pdf_array_get_real(ctx, array, 2);
23 float d = pdf_array_get_real(ctx, array, 3);
24 fz_rect r;
25 r.x0 = fz_min(a, c);
26 r.y0 = fz_min(b, d);
27 r.x1 = fz_max(a, c);
28 r.y1 = fz_max(b, d);
29 return r;
30 }
31 }
32
33 fz_quad
pdf_to_quad(fz_context * ctx,pdf_obj * array,int offset)34 pdf_to_quad(fz_context *ctx, pdf_obj *array, int offset)
35 {
36 fz_quad q;
37 q.ul.x = pdf_array_get_real(ctx, array, offset+0);
38 q.ul.y = pdf_array_get_real(ctx, array, offset+1);
39 q.ur.x = pdf_array_get_real(ctx, array, offset+2);
40 q.ur.y = pdf_array_get_real(ctx, array, offset+3);
41 q.ll.x = pdf_array_get_real(ctx, array, offset+4);
42 q.ll.y = pdf_array_get_real(ctx, array, offset+5);
43 q.lr.x = pdf_array_get_real(ctx, array, offset+6);
44 q.lr.y = pdf_array_get_real(ctx, array, offset+7);
45 return q;
46 }
47
48 fz_matrix
pdf_to_matrix(fz_context * ctx,pdf_obj * array)49 pdf_to_matrix(fz_context *ctx, pdf_obj *array)
50 {
51 if (!pdf_is_array(ctx, array))
52 return fz_identity;
53 else
54 {
55 fz_matrix m;
56 m.a = pdf_array_get_real(ctx, array, 0);
57 m.b = pdf_array_get_real(ctx, array, 1);
58 m.c = pdf_array_get_real(ctx, array, 2);
59 m.d = pdf_array_get_real(ctx, array, 3);
60 m.e = pdf_array_get_real(ctx, array, 4);
61 m.f = pdf_array_get_real(ctx, array, 5);
62 return m;
63 }
64 }
65
66 int64_t
pdf_to_date(fz_context * ctx,pdf_obj * time)67 pdf_to_date(fz_context *ctx, pdf_obj *time)
68 {
69 const char *s = pdf_to_str_buf(ctx, time);
70 int tz_sign, tz_hour, tz_min, tz_adj;
71 struct tm tm;
72 time_t utc;
73
74 if (!s[0])
75 return -1;
76
77 memset(&tm, 0, sizeof tm);
78 tm.tm_mday = 1;
79
80 tz_sign = 1;
81 tz_hour = 0;
82 tz_min = 0;
83
84 if (s[0] == 'D' && s[1] == ':')
85 s += 2;
86
87 if (!isdigit(s[0]) || !isdigit(s[1]) || !isdigit(s[2]) || !isdigit(s[3]))
88 {
89 fz_warn(ctx, "invalid date format (missing year)");
90 return -1;
91 }
92 tm.tm_year = (s[0]-'0')*1000 + (s[1]-'0')*100 + (s[2]-'0')*10 + (s[3]-'0') - 1900;
93 s += 4;
94
95 if (tm.tm_year < 70)
96 {
97 fz_warn(ctx, "invalid date (year out of range)");
98 return -1;
99 }
100
101 if (isdigit(s[0]) && isdigit(s[1]))
102 {
103 tm.tm_mon = (s[0]-'0')*10 + (s[1]-'0') - 1; /* month is 0-11 in struct tm */
104 s += 2;
105 if (isdigit(s[0]) && isdigit(s[1]))
106 {
107 tm.tm_mday = (s[0]-'0')*10 + (s[1]-'0');
108 s += 2;
109 if (isdigit(s[0]) && isdigit(s[1]))
110 {
111 tm.tm_hour = (s[0]-'0')*10 + (s[1]-'0');
112 s += 2;
113 if (isdigit(s[0]) && isdigit(s[1]))
114 {
115 tm.tm_min = (s[0]-'0')*10 + (s[1]-'0');
116 s += 2;
117 if (isdigit(s[0]) && isdigit(s[1]))
118 {
119 tm.tm_sec = (s[0]-'0')*10 + (s[1]-'0');
120 s += 2;
121 }
122 }
123 }
124 }
125 }
126
127 if (tm.tm_sec > 60 || tm.tm_min > 59 || tm.tm_hour > 23 || tm.tm_mday > 31 || tm.tm_mon > 11)
128 {
129 fz_warn(ctx, "invalid date (a field is out of range)");
130 return -1;
131 }
132
133 if (s[0] == 'Z')
134 {
135 s += 1;
136 }
137 else if ((s[0] == '-' || s[0] == '+') && isdigit(s[1]) && isdigit(s[2]))
138 {
139 tz_sign = (s[0] == '-') ? -1 : 1;
140 tz_hour = (s[1]-'0')*10 + (s[2]-'0');
141 s += 3;
142 if (s[0] == '\'' && isdigit(s[1]) && isdigit(s[2]))
143 {
144 tz_min = (s[1]-'0')*10 + (s[2]-'0');
145 s += 3;
146 if (s[0] == '\'')
147 s += 1;
148 }
149 }
150
151 /* PDF is based on ISO/IEC 8824 which limits time zones from -15 to +16. */
152 if (tz_sign < 0 && (tz_hour > 15 || (tz_hour == 15 && tz_min > 0)))
153 {
154 fz_warn(ctx, "invalid date format (time zone out of range)");
155 return -1;
156 }
157 if (tz_sign > 0 && (tz_hour > 16 || (tz_hour == 16 && tz_min > 0)))
158 {
159 fz_warn(ctx, "invalid date format (time zone out of range)");
160 return -1;
161 }
162
163 if (s[0] != 0)
164 fz_warn(ctx, "invalid date format (garbage at end)");
165
166 utc = timegm(&tm);
167 if (utc == (time_t)-1)
168 {
169 fz_warn(ctx, "date overflow error");
170 return -1;
171 }
172
173 tz_adj = tz_sign * (tz_hour * 3600 + tz_min * 60);
174 return utc - tz_adj;
175 }
176
177 static int
rune_from_utf16be(int * out,const unsigned char * s,const unsigned char * end)178 rune_from_utf16be(int *out, const unsigned char *s, const unsigned char *end)
179 {
180 if (s + 2 <= end)
181 {
182 int a = s[0] << 8 | s[1];
183 if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
184 {
185 int b = s[2] << 8 | s[3];
186 *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
187 return 4;
188 }
189 *out = a;
190 return 2;
191 }
192 *out = FZ_REPLACEMENT_CHARACTER;
193 return 1;
194 }
195
196 static int
rune_from_utf16le(int * out,const unsigned char * s,const unsigned char * end)197 rune_from_utf16le(int *out, const unsigned char *s, const unsigned char *end)
198 {
199 if (s + 2 <= end)
200 {
201 int a = s[1] << 8 | s[0];
202 if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
203 {
204 int b = s[3] << 8 | s[2];
205 *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
206 return 4;
207 }
208 *out = a;
209 return 2;
210 }
211 *out = FZ_REPLACEMENT_CHARACTER;
212 return 1;
213 }
214
/*
	Detect a language escape sequence at byte offset i of the
	little-endian UTF-16 buffer s of n bytes.

	An escape starts and ends with the code unit 0x001B (stored as
	1B 00) with either 2 or 4 bytes in between. Returns the total
	length of the sequence (6 or 8 bytes), or 0 if none starts at i.
*/
static size_t
skip_language_code_utf16le(const unsigned char *s, size_t n, size_t i)
{
	/* Both forms need at least 6 bytes and an opening ESC. */
	if (i + 6 > n || s[i+0] != 27 || s[i+1] != 0)
		return 0;

	/* Short form: closing ESC right after two payload bytes. */
	if (s[i+4] == 27 && s[i+5] == 0)
		return 6;

	/* Long form: closing ESC after four payload bytes. */
	if (i + 8 <= n && s[i+6] == 27 && s[i+7] == 0)
		return 8;

	return 0;
}
225
/*
	Detect a language escape sequence at byte offset i of the
	big-endian UTF-16 buffer s of n bytes.

	An escape starts and ends with the code unit 0x001B (stored as
	00 1B) with either 2 or 4 bytes in between. Returns the total
	length of the sequence (6 or 8 bytes), or 0 if none starts at i.
*/
static size_t
skip_language_code_utf16be(const unsigned char *s, size_t n, size_t i)
{
	/* Both forms need at least 6 bytes and an opening ESC. */
	if (i + 6 > n || s[i+0] != 0 || s[i+1] != 27)
		return 0;

	/* Short form: closing ESC right after two payload bytes. */
	if (s[i+4] == 0 && s[i+5] == 27)
		return 6;

	/* Long form: closing ESC after four payload bytes. */
	if (i + 8 <= n && s[i+6] == 0 && s[i+7] == 27)
		return 8;

	return 0;
}
236
/*
	Detect a language escape sequence at byte offset i of the UTF-8
	buffer s of n bytes.

	An escape is ESC (27), two payload bytes, optionally two more
	payload bytes, and a closing ESC. Returns the total length of the
	sequence including both ESC bytes (4 or 6), or 0 if no complete
	sequence starts at i.

	Every index read is checked against n first, so s[n] is never
	touched, and the closing byte must actually be ESC.
*/
static size_t
skip_language_code_utf8(const unsigned char *s, size_t n, size_t i)
{
	/* The shortest form is 4 bytes and must open with ESC. */
	if (i + 4 > n || s[i] != 27)
		return 0;

	/* Short form: ESC x x ESC. */
	if (s[i+3] == 27)
		return 4;

	/* Long form: ESC x x y y ESC. */
	if (i + 6 <= n && s[i+5] == 27)
		return 6;

	return 0;
}
247
/*
	Check whether the bytes in [s, end) are structurally well-formed
	UTF-8: every lead byte is in 00-7F or C0-F4 and is followed by
	the number of 10xxxxxx continuation bytes it announces.

	Only the byte structure is examined; the decoded values are not
	range checked. Returns 1 if the whole range passes (including an
	empty range), 0 otherwise.
*/
static int
is_valid_utf8(const unsigned char *s, const unsigned char *end)
{
	while (s < end)
	{
		unsigned char lead = *s++;
		int trail;

		if (lead < 0x80)
			continue;

		/* A stray continuation byte or a lead byte above F4 is invalid. */
		if (lead < 0xC0 || lead >= 0xF5)
			return 0;

		trail = lead < 0xE0 ? 1 : lead < 0xF0 ? 2 : 3;
		for (; trail > 0; --trail)
		{
			if (s >= end || (*s & 0xC0) != 0x80)
				return 0;
			++s;
		}
	}
	return 1;
}
262
263 char *
pdf_new_utf8_from_pdf_string(fz_context * ctx,const char * ssrcptr,size_t srclen)264 pdf_new_utf8_from_pdf_string(fz_context *ctx, const char *ssrcptr, size_t srclen)
265 {
266 const unsigned char *srcptr = (const unsigned char*)ssrcptr;
267 char *dstptr, *dst;
268 size_t dstlen = 0;
269 int ucs;
270 size_t i, n;
271
272 /* UTF-16BE */
273 if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
274 {
275 i = 2;
276 while (i + 2 <= srclen)
277 {
278 n = skip_language_code_utf16be(srcptr, srclen, i);
279 if (n)
280 i += n;
281 else
282 {
283 i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
284 dstlen += fz_runelen(ucs);
285 }
286 }
287
288 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16be");
289
290 i = 2;
291 while (i + 2 <= srclen)
292 {
293 n = skip_language_code_utf16be(srcptr, srclen, i);
294 if (n)
295 i += n;
296 else
297 {
298 i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
299 dstptr += fz_runetochar(dstptr, ucs);
300 }
301 }
302 }
303
304 /* UTF-16LE */
305 else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
306 {
307 i = 2;
308 while (i + 2 <= srclen)
309 {
310 n = skip_language_code_utf16le(srcptr, srclen, i);
311 if (n)
312 i += n;
313 else
314 {
315 i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
316 dstlen += fz_runelen(ucs);
317 }
318 }
319
320 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16le");
321
322 i = 2;
323 while (i + 2 <= srclen)
324 {
325 n = skip_language_code_utf16le(srcptr, srclen, i);
326 if (n)
327 i += n;
328 else
329 {
330 i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
331 dstptr += fz_runetochar(dstptr, ucs);
332 }
333 }
334 }
335
336 /* UTF-8 */
337 else if (srclen >= 3 && srcptr[0] == 239 && srcptr[1] == 187 && srcptr[2] == 191)
338 {
339 i = 3;
340 while (i < srclen)
341 {
342 n = skip_language_code_utf8(srcptr, srclen, i);
343 if (n)
344 i += n;
345 else
346 {
347 i += 1;
348 dstlen += 1;
349 }
350 }
351
352 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf8");
353
354 i = 3;
355 while (i < srclen)
356 {
357 n = skip_language_code_utf8(srcptr, srclen, i);
358 if (n)
359 i += n;
360 else
361 *dstptr++ = srcptr[i++];
362 }
363 }
364
365 /* Detect UTF-8 strings that aren't marked with a BOM */
366 else if (is_valid_utf8(srcptr, srcptr + srclen))
367 {
368 dst = Memento_label(fz_malloc(ctx, srclen + 1), "utf8_from_guess");
369 memcpy(dst, srcptr, srclen);
370 dstptr = dst + srclen;
371 }
372
373 /* PDFDocEncoding */
374 else
375 {
376 for (i = 0; i < srclen; i++)
377 dstlen += fz_runelen(fz_unicode_from_pdf_doc_encoding[srcptr[i]]);
378
379 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_pdfdocenc");
380
381 for (i = 0; i < srclen; i++)
382 {
383 ucs = fz_unicode_from_pdf_doc_encoding[srcptr[i]];
384 dstptr += fz_runetochar(dstptr, ucs);
385 }
386 }
387
388 *dstptr = 0;
389 return dst;
390 }
391
392 char *
pdf_new_utf8_from_pdf_string_obj(fz_context * ctx,pdf_obj * src)393 pdf_new_utf8_from_pdf_string_obj(fz_context *ctx, pdf_obj *src)
394 {
395 const char *srcptr;
396 size_t srclen;
397 srcptr = pdf_to_string(ctx, src, &srclen);
398 return pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
399 }
400
401 char *
pdf_new_utf8_from_pdf_stream_obj(fz_context * ctx,pdf_obj * src)402 pdf_new_utf8_from_pdf_stream_obj(fz_context *ctx, pdf_obj *src)
403 {
404 fz_buffer *stmbuf;
405 char *srcptr;
406 size_t srclen;
407 char *dst = NULL;
408
409 stmbuf = pdf_load_stream(ctx, src);
410 srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr);
411 fz_try(ctx)
412 dst = pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
413 fz_always(ctx)
414 fz_drop_buffer(ctx, stmbuf);
415 fz_catch(ctx)
416 fz_rethrow(ctx);
417 return dst;
418 }
419
420 char *
pdf_load_stream_or_string_as_utf8(fz_context * ctx,pdf_obj * src)421 pdf_load_stream_or_string_as_utf8(fz_context *ctx, pdf_obj *src)
422 {
423 if (pdf_is_stream(ctx, src))
424 return pdf_new_utf8_from_pdf_stream_obj(ctx, src);
425 return pdf_new_utf8_from_pdf_string_obj(ctx, src);
426 }
427
428 static pdf_obj *
pdf_new_text_string_utf16be(fz_context * ctx,const char * s)429 pdf_new_text_string_utf16be(fz_context *ctx, const char *s)
430 {
431 const char *ss;
432 int c, i, n, a, b;
433 unsigned char *p;
434 pdf_obj *obj;
435
436 ss = s;
437 n = 0;
438 while (*ss)
439 {
440 ss += fz_chartorune(&c, ss);
441 n += (c >= 0x10000) ? 2 : 1;
442 }
443
444 p = fz_malloc(ctx, n * 2 + 2);
445 i = 0;
446 p[i++] = 254;
447 p[i++] = 255;
448 while (*s)
449 {
450 s += fz_chartorune(&c, s);
451 if (c >= 0x10000)
452 {
453 a = (((c - 0x10000) >> 10) & 0x3ff) + 0xD800;
454 p[i++] = (a>>8) & 0xff;
455 p[i++] = (a) & 0xff;
456 b = (((c - 0x10000)) & 0x3ff) + 0xDC00;
457 p[i++] = (b>>8) & 0xff;
458 p[i++] = (b) & 0xff;
459 }
460 else
461 {
462 p[i++] = (c>>8) & 0xff;
463 p[i++] = (c) & 0xff;
464 }
465 }
466
467 fz_try(ctx)
468 obj = pdf_new_string(ctx, (char*)p, i);
469 fz_always(ctx)
470 fz_free(ctx, p);
471 fz_catch(ctx)
472 fz_rethrow(ctx);
473 return obj;
474 }
475
476 pdf_obj *
pdf_new_text_string(fz_context * ctx,const char * s)477 pdf_new_text_string(fz_context *ctx, const char *s)
478 {
479 int i = 0;
480 while (s[i] != 0)
481 {
482 if (((unsigned char)s[i]) >= 128)
483 return pdf_new_text_string_utf16be(ctx, s);
484 ++i;
485 }
486 return pdf_new_string(ctx, s, i);
487 }
488
489 pdf_obj *
pdf_parse_array(fz_context * ctx,pdf_document * doc,fz_stream * file,pdf_lexbuf * buf)490 pdf_parse_array(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
491 {
492 pdf_obj *ary = NULL;
493 pdf_obj *obj = NULL;
494 int64_t a = 0, b = 0, n = 0;
495 pdf_token tok;
496 pdf_obj *op = NULL;
497
498 fz_var(obj);
499
500 ary = pdf_new_array(ctx, doc, 4);
501
502 fz_try(ctx)
503 {
504 while (1)
505 {
506 tok = pdf_lex(ctx, file, buf);
507
508 if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
509 {
510 if (n > 0)
511 pdf_array_push_int(ctx, ary, a);
512 if (n > 1)
513 pdf_array_push_int(ctx, ary, b);
514 n = 0;
515 }
516
517 if (tok == PDF_TOK_INT && n == 2)
518 {
519 pdf_array_push_int(ctx, ary, a);
520 a = b;
521 n --;
522 }
523
524 switch (tok)
525 {
526 case PDF_TOK_EOF:
527 fz_throw(ctx, FZ_ERROR_SYNTAX, "array not closed before end of file");
528
529 case PDF_TOK_CLOSE_ARRAY:
530 op = ary;
531 goto end;
532
533 case PDF_TOK_INT:
534 if (n == 0)
535 a = buf->i;
536 if (n == 1)
537 b = buf->i;
538 n ++;
539 break;
540
541 case PDF_TOK_R:
542 if (n != 2)
543 fz_throw(ctx, FZ_ERROR_SYNTAX, "cannot parse indirect reference in array");
544 pdf_array_push_drop(ctx, ary, pdf_new_indirect(ctx, doc, a, b));
545 n = 0;
546 break;
547
548 case PDF_TOK_OPEN_ARRAY:
549 obj = pdf_parse_array(ctx, doc, file, buf);
550 pdf_array_push_drop(ctx, ary, obj);
551 break;
552
553 case PDF_TOK_OPEN_DICT:
554 obj = pdf_parse_dict(ctx, doc, file, buf);
555 pdf_array_push_drop(ctx, ary, obj);
556 break;
557
558 case PDF_TOK_NAME:
559 pdf_array_push_name(ctx, ary, buf->scratch);
560 break;
561 case PDF_TOK_REAL:
562 pdf_array_push_real(ctx, ary, buf->f);
563 break;
564 case PDF_TOK_STRING:
565 pdf_array_push_string(ctx, ary, buf->scratch, buf->len);
566 break;
567 case PDF_TOK_TRUE:
568 pdf_array_push_bool(ctx, ary, 1);
569 break;
570 case PDF_TOK_FALSE:
571 pdf_array_push_bool(ctx, ary, 0);
572 break;
573 case PDF_TOK_NULL:
574 pdf_array_push(ctx, ary, PDF_NULL);
575 break;
576
577 default:
578 pdf_array_push(ctx, ary, PDF_NULL);
579 break;
580 }
581 }
582 end:
583 {}
584 }
585 fz_catch(ctx)
586 {
587 pdf_drop_obj(ctx, ary);
588 fz_rethrow(ctx);
589 }
590 return op;
591 }
592
593 pdf_obj *
pdf_parse_dict(fz_context * ctx,pdf_document * doc,fz_stream * file,pdf_lexbuf * buf)594 pdf_parse_dict(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
595 {
596 pdf_obj *dict;
597 pdf_obj *key = NULL;
598 pdf_obj *val = NULL;
599 pdf_token tok;
600 int64_t a, b;
601
602 dict = pdf_new_dict(ctx, doc, 8);
603
604 fz_var(key);
605 fz_var(val);
606
607 fz_try(ctx)
608 {
609 while (1)
610 {
611 tok = pdf_lex(ctx, file, buf);
612 skip:
613 if (tok == PDF_TOK_CLOSE_DICT)
614 break;
615
616 /* for BI .. ID .. EI in content streams */
617 if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
618 break;
619
620 if (tok != PDF_TOK_NAME)
621 fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid key in dict");
622
623 key = pdf_new_name(ctx, buf->scratch);
624
625 tok = pdf_lex(ctx, file, buf);
626
627 switch (tok)
628 {
629 case PDF_TOK_OPEN_ARRAY:
630 val = pdf_parse_array(ctx, doc, file, buf);
631 break;
632
633 case PDF_TOK_OPEN_DICT:
634 val = pdf_parse_dict(ctx, doc, file, buf);
635 break;
636
637 case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break;
638 case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break;
639 case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break;
640 case PDF_TOK_TRUE: val = PDF_TRUE; break;
641 case PDF_TOK_FALSE: val = PDF_FALSE; break;
642 case PDF_TOK_NULL: val = PDF_NULL; break;
643
644 case PDF_TOK_INT:
645 /* 64-bit to allow for numbers > INT_MAX and overflow */
646 a = buf->i;
647 tok = pdf_lex(ctx, file, buf);
648 if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
649 (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
650 {
651 val = pdf_new_int(ctx, a);
652 pdf_dict_put(ctx, dict, key, val);
653 pdf_drop_obj(ctx, val);
654 val = NULL;
655 pdf_drop_obj(ctx, key);
656 key = NULL;
657 goto skip;
658 }
659 if (tok == PDF_TOK_INT)
660 {
661 b = buf->i;
662 tok = pdf_lex(ctx, file, buf);
663 if (tok == PDF_TOK_R)
664 {
665 val = pdf_new_indirect(ctx, doc, a, b);
666 break;
667 }
668 }
669 fz_warn(ctx, "invalid indirect reference in dict");
670 val = PDF_NULL;
671 break;
672
673 default:
674 val = PDF_NULL;
675 break;
676 }
677
678 pdf_dict_put(ctx, dict, key, val);
679 pdf_drop_obj(ctx, val);
680 val = NULL;
681 pdf_drop_obj(ctx, key);
682 key = NULL;
683 }
684 }
685 fz_catch(ctx)
686 {
687 pdf_drop_obj(ctx, dict);
688 pdf_drop_obj(ctx, key);
689 pdf_drop_obj(ctx, val);
690 fz_rethrow(ctx);
691 }
692 return dict;
693 }
694
695 pdf_obj *
pdf_parse_stm_obj(fz_context * ctx,pdf_document * doc,fz_stream * file,pdf_lexbuf * buf)696 pdf_parse_stm_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
697 {
698 pdf_token tok;
699
700 tok = pdf_lex(ctx, file, buf);
701
702 switch (tok)
703 {
704 case PDF_TOK_OPEN_ARRAY:
705 return pdf_parse_array(ctx, doc, file, buf);
706 case PDF_TOK_OPEN_DICT:
707 return pdf_parse_dict(ctx, doc, file, buf);
708 case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch);
709 case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f);
710 case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len);
711 case PDF_TOK_TRUE: return PDF_TRUE;
712 case PDF_TOK_FALSE: return PDF_FALSE;
713 case PDF_TOK_NULL: return PDF_NULL;
714 case PDF_TOK_INT: return pdf_new_int(ctx, buf->i);
715 default: fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown token in object stream");
716 }
717 }
718
719 pdf_obj *
pdf_parse_ind_obj(fz_context * ctx,pdf_document * doc,fz_stream * file,pdf_lexbuf * buf,int * onum,int * ogen,int64_t * ostmofs,int * try_repair)720 pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc,
721 fz_stream *file, pdf_lexbuf *buf,
722 int *onum, int *ogen, int64_t *ostmofs, int *try_repair)
723 {
724 pdf_obj *obj = NULL;
725 int num = 0, gen = 0;
726 int64_t stm_ofs;
727 pdf_token tok;
728 int64_t a, b;
729 int read_next_token = 1;
730
731 fz_var(obj);
732
733 tok = pdf_lex(ctx, file, buf);
734 if (tok != PDF_TOK_INT)
735 {
736 if (try_repair)
737 *try_repair = 1;
738 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected object number");
739 }
740 num = buf->i;
741 if (num < 0 || num > PDF_MAX_OBJECT_NUMBER)
742 fz_throw(ctx, FZ_ERROR_SYNTAX, "object number out of range");
743
744 tok = pdf_lex(ctx, file, buf);
745 if (tok != PDF_TOK_INT)
746 {
747 if (try_repair)
748 *try_repair = 1;
749 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected generation number (%d ? obj)", num);
750 }
751 gen = buf->i;
752
753 tok = pdf_lex(ctx, file, buf);
754 if (tok != PDF_TOK_OBJ)
755 {
756 if (try_repair)
757 *try_repair = 1;
758 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'obj' keyword (%d %d ?)", num, gen);
759 }
760
761 tok = pdf_lex(ctx, file, buf);
762
763 switch (tok)
764 {
765 case PDF_TOK_OPEN_ARRAY:
766 obj = pdf_parse_array(ctx, doc, file, buf);
767 break;
768
769 case PDF_TOK_OPEN_DICT:
770 obj = pdf_parse_dict(ctx, doc, file, buf);
771 break;
772
773 case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break;
774 case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break;
775 case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break;
776 case PDF_TOK_TRUE: obj = PDF_TRUE; break;
777 case PDF_TOK_FALSE: obj = PDF_FALSE; break;
778 case PDF_TOK_NULL: obj = PDF_NULL; break;
779
780 case PDF_TOK_INT:
781 a = buf->i;
782 tok = pdf_lex(ctx, file, buf);
783
784 if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
785 {
786 obj = pdf_new_int(ctx, a);
787 read_next_token = 0;
788 break;
789 }
790 else if (tok == PDF_TOK_INT)
791 {
792 b = buf->i;
793 tok = pdf_lex(ctx, file, buf);
794 if (tok == PDF_TOK_R)
795 {
796 obj = pdf_new_indirect(ctx, doc, a, b);
797 break;
798 }
799 }
800 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'R' keyword (%d %d R)", num, gen);
801
802 case PDF_TOK_ENDOBJ:
803 obj = PDF_NULL;
804 read_next_token = 0;
805 break;
806
807 default:
808 fz_throw(ctx, FZ_ERROR_SYNTAX, "syntax error in object (%d %d R)", num, gen);
809 }
810
811 fz_try(ctx)
812 {
813 if (read_next_token)
814 tok = pdf_lex(ctx, file, buf);
815
816 if (tok == PDF_TOK_STREAM)
817 {
818 int c = fz_read_byte(ctx, file);
819 while (c == ' ')
820 c = fz_read_byte(ctx, file);
821 if (c == '\r')
822 {
823 c = fz_peek_byte(ctx, file);
824 if (c != '\n')
825 fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
826 else
827 fz_read_byte(ctx, file);
828 }
829 stm_ofs = fz_tell(ctx, file);
830 }
831 else if (tok == PDF_TOK_ENDOBJ)
832 {
833 stm_ofs = 0;
834 }
835 else
836 {
837 fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
838 stm_ofs = 0;
839 }
840 }
841 fz_catch(ctx)
842 {
843 pdf_drop_obj(ctx, obj);
844 fz_rethrow(ctx);
845 }
846
847 if (onum) *onum = num;
848 if (ogen) *ogen = gen;
849 if (ostmofs) *ostmofs = stm_ofs;
850
851 return obj;
852 }
853