1 #include "mupdf/fitz.h"
2 #include "mupdf/pdf.h"
3 
4 #include <string.h>
5 #include <time.h>
6 
7 #ifdef _WIN32
8 #define timegm _mkgmtime
9 #endif
10 
/*
	ASCII-only decimal digit test, used in place of the <ctype.h> version.
	The argument is parenthesized so that any expression can be passed;
	it is evaluated twice, so it must be free of side effects.
*/
#define isdigit(c) ((c) >= '0' && (c) <= '9')
12 
13 fz_rect
pdf_to_rect(fz_context * ctx,pdf_obj * array)14 pdf_to_rect(fz_context *ctx, pdf_obj *array)
15 {
16 	if (!pdf_is_array(ctx, array))
17 		return fz_empty_rect;
18 	else
19 	{
20 		float a = pdf_array_get_real(ctx, array, 0);
21 		float b = pdf_array_get_real(ctx, array, 1);
22 		float c = pdf_array_get_real(ctx, array, 2);
23 		float d = pdf_array_get_real(ctx, array, 3);
24 		fz_rect r;
25 		r.x0 = fz_min(a, c);
26 		r.y0 = fz_min(b, d);
27 		r.x1 = fz_max(a, c);
28 		r.y1 = fz_max(b, d);
29 		return r;
30 	}
31 }
32 
33 fz_quad
pdf_to_quad(fz_context * ctx,pdf_obj * array,int offset)34 pdf_to_quad(fz_context *ctx, pdf_obj *array, int offset)
35 {
36 	fz_quad q;
37 	q.ul.x = pdf_array_get_real(ctx, array, offset+0);
38 	q.ul.y = pdf_array_get_real(ctx, array, offset+1);
39 	q.ur.x = pdf_array_get_real(ctx, array, offset+2);
40 	q.ur.y = pdf_array_get_real(ctx, array, offset+3);
41 	q.ll.x = pdf_array_get_real(ctx, array, offset+4);
42 	q.ll.y = pdf_array_get_real(ctx, array, offset+5);
43 	q.lr.x = pdf_array_get_real(ctx, array, offset+6);
44 	q.lr.y = pdf_array_get_real(ctx, array, offset+7);
45 	return q;
46 }
47 
48 fz_matrix
pdf_to_matrix(fz_context * ctx,pdf_obj * array)49 pdf_to_matrix(fz_context *ctx, pdf_obj *array)
50 {
51 	if (!pdf_is_array(ctx, array))
52 		return fz_identity;
53 	else
54 	{
55 		fz_matrix m;
56 		m.a = pdf_array_get_real(ctx, array, 0);
57 		m.b = pdf_array_get_real(ctx, array, 1);
58 		m.c = pdf_array_get_real(ctx, array, 2);
59 		m.d = pdf_array_get_real(ctx, array, 3);
60 		m.e = pdf_array_get_real(ctx, array, 4);
61 		m.f = pdf_array_get_real(ctx, array, 5);
62 		return m;
63 	}
64 }
65 
66 int64_t
pdf_to_date(fz_context * ctx,pdf_obj * time)67 pdf_to_date(fz_context *ctx, pdf_obj *time)
68 {
69 	const char *s = pdf_to_str_buf(ctx, time);
70 	int tz_sign, tz_hour, tz_min, tz_adj;
71 	struct tm tm;
72 	time_t utc;
73 
74 	if (!s[0])
75 		return -1;
76 
77 	memset(&tm, 0, sizeof tm);
78 	tm.tm_mday = 1;
79 
80 	tz_sign = 1;
81 	tz_hour = 0;
82 	tz_min = 0;
83 
84 	if (s[0] == 'D' && s[1] == ':')
85 		s += 2;
86 
87 	if (!isdigit(s[0]) || !isdigit(s[1]) || !isdigit(s[2]) || !isdigit(s[3]))
88 	{
89 		fz_warn(ctx, "invalid date format (missing year)");
90 		return -1;
91 	}
92 	tm.tm_year = (s[0]-'0')*1000 + (s[1]-'0')*100 + (s[2]-'0')*10 + (s[3]-'0') - 1900;
93 	s += 4;
94 
95 	if (tm.tm_year < 70)
96 	{
97 		fz_warn(ctx, "invalid date (year out of range)");
98 		return -1;
99 	}
100 
101 	if (isdigit(s[0]) && isdigit(s[1]))
102 	{
103 		tm.tm_mon = (s[0]-'0')*10 + (s[1]-'0') - 1; /* month is 0-11 in struct tm */
104 		s += 2;
105 		if (isdigit(s[0]) && isdigit(s[1]))
106 		{
107 			tm.tm_mday = (s[0]-'0')*10 + (s[1]-'0');
108 			s += 2;
109 			if (isdigit(s[0]) && isdigit(s[1]))
110 			{
111 				tm.tm_hour = (s[0]-'0')*10 + (s[1]-'0');
112 				s += 2;
113 				if (isdigit(s[0]) && isdigit(s[1]))
114 				{
115 					tm.tm_min = (s[0]-'0')*10 + (s[1]-'0');
116 					s += 2;
117 					if (isdigit(s[0]) && isdigit(s[1]))
118 					{
119 						tm.tm_sec = (s[0]-'0')*10 + (s[1]-'0');
120 						s += 2;
121 					}
122 				}
123 			}
124 		}
125 	}
126 
127 	if (tm.tm_sec > 60 || tm.tm_min > 59 || tm.tm_hour > 23 || tm.tm_mday > 31 || tm.tm_mon > 11)
128 	{
129 		fz_warn(ctx, "invalid date (a field is out of range)");
130 		return -1;
131 	}
132 
133 	if (s[0] == 'Z')
134 	{
135 		s += 1;
136 	}
137 	else if ((s[0] == '-' || s[0] == '+') && isdigit(s[1]) && isdigit(s[2]))
138 	{
139 		tz_sign = (s[0] == '-') ? -1 : 1;
140 		tz_hour = (s[1]-'0')*10 + (s[2]-'0');
141 		s += 3;
142 		if (s[0] == '\'' && isdigit(s[1]) && isdigit(s[2]))
143 		{
144 			tz_min = (s[1]-'0')*10 + (s[2]-'0');
145 			s += 3;
146 			if (s[0] == '\'')
147 				s += 1;
148 		}
149 	}
150 
151 	/* PDF is based on ISO/IEC 8824 which limits time zones from -15 to +16. */
152 	if (tz_sign < 0 && (tz_hour > 15 || (tz_hour == 15 && tz_min > 0)))
153 	{
154 		fz_warn(ctx, "invalid date format (time zone out of range)");
155 		return -1;
156 	}
157 	if (tz_sign > 0 && (tz_hour > 16 || (tz_hour == 16 && tz_min > 0)))
158 	{
159 		fz_warn(ctx, "invalid date format (time zone out of range)");
160 		return -1;
161 	}
162 
163 	if (s[0] != 0)
164 		fz_warn(ctx, "invalid date format (garbage at end)");
165 
166 	utc = timegm(&tm);
167 	if (utc == (time_t)-1)
168 	{
169 		fz_warn(ctx, "date overflow error");
170 		return -1;
171 	}
172 
173 	tz_adj = tz_sign * (tz_hour * 3600 + tz_min * 60);
174 	return utc - tz_adj;
175 }
176 
177 static int
rune_from_utf16be(int * out,const unsigned char * s,const unsigned char * end)178 rune_from_utf16be(int *out, const unsigned char *s, const unsigned char *end)
179 {
180 	if (s + 2 <= end)
181 	{
182 		int a = s[0] << 8 | s[1];
183 		if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
184 		{
185 			int b = s[2] << 8 | s[3];
186 			*out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
187 			return 4;
188 		}
189 		*out = a;
190 		return 2;
191 	}
192 	*out = FZ_REPLACEMENT_CHARACTER;
193 	return 1;
194 }
195 
196 static int
rune_from_utf16le(int * out,const unsigned char * s,const unsigned char * end)197 rune_from_utf16le(int *out, const unsigned char *s, const unsigned char *end)
198 {
199 	if (s + 2 <= end)
200 	{
201 		int a = s[1] << 8 | s[0];
202 		if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
203 		{
204 			int b = s[3] << 8 | s[2];
205 			*out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
206 			return 4;
207 		}
208 		*out = a;
209 		return 2;
210 	}
211 	*out = FZ_REPLACEMENT_CHARACTER;
212 	return 1;
213 }
214 
/*
	Detect a language escape sequence at byte offset 'i' of the
	little-endian UTF-16 buffer 's' of 'n' bytes.

	A sequence starts with the unit 27 (ESC) and is closed by another
	ESC unit either 4 or 6 bytes later. Returns the total length of the
	sequence in bytes (6 or 8), or 0 if there is none at this offset.
*/
static size_t
skip_language_code_utf16le(const unsigned char *s, size_t n, size_t i)
{
	/* Even the short form needs six bytes. */
	if (i + 6 > n)
		return 0;

	/* Opening ESC, low byte first. */
	if (s[i] != 27 || s[i+1] != 0)
		return 0;

	if (s[i+4] == 27 && s[i+5] == 0)
		return 6;

	if (i + 8 <= n && s[i+6] == 27 && s[i+7] == 0)
		return 8;

	return 0;
}
225 
/*
	Detect a language escape sequence at byte offset 'i' of the
	big-endian UTF-16 buffer 's' of 'n' bytes.

	A sequence starts with the unit 27 (ESC) and is closed by another
	ESC unit either 4 or 6 bytes later. Returns the total length of the
	sequence in bytes (6 or 8), or 0 if there is none at this offset.
*/
static size_t
skip_language_code_utf16be(const unsigned char *s, size_t n, size_t i)
{
	/* Even the short form needs six bytes. */
	if (i + 6 > n)
		return 0;

	/* Opening ESC, high byte first. */
	if (s[i] != 0 || s[i+1] != 27)
		return 0;

	if (s[i+4] == 0 && s[i+5] == 27)
		return 6;

	if (i + 8 <= n && s[i+6] == 0 && s[i+7] == 27)
		return 8;

	return 0;
}
236 
/*
	Detect a language escape sequence at byte offset 'i' of the UTF-8
	buffer 's' of 'n' bytes.

	A sequence is ESC (27), a two byte language code, an optional two
	byte country code, and a closing ESC. Returns the length of the
	whole sequence including both ESC bytes (4 or 6), or 0 if there is
	none at this offset. No byte at or beyond s[n] is read.
*/
static size_t
skip_language_code_utf8(const unsigned char *s, size_t n, size_t i)
{
	size_t left;

	if (i >= n || s[i] != 27)
		return 0;

	left = n - i;

	/* ESC l l ESC */
	if (left >= 4 && s[i+3] == 27)
		return 4;

	/* ESC l l c c ESC */
	if (left >= 6 && s[i+5] == 27)
		return 6;

	return 0;
}
247 
/*
	Structural UTF-8 check over the bytes in [s, end).

	Each lead byte must be below 0x80 (no trail bytes), or in
	0xC0-0xDF, 0xE0-0xEF or 0xF0-0xF4 and followed by one, two or three
	bytes of the form 10xxxxxx. Returns 1 if the whole range satisfies
	this (including an empty range), 0 otherwise. Decoded values are
	not inspected.
*/
static int
is_valid_utf8(const unsigned char *s, const unsigned char *end)
{
	while (s < end)
	{
		unsigned char lead = *s++;
		int trail;

		if (lead < 0x80)
			trail = 0;
		else if (lead < 0xC0)
			return 0; /* stray continuation byte */
		else if (lead < 0xE0)
			trail = 1;
		else if (lead < 0xF0)
			trail = 2;
		else if (lead < 0xF5)
			trail = 3;
		else
			return 0; /* not a possible lead byte */

		for (; trail > 0; --trail)
		{
			if (s >= end)
				return 0; /* sequence cut short */
			if ((*s++ & 0xC0) != 0x80)
				return 0;
		}
	}
	return 1;
}
262 
263 char *
pdf_new_utf8_from_pdf_string(fz_context * ctx,const char * ssrcptr,size_t srclen)264 pdf_new_utf8_from_pdf_string(fz_context *ctx, const char *ssrcptr, size_t srclen)
265 {
266 	const unsigned char *srcptr = (const unsigned char*)ssrcptr;
267 	char *dstptr, *dst;
268 	size_t dstlen = 0;
269 	int ucs;
270 	size_t i, n;
271 
272 	/* UTF-16BE */
273 	if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
274 	{
275 		i = 2;
276 		while (i + 2 <= srclen)
277 		{
278 			n = skip_language_code_utf16be(srcptr, srclen, i);
279 			if (n)
280 				i += n;
281 			else
282 			{
283 				i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
284 				dstlen += fz_runelen(ucs);
285 			}
286 		}
287 
288 		dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16be");
289 
290 		i = 2;
291 		while (i + 2 <= srclen)
292 		{
293 			n = skip_language_code_utf16be(srcptr, srclen, i);
294 			if (n)
295 				i += n;
296 			else
297 			{
298 				i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
299 				dstptr += fz_runetochar(dstptr, ucs);
300 			}
301 		}
302 	}
303 
304 	/* UTF-16LE */
305 	else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
306 	{
307 		i = 2;
308 		while (i + 2 <= srclen)
309 		{
310 			n = skip_language_code_utf16le(srcptr, srclen, i);
311 			if (n)
312 				i += n;
313 			else
314 			{
315 				i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
316 				dstlen += fz_runelen(ucs);
317 			}
318 		}
319 
320 		dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16le");
321 
322 		i = 2;
323 		while (i + 2 <= srclen)
324 		{
325 			n = skip_language_code_utf16le(srcptr, srclen, i);
326 			if (n)
327 				i += n;
328 			else
329 			{
330 				i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
331 				dstptr += fz_runetochar(dstptr, ucs);
332 			}
333 		}
334 	}
335 
336 	/* UTF-8 */
337 	else if (srclen >= 3 && srcptr[0] == 239 && srcptr[1] == 187 && srcptr[2] == 191)
338 	{
339 		i = 3;
340 		while (i < srclen)
341 		{
342 			n = skip_language_code_utf8(srcptr, srclen, i);
343 			if (n)
344 				i += n;
345 			else
346 			{
347 				i += 1;
348 				dstlen += 1;
349 			}
350 		}
351 
352 		dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf8");
353 
354 		i = 3;
355 		while (i < srclen)
356 		{
357 			n = skip_language_code_utf8(srcptr, srclen, i);
358 			if (n)
359 				i += n;
360 			else
361 				*dstptr++ = srcptr[i++];
362 		}
363 	}
364 
365 	/* Detect UTF-8 strings that aren't marked with a BOM */
366 	else if (is_valid_utf8(srcptr, srcptr + srclen))
367 	{
368 		dst = Memento_label(fz_malloc(ctx, srclen + 1), "utf8_from_guess");
369 		memcpy(dst, srcptr, srclen);
370 		dstptr = dst + srclen;
371 	}
372 
373 	/* PDFDocEncoding */
374 	else
375 	{
376 		for (i = 0; i < srclen; i++)
377 			dstlen += fz_runelen(fz_unicode_from_pdf_doc_encoding[srcptr[i]]);
378 
379 		dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_pdfdocenc");
380 
381 		for (i = 0; i < srclen; i++)
382 		{
383 			ucs = fz_unicode_from_pdf_doc_encoding[srcptr[i]];
384 			dstptr += fz_runetochar(dstptr, ucs);
385 		}
386 	}
387 
388 	*dstptr = 0;
389 	return dst;
390 }
391 
392 char *
pdf_new_utf8_from_pdf_string_obj(fz_context * ctx,pdf_obj * src)393 pdf_new_utf8_from_pdf_string_obj(fz_context *ctx, pdf_obj *src)
394 {
395 	const char *srcptr;
396 	size_t srclen;
397 	srcptr = pdf_to_string(ctx, src, &srclen);
398 	return pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
399 }
400 
401 char *
pdf_new_utf8_from_pdf_stream_obj(fz_context * ctx,pdf_obj * src)402 pdf_new_utf8_from_pdf_stream_obj(fz_context *ctx, pdf_obj *src)
403 {
404 	fz_buffer *stmbuf;
405 	char *srcptr;
406 	size_t srclen;
407 	char *dst = NULL;
408 
409 	stmbuf = pdf_load_stream(ctx, src);
410 	srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr);
411 	fz_try(ctx)
412 		dst = pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
413 	fz_always(ctx)
414 		fz_drop_buffer(ctx, stmbuf);
415 	fz_catch(ctx)
416 		fz_rethrow(ctx);
417 	return dst;
418 }
419 
420 char *
pdf_load_stream_or_string_as_utf8(fz_context * ctx,pdf_obj * src)421 pdf_load_stream_or_string_as_utf8(fz_context *ctx, pdf_obj *src)
422 {
423 	if (pdf_is_stream(ctx, src))
424 		return pdf_new_utf8_from_pdf_stream_obj(ctx, src);
425 	return pdf_new_utf8_from_pdf_string_obj(ctx, src);
426 }
427 
428 static pdf_obj *
pdf_new_text_string_utf16be(fz_context * ctx,const char * s)429 pdf_new_text_string_utf16be(fz_context *ctx, const char *s)
430 {
431 	const char *ss;
432 	int c, i, n, a, b;
433 	unsigned char *p;
434 	pdf_obj *obj;
435 
436 	ss = s;
437 	n = 0;
438 	while (*ss)
439 	{
440 		ss += fz_chartorune(&c, ss);
441 		n += (c >= 0x10000) ? 2 : 1;
442 	}
443 
444 	p = fz_malloc(ctx, n * 2 + 2);
445 	i = 0;
446 	p[i++] = 254;
447 	p[i++] = 255;
448 	while (*s)
449 	{
450 		s += fz_chartorune(&c, s);
451 		if (c >= 0x10000)
452 		{
453 			a = (((c - 0x10000) >> 10) & 0x3ff) + 0xD800;
454 			p[i++] = (a>>8) & 0xff;
455 			p[i++] = (a) & 0xff;
456 			b = (((c - 0x10000)) & 0x3ff) + 0xDC00;
457 			p[i++] = (b>>8) & 0xff;
458 			p[i++] = (b) & 0xff;
459 		}
460 		else
461 		{
462 			p[i++] = (c>>8) & 0xff;
463 			p[i++] = (c) & 0xff;
464 		}
465 	}
466 
467 	fz_try(ctx)
468 		obj = pdf_new_string(ctx, (char*)p, i);
469 	fz_always(ctx)
470 		fz_free(ctx, p);
471 	fz_catch(ctx)
472 		fz_rethrow(ctx);
473 	return obj;
474 }
475 
476 pdf_obj *
pdf_new_text_string(fz_context * ctx,const char * s)477 pdf_new_text_string(fz_context *ctx, const char *s)
478 {
479 	int i = 0;
480 	while (s[i] != 0)
481 	{
482 		if (((unsigned char)s[i]) >= 128)
483 			return pdf_new_text_string_utf16be(ctx, s);
484 		++i;
485 	}
486 	return pdf_new_string(ctx, s, i);
487 }
488 
489 pdf_obj *
pdf_parse_array(fz_context * ctx,pdf_document * doc,fz_stream * file,pdf_lexbuf * buf)490 pdf_parse_array(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
491 {
492 	pdf_obj *ary = NULL;
493 	pdf_obj *obj = NULL;
494 	int64_t a = 0, b = 0, n = 0;
495 	pdf_token tok;
496 	pdf_obj *op = NULL;
497 
498 	fz_var(obj);
499 
500 	ary = pdf_new_array(ctx, doc, 4);
501 
502 	fz_try(ctx)
503 	{
504 		while (1)
505 		{
506 			tok = pdf_lex(ctx, file, buf);
507 
508 			if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
509 			{
510 				if (n > 0)
511 					pdf_array_push_int(ctx, ary, a);
512 				if (n > 1)
513 					pdf_array_push_int(ctx, ary, b);
514 				n = 0;
515 			}
516 
517 			if (tok == PDF_TOK_INT && n == 2)
518 			{
519 				pdf_array_push_int(ctx, ary, a);
520 				a = b;
521 				n --;
522 			}
523 
524 			switch (tok)
525 			{
526 			case PDF_TOK_EOF:
527 				fz_throw(ctx, FZ_ERROR_SYNTAX, "array not closed before end of file");
528 
529 			case PDF_TOK_CLOSE_ARRAY:
530 				op = ary;
531 				goto end;
532 
533 			case PDF_TOK_INT:
534 				if (n == 0)
535 					a = buf->i;
536 				if (n == 1)
537 					b = buf->i;
538 				n ++;
539 				break;
540 
541 			case PDF_TOK_R:
542 				if (n != 2)
543 					fz_throw(ctx, FZ_ERROR_SYNTAX, "cannot parse indirect reference in array");
544 				pdf_array_push_drop(ctx, ary, pdf_new_indirect(ctx, doc, a, b));
545 				n = 0;
546 				break;
547 
548 			case PDF_TOK_OPEN_ARRAY:
549 				obj = pdf_parse_array(ctx, doc, file, buf);
550 				pdf_array_push_drop(ctx, ary, obj);
551 				break;
552 
553 			case PDF_TOK_OPEN_DICT:
554 				obj = pdf_parse_dict(ctx, doc, file, buf);
555 				pdf_array_push_drop(ctx, ary, obj);
556 				break;
557 
558 			case PDF_TOK_NAME:
559 				pdf_array_push_name(ctx, ary, buf->scratch);
560 				break;
561 			case PDF_TOK_REAL:
562 				pdf_array_push_real(ctx, ary, buf->f);
563 				break;
564 			case PDF_TOK_STRING:
565 				pdf_array_push_string(ctx, ary, buf->scratch, buf->len);
566 				break;
567 			case PDF_TOK_TRUE:
568 				pdf_array_push_bool(ctx, ary, 1);
569 				break;
570 			case PDF_TOK_FALSE:
571 				pdf_array_push_bool(ctx, ary, 0);
572 				break;
573 			case PDF_TOK_NULL:
574 				pdf_array_push(ctx, ary, PDF_NULL);
575 				break;
576 
577 			default:
578 				pdf_array_push(ctx, ary, PDF_NULL);
579 				break;
580 			}
581 		}
582 end:
583 		{}
584 	}
585 	fz_catch(ctx)
586 	{
587 		pdf_drop_obj(ctx, ary);
588 		fz_rethrow(ctx);
589 	}
590 	return op;
591 }
592 
593 pdf_obj *
pdf_parse_dict(fz_context * ctx,pdf_document * doc,fz_stream * file,pdf_lexbuf * buf)594 pdf_parse_dict(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
595 {
596 	pdf_obj *dict;
597 	pdf_obj *key = NULL;
598 	pdf_obj *val = NULL;
599 	pdf_token tok;
600 	int64_t a, b;
601 
602 	dict = pdf_new_dict(ctx, doc, 8);
603 
604 	fz_var(key);
605 	fz_var(val);
606 
607 	fz_try(ctx)
608 	{
609 		while (1)
610 		{
611 			tok = pdf_lex(ctx, file, buf);
612 	skip:
613 			if (tok == PDF_TOK_CLOSE_DICT)
614 				break;
615 
616 			/* for BI .. ID .. EI in content streams */
617 			if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
618 				break;
619 
620 			if (tok != PDF_TOK_NAME)
621 				fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid key in dict");
622 
623 			key = pdf_new_name(ctx, buf->scratch);
624 
625 			tok = pdf_lex(ctx, file, buf);
626 
627 			switch (tok)
628 			{
629 			case PDF_TOK_OPEN_ARRAY:
630 				val = pdf_parse_array(ctx, doc, file, buf);
631 				break;
632 
633 			case PDF_TOK_OPEN_DICT:
634 				val = pdf_parse_dict(ctx, doc, file, buf);
635 				break;
636 
637 			case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break;
638 			case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break;
639 			case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break;
640 			case PDF_TOK_TRUE: val = PDF_TRUE; break;
641 			case PDF_TOK_FALSE: val = PDF_FALSE; break;
642 			case PDF_TOK_NULL: val = PDF_NULL; break;
643 
644 			case PDF_TOK_INT:
645 				/* 64-bit to allow for numbers > INT_MAX and overflow */
646 				a = buf->i;
647 				tok = pdf_lex(ctx, file, buf);
648 				if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
649 					(tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
650 				{
651 					val = pdf_new_int(ctx, a);
652 					pdf_dict_put(ctx, dict, key, val);
653 					pdf_drop_obj(ctx, val);
654 					val = NULL;
655 					pdf_drop_obj(ctx, key);
656 					key = NULL;
657 					goto skip;
658 				}
659 				if (tok == PDF_TOK_INT)
660 				{
661 					b = buf->i;
662 					tok = pdf_lex(ctx, file, buf);
663 					if (tok == PDF_TOK_R)
664 					{
665 						val = pdf_new_indirect(ctx, doc, a, b);
666 						break;
667 					}
668 				}
669 				fz_warn(ctx, "invalid indirect reference in dict");
670 				val = PDF_NULL;
671 				break;
672 
673 			default:
674 				val = PDF_NULL;
675 				break;
676 			}
677 
678 			pdf_dict_put(ctx, dict, key, val);
679 			pdf_drop_obj(ctx, val);
680 			val = NULL;
681 			pdf_drop_obj(ctx, key);
682 			key = NULL;
683 		}
684 	}
685 	fz_catch(ctx)
686 	{
687 		pdf_drop_obj(ctx, dict);
688 		pdf_drop_obj(ctx, key);
689 		pdf_drop_obj(ctx, val);
690 		fz_rethrow(ctx);
691 	}
692 	return dict;
693 }
694 
695 pdf_obj *
pdf_parse_stm_obj(fz_context * ctx,pdf_document * doc,fz_stream * file,pdf_lexbuf * buf)696 pdf_parse_stm_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
697 {
698 	pdf_token tok;
699 
700 	tok = pdf_lex(ctx, file, buf);
701 
702 	switch (tok)
703 	{
704 	case PDF_TOK_OPEN_ARRAY:
705 		return pdf_parse_array(ctx, doc, file, buf);
706 	case PDF_TOK_OPEN_DICT:
707 		return pdf_parse_dict(ctx, doc, file, buf);
708 	case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch);
709 	case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f);
710 	case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len);
711 	case PDF_TOK_TRUE: return PDF_TRUE;
712 	case PDF_TOK_FALSE: return PDF_FALSE;
713 	case PDF_TOK_NULL: return PDF_NULL;
714 	case PDF_TOK_INT: return pdf_new_int(ctx, buf->i);
715 	default: fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown token in object stream");
716 	}
717 }
718 
719 pdf_obj *
pdf_parse_ind_obj(fz_context * ctx,pdf_document * doc,fz_stream * file,pdf_lexbuf * buf,int * onum,int * ogen,int64_t * ostmofs,int * try_repair)720 pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc,
721 	fz_stream *file, pdf_lexbuf *buf,
722 	int *onum, int *ogen, int64_t *ostmofs, int *try_repair)
723 {
724 	pdf_obj *obj = NULL;
725 	int num = 0, gen = 0;
726 	int64_t stm_ofs;
727 	pdf_token tok;
728 	int64_t a, b;
729 	int read_next_token = 1;
730 
731 	fz_var(obj);
732 
733 	tok = pdf_lex(ctx, file, buf);
734 	if (tok != PDF_TOK_INT)
735 	{
736 		if (try_repair)
737 			*try_repair = 1;
738 		fz_throw(ctx, FZ_ERROR_SYNTAX, "expected object number");
739 	}
740 	num = buf->i;
741 	if (num < 0 || num > PDF_MAX_OBJECT_NUMBER)
742 		fz_throw(ctx, FZ_ERROR_SYNTAX, "object number out of range");
743 
744 	tok = pdf_lex(ctx, file, buf);
745 	if (tok != PDF_TOK_INT)
746 	{
747 		if (try_repair)
748 			*try_repair = 1;
749 		fz_throw(ctx, FZ_ERROR_SYNTAX, "expected generation number (%d ? obj)", num);
750 	}
751 	gen = buf->i;
752 
753 	tok = pdf_lex(ctx, file, buf);
754 	if (tok != PDF_TOK_OBJ)
755 	{
756 		if (try_repair)
757 			*try_repair = 1;
758 		fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'obj' keyword (%d %d ?)", num, gen);
759 	}
760 
761 	tok = pdf_lex(ctx, file, buf);
762 
763 	switch (tok)
764 	{
765 	case PDF_TOK_OPEN_ARRAY:
766 		obj = pdf_parse_array(ctx, doc, file, buf);
767 		break;
768 
769 	case PDF_TOK_OPEN_DICT:
770 		obj = pdf_parse_dict(ctx, doc, file, buf);
771 		break;
772 
773 	case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break;
774 	case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break;
775 	case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break;
776 	case PDF_TOK_TRUE: obj = PDF_TRUE; break;
777 	case PDF_TOK_FALSE: obj = PDF_FALSE; break;
778 	case PDF_TOK_NULL: obj = PDF_NULL; break;
779 
780 	case PDF_TOK_INT:
781 		a = buf->i;
782 		tok = pdf_lex(ctx, file, buf);
783 
784 		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
785 		{
786 			obj = pdf_new_int(ctx, a);
787 			read_next_token = 0;
788 			break;
789 		}
790 		else if (tok == PDF_TOK_INT)
791 		{
792 			b = buf->i;
793 			tok = pdf_lex(ctx, file, buf);
794 			if (tok == PDF_TOK_R)
795 			{
796 				obj = pdf_new_indirect(ctx, doc, a, b);
797 				break;
798 			}
799 		}
800 		fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'R' keyword (%d %d R)", num, gen);
801 
802 	case PDF_TOK_ENDOBJ:
803 		obj = PDF_NULL;
804 		read_next_token = 0;
805 		break;
806 
807 	default:
808 		fz_throw(ctx, FZ_ERROR_SYNTAX, "syntax error in object (%d %d R)", num, gen);
809 	}
810 
811 	fz_try(ctx)
812 	{
813 		if (read_next_token)
814 			tok = pdf_lex(ctx, file, buf);
815 
816 		if (tok == PDF_TOK_STREAM)
817 		{
818 			int c = fz_read_byte(ctx, file);
819 			while (c == ' ')
820 				c = fz_read_byte(ctx, file);
821 			if (c == '\r')
822 			{
823 				c = fz_peek_byte(ctx, file);
824 				if (c != '\n')
825 					fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
826 				else
827 					fz_read_byte(ctx, file);
828 			}
829 			stm_ofs = fz_tell(ctx, file);
830 		}
831 		else if (tok == PDF_TOK_ENDOBJ)
832 		{
833 			stm_ofs = 0;
834 		}
835 		else
836 		{
837 			fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
838 			stm_ofs = 0;
839 		}
840 	}
841 	fz_catch(ctx)
842 	{
843 		pdf_drop_obj(ctx, obj);
844 		fz_rethrow(ctx);
845 	}
846 
847 	if (onum) *onum = num;
848 	if (ogen) *ogen = gen;
849 	if (ostmofs) *ostmofs = stm_ofs;
850 
851 	return obj;
852 }
853