1 #include "mupdf/fitz.h"
2 #include "mupdf/pdf.h"
3 
4 #include <assert.h>
5 #include <limits.h>
6 #include <string.h>
7 
/* Debug tracing switch. The #undef below forces tracing off; replace it
 * with a #define to route DEBUGMESS() output through fz_warn. */
#undef DEBUG_PROGESSIVE_ADVANCE

#ifdef DEBUG_PROGESSIVE_ADVANCE
/* A must be a complete, parenthesised argument list, e.g.
 * DEBUGMESS((ctx, "fmt", arg)), because it is placed directly after
 * the fz_warn token. */
#define DEBUGMESS(A) do { fz_warn A; } while (0)
#else
/* Tracing disabled: expands to an empty statement. */
#define DEBUGMESS(A) do { } while (0)
#endif
15 
/* Locale-independent ASCII digit test ('0'..'9').
 * Each use of the parameter is parenthesised so that arguments containing
 * lower-precedence operators (for example a conditional expression) are
 * classified correctly. The argument is still evaluated twice, so it must
 * not have side effects. */
#define isdigit(c) ((c) >= '0' && (c) <= '9')
17 
/* Return 1 if ch is one of the six white-space bytes recognised here
 * (NUL, TAB, LF, FF, CR, SPACE), otherwise 0. */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case '\000':
	case '\011':
	case '\012':
	case '\014':
	case '\015':
	case '\040':
		return 1;
	default:
		return 0;
	}
}
24 
25 /*
26  * xref tables
27  */
28 
/*
 * Release an array of xref sections and everything hanging off it:
 * every subsection (its entry table plus the cached objects and stream
 * buffers held by the entries), both trailers, and the list of unsaved
 * signatures. The array itself is freed last.
 */
static void pdf_drop_xref_sections_imp(fz_context *ctx, pdf_document *doc, pdf_xref *xref_sections, int num_xref_sections)
{
	int s;

	for (s = 0; s < num_xref_sections; s++)
	{
		pdf_xref *section = &xref_sections[s];
		pdf_xref_subsec *sub, *following;
		pdf_unsaved_sig *sig;

		for (sub = section->subsec; sub != NULL; sub = following)
		{
			int k;

			/* Remember the successor before this node is freed. */
			following = sub->next;
			for (k = 0; k < sub->len; k++)
			{
				pdf_xref_entry *slot = &sub->table[k];

				/* The stream buffer is only released for
				 * entries that hold a loaded object. */
				if (slot->obj == NULL)
					continue;
				pdf_drop_obj(ctx, slot->obj);
				fz_drop_buffer(ctx, slot->stm_buf);
			}
			fz_free(ctx, sub->table);
			fz_free(ctx, sub);
		}

		pdf_drop_obj(ctx, section->pre_repair_trailer);
		pdf_drop_obj(ctx, section->trailer);

		/* Pop and destroy the unsaved signatures one at a time. */
		while (section->unsaved_sigs != NULL)
		{
			sig = section->unsaved_sigs;
			section->unsaved_sigs = sig->next;
			pdf_drop_obj(ctx, sig->field);
			pdf_drop_signer(ctx, sig->signer);
			fz_free(ctx, sig);
		}
	}

	fz_free(ctx, xref_sections);
}
70 
/*
 * Drop both the saved and the live xref section arrays of the document,
 * then reset all of the associated pointers and counters.
 */
static void pdf_drop_xref_sections(fz_context *ctx, pdf_document *doc)
{
	/* Saved copy first, then the live sections. */
	pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections);
	pdf_drop_xref_sections_imp(ctx, doc, doc->xref_sections, doc->num_xref_sections);

	/* Nothing is owned any more: clear pointers, then counts. */
	doc->saved_xref_sections = NULL;
	doc->xref_sections = NULL;

	doc->saved_num_xref_sections = 0;
	doc->num_xref_sections = 0;
	doc->num_incremental_sections = 0;
}
82 
83 static void
extend_xref_index(fz_context * ctx,pdf_document * doc,int newlen)84 extend_xref_index(fz_context *ctx, pdf_document *doc, int newlen)
85 {
86 	int i;
87 
88 	doc->xref_index = fz_realloc_array(ctx, doc->xref_index, newlen, int);
89 	for (i = doc->max_xref_len; i < newlen; i++)
90 	{
91 		doc->xref_index[i] = 0;
92 	}
93 	doc->max_xref_len = newlen;
94 }
95 
96 /* This is only ever called when we already have an incremental
97  * xref. This means there will only be 1 subsec, and it will be
98  * a complete subsec. */
pdf_resize_xref(fz_context * ctx,pdf_document * doc,int newlen)99 static void pdf_resize_xref(fz_context *ctx, pdf_document *doc, int newlen)
100 {
101 	int i;
102 	pdf_xref *xref = &doc->xref_sections[doc->xref_base];
103 	pdf_xref_subsec *sub;
104 
105 	assert(xref != NULL);
106 	sub = xref->subsec;
107 	assert(sub->next == NULL && sub->start == 0 && sub->len == xref->num_objects);
108 	assert(newlen > xref->num_objects);
109 
110 	sub->table = fz_realloc_array(ctx, sub->table, newlen, pdf_xref_entry);
111 	for (i = xref->num_objects; i < newlen; i++)
112 	{
113 		sub->table[i].type = 0;
114 		sub->table[i].ofs = 0;
115 		sub->table[i].gen = 0;
116 		sub->table[i].num = 0;
117 		sub->table[i].stm_ofs = 0;
118 		sub->table[i].stm_buf = NULL;
119 		sub->table[i].obj = NULL;
120 	}
121 	xref->num_objects = newlen;
122 	sub->len = newlen;
123 	if (doc->max_xref_len < newlen)
124 		extend_xref_index(ctx, doc, newlen);
125 }
126 
/*
 * Append one more, completely empty, xref section to the end of
 * doc->xref_sections and bump doc->num_xref_sections.
 */
static void pdf_populate_next_xref_level(fz_context *ctx, pdf_document *doc)
{
	int slot = doc->num_xref_sections;
	pdf_xref *fresh;

	doc->xref_sections = fz_realloc_array(ctx, doc->xref_sections, slot + 1, pdf_xref);
	doc->num_xref_sections = slot + 1;

	/* The new storage is uninitialised, so set every field. */
	fresh = &doc->xref_sections[slot];
	fresh->num_objects = 0;
	fresh->subsec = NULL;
	fresh->trailer = NULL;
	fresh->pre_repair_trailer = NULL;
	fresh->unsaved_sigs = NULL;
	fresh->unsaved_sigs_end = NULL;
}
141 
pdf_trailer(fz_context * ctx,pdf_document * doc)142 pdf_obj *pdf_trailer(fz_context *ctx, pdf_document *doc)
143 {
144 	/* Return the document's trailer (of the appopriate vintage) */
145 	pdf_xref *xref = &doc->xref_sections[doc->xref_base];
146 
147 	return xref ? xref->trailer : NULL;
148 }
149 
/*
 * Install 'trailer' (a new reference is kept) as the trailer of the xref
 * section currently being populated, i.e. the last one. If that section
 * already had a trailer, it is moved into pre_repair_trailer, replacing
 * (and dropping) whatever was stored there.
 */
void pdf_set_populating_xref_trailer(fz_context *ctx, pdf_document *doc, pdf_obj *trailer)
{
	pdf_xref *target = &doc->xref_sections[doc->num_xref_sections - 1];
	pdf_obj *previous = target->trailer;

	if (previous != NULL)
	{
		pdf_drop_obj(ctx, target->pre_repair_trailer);
		target->pre_repair_trailer = previous;
	}

	target->trailer = pdf_keep_obj(ctx, trailer);
}
161 
/* Return doc->max_xref_len, the document-wide xref length. ctx is unused. */
int pdf_xref_len(fz_context *ctx, pdf_document *doc)
{
	return doc->max_xref_len;
}
166 
167 /* Ensure that the given xref has a single subsection
168  * that covers the entire range. */
169 static void
ensure_solid_xref(fz_context * ctx,pdf_document * doc,int num,int which)170 ensure_solid_xref(fz_context *ctx, pdf_document *doc, int num, int which)
171 {
172 	pdf_xref *xref = &doc->xref_sections[which];
173 	pdf_xref_subsec *sub = xref->subsec;
174 	pdf_xref_subsec *new_sub;
175 
176 	if (num < xref->num_objects)
177 		num = xref->num_objects;
178 
179 	if (sub != NULL && sub->next == NULL && sub->start == 0 && sub->len >= num)
180 		return;
181 
182 	new_sub = fz_malloc_struct(ctx, pdf_xref_subsec);
183 	fz_try(ctx)
184 	{
185 		new_sub->table = fz_calloc(ctx, num, sizeof(pdf_xref_entry));
186 		new_sub->start = 0;
187 		new_sub->len = num;
188 		new_sub->next = NULL;
189 	}
190 	fz_catch(ctx)
191 	{
192 		fz_free(ctx, new_sub);
193 		fz_rethrow(ctx);
194 	}
195 
196 	/* Move objects over to the new subsection and destroy the old
197 	 * ones */
198 	sub = xref->subsec;
199 	while (sub != NULL)
200 	{
201 		pdf_xref_subsec *next = sub->next;
202 		int i;
203 
204 		for (i = 0; i < sub->len; i++)
205 		{
206 			new_sub->table[i+sub->start] = sub->table[i];
207 		}
208 		fz_free(ctx, sub->table);
209 		fz_free(ctx, sub);
210 		sub = next;
211 	}
212 	xref->num_objects = num;
213 	xref->subsec = new_sub;
214 	if (doc->max_xref_len < num)
215 		extend_xref_index(ctx, doc, num);
216 }
217 
pdf_get_populating_xref_entry(fz_context * ctx,pdf_document * doc,int num)218 pdf_xref_entry *pdf_get_populating_xref_entry(fz_context *ctx, pdf_document *doc, int num)
219 {
220 	/* Return an entry within the xref currently being populated */
221 	pdf_xref *xref;
222 	pdf_xref_subsec *sub;
223 
224 	if (doc->num_xref_sections == 0)
225 	{
226 		doc->xref_sections = fz_malloc_struct(ctx, pdf_xref);
227 		doc->num_xref_sections = 1;
228 	}
229 
230 	/* Prevent accidental heap underflow */
231 	if (num < 0 || num > PDF_MAX_OBJECT_NUMBER)
232 		fz_throw(ctx, FZ_ERROR_GENERIC, "object number out of range (%d)", num);
233 
234 	/* Return the pointer to the entry in the last section. */
235 	xref = &doc->xref_sections[doc->num_xref_sections-1];
236 
237 	for (sub = xref->subsec; sub != NULL; sub = sub->next)
238 	{
239 		if (num >= sub->start && num < sub->start + sub->len)
240 			return &sub->table[num-sub->start];
241 	}
242 
243 	/* We've been asked for an object that's not in a subsec. */
244 	ensure_solid_xref(ctx, doc, num+1, doc->num_xref_sections-1);
245 	xref = &doc->xref_sections[doc->num_xref_sections-1];
246 	sub = xref->subsec;
247 
248 	return &sub->table[num-sub->start];
249 }
250 
pdf_get_xref_entry(fz_context * ctx,pdf_document * doc,int i)251 pdf_xref_entry *pdf_get_xref_entry(fz_context *ctx, pdf_document *doc, int i)
252 {
253 	pdf_xref *xref = NULL;
254 	pdf_xref_subsec *sub;
255 	int j;
256 
257 	if (i < 0)
258 		fz_throw(ctx, FZ_ERROR_GENERIC, "Negative object number requested");
259 
260 	if (i <= doc->max_xref_len)
261 		j = doc->xref_index[i];
262 	else
263 		j = 0;
264 
265 	/* We may be accessing an earlier version of the document using xref_base
266 	 * and j may be an index into a later xref section */
267 	if (doc->xref_base > j)
268 		j = doc->xref_base;
269 
270 	/* Find the first xref section where the entry is defined. */
271 	for (; j < doc->num_xref_sections; j++)
272 	{
273 		xref = &doc->xref_sections[j];
274 
275 		if (i < xref->num_objects)
276 		{
277 			for (sub = xref->subsec; sub != NULL; sub = sub->next)
278 			{
279 				pdf_xref_entry *entry;
280 
281 				if (i < sub->start || i >= sub->start + sub->len)
282 					continue;
283 
284 				entry = &sub->table[i - sub->start];
285 				if (entry->type)
286 				{
287 					/* Don't update xref_index if xref_base may have
288 					 * influenced the value of j */
289 					if (doc->xref_base == 0)
290 						doc->xref_index[i] = j;
291 					return entry;
292 				}
293 			}
294 		}
295 	}
296 
297 	/* Didn't find the entry in any section. Return the entry from
298 	 * the final section. */
299 	doc->xref_index[i] = 0;
300 	if (xref == NULL || i < xref->num_objects)
301 	{
302 		xref = &doc->xref_sections[doc->xref_base];
303 		for (sub = xref->subsec; sub != NULL; sub = sub->next)
304 		{
305 			if (i >= sub->start && i < sub->start + sub->len)
306 				return &sub->table[i - sub->start];
307 		}
308 	}
309 
310 	/* At this point, we solidify the xref. This ensures that we
311 	 * can return a pointer. This is the only case where this function
312 	 * might throw an exception, and it will never happen when we are
313 	 * working within a 'solid' xref. */
314 	ensure_solid_xref(ctx, doc, i+1, 0);
315 	xref = &doc->xref_sections[0];
316 	sub = xref->subsec;
317 	return &sub->table[i - sub->start];
318 }
319 
/*
	Ensure we have an incremental xref section where we can store
	updated versions of indirect objects. This is a new xref section
	consisting of a single xref subsection.

	When a new section is needed it is inserted at index 0 of
	doc->xref_sections (all existing sections move up by one), it gets
	a zero-initialised table with as many entries as the previous top
	section had objects, and its trailer is a copy of the previous top
	section's trailer. Nothing happens if doc->disallow_new_increments
	is set.
*/
static void ensure_incremental_xref(fz_context *ctx, pdf_document *doc)
{
	/* If there are as yet no incremental sections, or if the most recent
	 * one has been used to sign a signature field, then we need a new one.
	 * After a signing, any further document changes require a new increment */
	if ((doc->num_incremental_sections == 0 || doc->xref_sections[0].unsaved_sigs != NULL)
		&& !doc->disallow_new_increments)
	{
		pdf_xref *xref = &doc->xref_sections[0];
		pdf_xref *pxref;
		/* Allocated before the fz_try: if this throws there is nothing
		 * else to clean up yet. */
		pdf_xref_entry *new_table = fz_calloc(ctx, xref->num_objects, sizeof(pdf_xref_entry));
		pdf_xref_subsec *sub = NULL;
		pdf_obj *trailer = NULL;
		int i;

		fz_var(trailer);
		fz_var(sub);
		fz_try(ctx)
		{
			sub = fz_malloc_struct(ctx, pdf_xref_subsec);
			trailer = xref->trailer ? pdf_copy_dict(ctx, xref->trailer) : NULL;
			doc->xref_sections = fz_realloc_array(ctx, doc->xref_sections, doc->num_xref_sections + 1, pdf_xref);
			/* The array may have moved; re-derive the pointers. */
			xref = &doc->xref_sections[0];
			pxref = &doc->xref_sections[1];
			/* Shift every existing section up one slot. Slot 0 still
			 * holds a bitwise copy of the old top section, which is
			 * overwritten field by field below. */
			memmove(pxref, xref, doc->num_xref_sections * sizeof(pdf_xref));
			/* xref->num_objects is already correct */
			xref->subsec = sub;
			/* Ownership of sub has passed to the section. */
			sub = NULL;
			xref->trailer = trailer;
			xref->pre_repair_trailer = NULL;
			xref->unsaved_sigs = NULL;
			xref->unsaved_sigs_end = NULL;
			xref->subsec->next = NULL;
			xref->subsec->len = xref->num_objects;
			xref->subsec->start = 0;
			xref->subsec->table = new_table;
			doc->num_xref_sections++;
			doc->num_incremental_sections++;
		}
		fz_catch(ctx)
		{
			fz_free(ctx, sub);
			fz_free(ctx, new_table);
			pdf_drop_obj(ctx, trailer);
			fz_rethrow(ctx);
		}

		/* Update the xref_index: every section index cached there is
		 * now one higher because of the insertion at slot 0. */
		for (i = 0; i < doc->max_xref_len; i++)
		{
			doc->xref_index[i]++;
		}
	}
}
379 
380 /* Used when altering a document */
pdf_get_incremental_xref_entry(fz_context * ctx,pdf_document * doc,int i)381 static pdf_xref_entry *pdf_get_incremental_xref_entry(fz_context *ctx, pdf_document *doc, int i)
382 {
383 	pdf_xref *xref;
384 	pdf_xref_subsec *sub;
385 
386 	/* Make a new final xref section if we haven't already */
387 	ensure_incremental_xref(ctx, doc);
388 
389 	xref = &doc->xref_sections[doc->xref_base];
390 	if (i >= xref->num_objects)
391 		pdf_resize_xref(ctx, doc, i + 1);
392 
393 	sub = xref->subsec;
394 	assert(sub != NULL && sub->next == NULL);
395 	assert(i >= sub->start && i < sub->start + sub->len);
396 	doc->xref_index[i] = 0;
397 	return &sub->table[i - sub->start];
398 }
399 
/*
 * Return 1 if object 'num' has a populated (non-zero type) entry in the
 * xref section selected by doc->xref_base, 0 otherwise. That section is
 * expected to consist of one complete subsection starting at 0.
 */
int pdf_xref_is_incremental(fz_context *ctx, pdf_document *doc, int num)
{
	pdf_xref *section = &doc->xref_sections[doc->xref_base];
	pdf_xref_subsec *only = section->subsec;

	assert(only != NULL && only->next == NULL && only->len == section->num_objects && only->start == 0);

	if (num >= section->num_objects)
		return 0;
	return only->table[num].type != 0;
}
409 
/*
 * Append a (field, signer) pair to the unsaved-signature list of xref
 * section 0, keeping a reference to both. The details are recorded so
 * that contents and byte_range can be updated with their correct values
 * at saving time.
 */
void pdf_xref_store_unsaved_signature(fz_context *ctx, pdf_document *doc, pdf_obj *field, pdf_pkcs7_signer *signer)
{
	pdf_xref *top = &doc->xref_sections[0];
	pdf_unsaved_sig *node = fz_malloc_struct(ctx, pdf_unsaved_sig);

	node->field = pdf_keep_obj(ctx, field);
	node->signer = signer->keep(ctx, signer);
	node->next = NULL;

	/* On first use the tail pointer has to be aimed at the list head. */
	if (!top->unsaved_sigs_end)
		top->unsaved_sigs_end = &top->unsaved_sigs;

	/* Link at the tail, then advance the tail to the new node's link. */
	*top->unsaved_sigs_end = node;
	top->unsaved_sigs_end = &node->next;
}
428 
/*
 * Return 1 if 'obj' is (by pointer identity) the field of any unsaved
 * signature recorded in one of the incremental xref sections, else 0.
 */
int pdf_xref_obj_is_unsaved_signature(pdf_document *doc, pdf_obj *obj)
{
	int s = 0;

	while (s < doc->num_incremental_sections)
	{
		pdf_unsaved_sig *sig = doc->xref_sections[s++].unsaved_sigs;

		while (sig != NULL)
		{
			if (sig->field == obj)
				return 1;
			sig = sig->next;
		}
	}

	return 0;
}
446 
/*
 * Make the last xref section of the document solid (a single subsection
 * starting at 0) with room for at least 'num' objects, creating an empty
 * first section beforehand if the document has none.
 */
void pdf_ensure_solid_xref(fz_context *ctx, pdf_document *doc, int num)
{
	/* There has to be a section before it can be solidified. */
	if (!doc->num_xref_sections)
		pdf_populate_next_xref_level(ctx, doc);

	ensure_solid_xref(ctx, doc, num, doc->num_xref_sections - 1);
}
454 
/*
 * Make sure object 'num' lives in the incremental xref section.
 *
 * An incremental section is created if necessary. The object's current
 * entry is then located; if it is found in a section other than section
 * 0, the entry is copied into the incremental section. When the old
 * entry was itself in an incremental section it keeps a deep copy of the
 * object, otherwise its object pointer is cleared. The old entry's
 * stream buffer pointer is cleared in either case, since the new entry
 * now holds it.
 *
 * Nothing is moved if the object is not defined anywhere, is already in
 * section 0, or if 'num' is outside the range covered by the xref index.
 */
void pdf_xref_ensure_incremental_object(fz_context *ctx, pdf_document *doc, int num)
{
	pdf_xref_entry *new_entry, *old_entry;
	pdf_xref_subsec *sub = NULL;
	int i;

	/* Make sure we have created an xref section for incremental updates */
	ensure_incremental_xref(ctx, doc);

	/* xref_index has max_xref_len slots. An object number outside that
	 * range cannot be defined in any section, so there is nothing to
	 * move, and indexing xref_index with it would be out of bounds. */
	if (num < 0 || num >= doc->max_xref_len)
		return;

	/* Search for the section that contains this object */
	for (i = doc->xref_index[num]; i < doc->num_xref_sections; i++)
	{
		pdf_xref *xref = &doc->xref_sections[i];

		/* This section is too short to define the object, but an
		 * older one may still do so (pdf_get_xref_entry skips
		 * sections the same way). */
		if (num >= xref->num_objects)
			continue;
		for (sub = xref->subsec; sub != NULL; sub = sub->next)
		{
			if (sub->start <= num && num < sub->start + sub->len && sub->table[num - sub->start].type)
				break;
		}
		if (sub != NULL)
			break;
	}
	/* sub == NULL implies we did not find it */

	/* If we don't find it, or it's already in the incremental section, return */
	if (i == 0 || sub == NULL)
		return;

	/* Move the object to the incremental section */
	doc->xref_index[num] = 0;
	old_entry = &sub->table[num - sub->start];
	new_entry = pdf_get_incremental_xref_entry(ctx, doc, num);
	*new_entry = *old_entry;
	if (i < doc->num_incremental_sections)
	{
		/* old entry is incremental and may have changes.
		 * Better keep a copy. We must override the old entry with
		 * the copy because the caller may be holding a reference to
		 * the original and expect it to end up in the new entry */
		old_entry->obj = pdf_deep_copy_obj(ctx, old_entry->obj);
	}
	else
	{
		old_entry->obj = NULL;
	}
	old_entry->stm_buf = NULL;
}
504 
/*
 * Replace all xref sections of the document with a single section built
 * around 'entries' (n entries, adopted as the table, not copied). The
 * current trailer is carried over, all previous sections (live and
 * saved) are dropped, and the xref index is replaced by a fresh zeroed
 * one of length n.
 */
void pdf_replace_xref(fz_context *ctx, pdf_document *doc, pdf_xref_entry *entries, int n)
{
	int *fresh_index = NULL;
	pdf_xref *section = NULL;
	pdf_xref_subsec *sub;

	fz_var(fresh_index);
	fz_var(section);

	/* Do all the allocation up front so that nothing in the document
	 * has been touched yet if any of it fails. */
	fz_try(ctx)
	{
		fresh_index = fz_calloc(ctx, n, sizeof(int));
		section = fz_malloc_struct(ctx, pdf_xref);
		sub = fz_malloc_struct(ctx, pdf_xref_subsec);
	}
	fz_catch(ctx)
	{
		fz_free(ctx, section);
		fz_free(ctx, fresh_index);
		fz_rethrow(ctx);
	}

	/* One subsection spanning the whole table. */
	sub->start = 0;
	sub->len = n;
	sub->table = entries;

	section->num_objects = n;
	section->subsec = sub;
	/* Take our reference to the trailer before the old sections go. */
	section->trailer = pdf_keep_obj(ctx, pdf_trailer(ctx, doc));

	/* The new table completely replaces the previous separate sections */
	pdf_drop_xref_sections(ctx, doc);

	doc->xref_sections = section;
	doc->num_xref_sections = 1;
	doc->num_incremental_sections = 0;
	doc->xref_base = 0;
	doc->disallow_new_increments = 0;
	doc->max_xref_len = n;

	/* Swap in the new index. */
	fz_free(ctx, doc->xref_index);
	doc->xref_index = fresh_index;
}
548 
/*
 * Set the current xref sections aside (as doc->saved_xref_sections,
 * dropping any previously saved set) and start again with a single new
 * section that contains an entry for object 0 and carries over the
 * current trailer.
 *
 * Throws if the new section cannot be created; in that case the kept
 * trailer reference is released again.
 */
void pdf_forget_xref(fz_context *ctx, pdf_document *doc)
{
	pdf_obj *trailer = pdf_keep_obj(ctx, pdf_trailer(ctx, doc));

	if (doc->saved_xref_sections)
		pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections);

	doc->saved_xref_sections = doc->xref_sections;
	doc->saved_num_xref_sections = doc->num_xref_sections;

	/* Ownership of the array has moved to saved_xref_sections. Clear the
	 * live pointer so the two never alias: otherwise, if the allocation
	 * below throws, a later drop would free the same array twice. */
	doc->xref_sections = NULL;
	doc->startxref = 0;
	doc->num_xref_sections = 0;
	doc->num_incremental_sections = 0;
	doc->xref_base = 0;
	doc->disallow_new_increments = 0;

	fz_try(ctx)
	{
		/* With no sections present this allocates the first one. */
		pdf_get_populating_xref_entry(ctx, doc, 0);
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(ctx, trailer);
		fz_rethrow(ctx);
	}

	/* Set the trailer of the final xref section. */
	doc->xref_sections[0].trailer = trailer;
}
578 
579 /*
580  * magic version tag and startxref
581  */
582 
583 int
pdf_version(fz_context * ctx,pdf_document * doc)584 pdf_version(fz_context *ctx, pdf_document *doc)
585 {
586 	int version = doc->version;
587 	fz_try(ctx)
588 	{
589 		pdf_obj *obj = pdf_dict_getl(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), PDF_NAME(Version), NULL);
590 		const char *str = pdf_to_name(ctx, obj);
591 		if (*str)
592 			version = 10 * (fz_atof(str) + 0.05f);
593 	}
594 	fz_catch(ctx)
595 	{
596 		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
597 		fz_warn(ctx, "Ignoring broken Root/Version number.");
598 	}
599 	return version;
600 }
601 
602 static void
pdf_load_version(fz_context * ctx,pdf_document * doc)603 pdf_load_version(fz_context *ctx, pdf_document *doc)
604 {
605 	char buf[20];
606 
607 	fz_seek(ctx, doc->file, 0, SEEK_SET);
608 	fz_read_line(ctx, doc->file, buf, sizeof buf);
609 	if (strlen(buf) < 5 || memcmp(buf, "%PDF-", 5) != 0)
610 		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot recognize version marker");
611 
612 	doc->version = 10 * (fz_atof(buf+5) + 0.05f);
613 	if (doc->version < 10 || doc->version > 17)
614 		if (doc->version != 20)
615 			fz_warn(ctx, "unknown PDF version: %d.%d", doc->version / 10, doc->version % 10);
616 }
617 
618 static void
pdf_read_start_xref(fz_context * ctx,pdf_document * doc)619 pdf_read_start_xref(fz_context *ctx, pdf_document *doc)
620 {
621 	unsigned char buf[1024];
622 	size_t i, n;
623 	int64_t t;
624 
625 	fz_seek(ctx, doc->file, 0, SEEK_END);
626 
627 	doc->file_size = fz_tell(ctx, doc->file);
628 
629 	t = fz_maxi64(0, doc->file_size - (int64_t)sizeof buf);
630 	fz_seek(ctx, doc->file, t, SEEK_SET);
631 
632 	n = fz_read(ctx, doc->file, buf, sizeof buf);
633 	if (n < 9)
634 		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find startxref");
635 
636 	i = n - 9;
637 	do
638 	{
639 		if (memcmp(buf + i, "startxref", 9) == 0)
640 		{
641 			i += 9;
642 			while (i < n && iswhite(buf[i]))
643 				i ++;
644 			doc->startxref = 0;
645 			while (i < n && isdigit(buf[i]))
646 			{
647 				if (doc->startxref >= INT64_MAX/10)
648 					fz_throw(ctx, FZ_ERROR_GENERIC, "startxref too large");
649 				doc->startxref = doc->startxref * 10 + (buf[i++] - '0');
650 			}
651 			if (doc->startxref != 0)
652 				return;
653 			break;
654 		}
655 	} while (i-- > 0);
656 
657 	fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find startxref");
658 }
659 
660 static void
fz_skip_space(fz_context * ctx,fz_stream * stm)661 fz_skip_space(fz_context *ctx, fz_stream *stm)
662 {
663 	do
664 	{
665 		int c = fz_peek_byte(ctx, stm);
666 		if (c == EOF || c > 32)
667 			return;
668 		(void)fz_read_byte(ctx, stm);
669 	}
670 	while (1);
671 }
672 
/*
 * Try to consume the literal 'str' from the stream. Returns 0 when the
 * whole string was matched and consumed. Returns 1 on EOF or at the
 * first mismatching byte; the bytes matched up to that point have
 * already been consumed, the mismatching byte has not.
 */
static int fz_skip_string(fz_context *ctx, fz_stream *stm, const char *str)
{
	const char *want;

	for (want = str; *want != '\0'; want++)
	{
		int c = fz_peek_byte(ctx, stm);

		if (c == EOF || c != *want)
			return 1;
		(void)fz_read_byte(ctx, stm);
	}

	return 0;
}
684 
685 /*
686  * trailer dictionary
687  */
688 
/*
 * Skim over an old-style (textual) xref table that starts at the current
 * file position, parse the trailer dictionary that follows it, and
 * return the value of its Size entry.
 *
 * The xref entries themselves are not parsed; each subsection is skipped
 * by seeking past len entries. On success the file position is put back
 * to where it was on entry; when an error is thrown it is not.
 *
 * Throws if the xref marker, a subsection length, the trailer marker or
 * the trailer dictionary is missing or malformed, or if Size is out of
 * range.
 */
static int
pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
{
	int len;
	char *s;
	int64_t t;
	pdf_token tok;
	int c;
	int size = 0;
	int64_t ofs;
	pdf_obj *trailer = NULL;
	size_t n;

	fz_var(trailer);

	/* Record the current file read offset so that we can reinstate it */
	ofs = fz_tell(ctx, doc->file);

	fz_skip_space(ctx, doc->file);
	if (fz_skip_string(ctx, doc->file, "xref"))
		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find xref marker");
	fz_skip_space(ctx, doc->file);

	/* One iteration per subsection; a subsection header starts with a digit. */
	while (1)
	{
		c = fz_peek_byte(ctx, doc->file);
		if (!isdigit(c))
			break;

		/* Header line: "<start> <len>". Only len matters here. */
		fz_read_line(ctx, doc->file, buf->scratch, buf->size);
		s = buf->scratch;
		fz_strsep(&s, " "); /* ignore start */
		if (!s)
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length missing");
		len = fz_atoi(fz_strsep(&s, " "));
		if (len < 0)
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length must be positive");

		/* broken pdfs where the section is not on a separate line */
		/* NOTE(review): presumably this rewinds over the text left on the
		 * header line so it is re-read as entry data; the extra 2 looks
		 * like an allowance for separator/EOL bytes - confirm. */
		if (s && *s != '\0')
			fz_seek(ctx, doc->file, -(2 + (int)strlen(s)), SEEK_CUR);

		t = fz_tell(ctx, doc->file);
		if (t < 0)
			fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");

		/* Spec says xref entries should be 20 bytes, but it's not infrequent
		 * to see 19, in particular for some PCLm drivers. Cope. */
		if (len > 0)
		{
			/* Probe the first entry to decide the per-entry width n:
			 * if the 20th byte is not white space/control (> 32) it
			 * already belongs to the next entry, so entries are 19
			 * bytes wide. */
			n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, 20);
			if (n < 19)
				fz_throw(ctx, FZ_ERROR_GENERIC, "malformed xref table");
			if (n == 20 && buf->scratch[19] > 32)
				n = 19;
		}
		else
			n = 20;

		/* Reject lengths for which t + n * len would overflow int64_t. */
		if (len > (int64_t)((INT64_MAX - t) / n))
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref has too many entries");

		/* Jump over the whole subsection, measured from its first entry. */
		fz_seek(ctx, doc->file, t + n * (int64_t)len, SEEK_SET);
	}

	fz_try(ctx)
	{
		tok = pdf_lex(ctx, doc->file, buf);
		if (tok != PDF_TOK_TRAILER)
			fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer marker");

		tok = pdf_lex(ctx, doc->file, buf);
		if (tok != PDF_TOK_OPEN_DICT)
			fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer dictionary");

		trailer = pdf_parse_dict(ctx, doc, doc->file, buf);

		size = pdf_dict_get_int(ctx, trailer, PDF_NAME(Size));
		if (size < 0 || size > PDF_MAX_OBJECT_NUMBER + 1)
			fz_throw(ctx, FZ_ERROR_GENERIC, "trailer Size entry out of range");
	}
	fz_always(ctx)
	{
		/* The dictionary was only needed to read Size. */
		pdf_drop_obj(ctx, trailer);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}

	/* Reinstate the read offset recorded on entry. */
	fz_seek(ctx, doc->file, ofs, SEEK_SET);

	return size;
}
783 
784 static pdf_xref_entry *
pdf_xref_find_subsection(fz_context * ctx,pdf_document * doc,int start,int len)785 pdf_xref_find_subsection(fz_context *ctx, pdf_document *doc, int start, int len)
786 {
787 	pdf_xref *xref = &doc->xref_sections[doc->num_xref_sections-1];
788 	pdf_xref_subsec *sub;
789 	int num_objects;
790 
791 	/* Different cases here. Case 1) We might be asking for a
792 	 * subsection (or a subset of a subsection) that we already
793 	 * have - Just return it. Case 2) We might be asking for a
794 	 * completely new subsection - Create it and return it.
795 	 * Case 3) We might have an overlapping one - Create a 'solid'
796 	 * subsection and return that. */
797 
798 	/* Sanity check */
799 	for (sub = xref->subsec; sub != NULL; sub = sub->next)
800 	{
801 		if (start >= sub->start && start + len <= sub->start + sub->len)
802 			return &sub->table[start-sub->start]; /* Case 1 */
803 		if (start + len > sub->start && start <= sub->start + sub->len)
804 			break; /* Case 3 */
805 	}
806 
807 	num_objects = xref->num_objects;
808 	if (num_objects < start + len)
809 		num_objects = start + len;
810 
811 	if (sub == NULL)
812 	{
813 		/* Case 2 */
814 		sub = fz_malloc_struct(ctx, pdf_xref_subsec);
815 		fz_try(ctx)
816 		{
817 			sub->table = fz_calloc(ctx, len, sizeof(pdf_xref_entry));
818 			sub->start = start;
819 			sub->len = len;
820 			sub->next = xref->subsec;
821 			xref->subsec = sub;
822 		}
823 		fz_catch(ctx)
824 		{
825 			fz_free(ctx, sub);
826 			fz_rethrow(ctx);
827 		}
828 		xref->num_objects = num_objects;
829 		if (doc->max_xref_len < num_objects)
830 			extend_xref_index(ctx, doc, num_objects);
831 	}
832 	else
833 	{
834 		/* Case 3 */
835 		ensure_solid_xref(ctx, doc, num_objects, doc->num_xref_sections-1);
836 		xref = &doc->xref_sections[doc->num_xref_sections-1];
837 		sub = xref->subsec;
838 	}
839 	return &sub->table[start-sub->start];
840 }
841 
/*
 * Parse an old-style (textual) xref table starting at the current file
 * position into the xref section currently being populated, then parse
 * and return the trailer dictionary that follows it.
 *
 * Entries whose type is already set are left untouched. Sets
 * doc->has_old_style_xrefs.
 *
 * Throws on a missing xref marker, out-of-range subsection numbers,
 * truncated or malformed entries, or a missing trailer.
 */
static pdf_obj *
pdf_read_old_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
{
	int start, len, c, i, xref_len, carried;
	fz_stream *file = doc->file;
	pdf_xref_entry *table;
	pdf_token tok;
	size_t n;
	char *s, *e;

	/* Read the trailer's Size first; it is only used below to warn
	 * about subsections that extend beyond it. */
	xref_len = pdf_xref_size_from_old_trailer(ctx, doc, buf);

	fz_skip_space(ctx, doc->file);
	if (fz_skip_string(ctx, doc->file, "xref"))
		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find xref marker");
	fz_skip_space(ctx, doc->file);

	/* One iteration per subsection; a subsection header starts with a digit. */
	while (1)
	{
		c = fz_peek_byte(ctx, file);
		if (!isdigit(c))
			break;

		/* Header line: "<start> <len>". */
		fz_read_line(ctx, file, buf->scratch, buf->size);
		s = buf->scratch;
		start = fz_atoi(fz_strsep(&s, " "));
		len = fz_atoi(fz_strsep(&s, " "));

		/* broken pdfs where the section is not on a separate line */
		if (s && *s != '\0')
		{
			fz_warn(ctx, "broken xref subsection. proceeding anyway.");
			fz_seek(ctx, file, -(2 + (int)strlen(s)), SEEK_CUR);
		}

		if (start < 0 || start > PDF_MAX_OBJECT_NUMBER
				|| len < 0 || len > PDF_MAX_OBJECT_NUMBER
				|| start + len - 1 > PDF_MAX_OBJECT_NUMBER)
		{
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection object numbers are out of range");
		}
		/* broken pdfs where size in trailer undershoots entries in xref sections */
		if (start + len > xref_len)
		{
			fz_warn(ctx, "broken xref subsection, proceeding anyway.");
		}

		table = pdf_xref_find_subsection(ctx, doc, start, len);

		/* Xref entries SHOULD be 20 bytes long, but we see 19 byte
		 * ones more frequently than we'd like (e.g. PCLm drivers).
		 * Cope with this by 'carrying' data forward. */
		carried = 0;
		for (i = 0; i < len; i++)
		{
			pdf_xref_entry *entry = &table[i];
			/* Top the scratch buffer up to 20 bytes, after any byte
			 * carried over from the previous entry. */
			n = fz_read(ctx, file, (unsigned char *) buf->scratch + carried, 20-carried);
			if (n != (size_t)(20-carried))
				fz_throw(ctx, FZ_ERROR_GENERIC, "unexpected EOF in xref table");
			n += carried;
			buf->scratch[n] = '\0';
			/* Only fill in entries that have not been defined yet. */
			if (!entry->type)
			{
				s = buf->scratch;
				e = s + n;

				entry->num = start + i;

				/* broken pdfs where line start with white space */
				while (s < e && iswhite(*s))
					s++;

				/* First field: decimal offset. */
				if (s == e || !isdigit(*s))
					fz_throw(ctx, FZ_ERROR_GENERIC, "xref offset missing");
				while (s < e && isdigit(*s))
					entry->ofs = entry->ofs * 10 + *s++ - '0';

				/* Second field: decimal generation number. */
				while (s < e && iswhite(*s))
					s++;
				if (s == e || !isdigit(*s))
					fz_throw(ctx, FZ_ERROR_GENERIC, "xref generation number missing");
				while (s < e && isdigit(*s))
					entry->gen = entry->gen * 10 + *s++ - '0';

				/* Third field: the type letter, stored as is. */
				while (s < e && iswhite(*s))
					s++;
				if (s == e || (*s != 'f' && *s != 'n' && *s != 'o'))
					fz_throw(ctx, FZ_ERROR_GENERIC, "unexpected xref type: 0x%x (%d %d R)", s == e ? 0 : *s, entry->num, entry->gen);
				entry->type = *s++;

				/* If the last byte of our buffer isn't an EOL (or space), carry one byte forward */
				carried = buf->scratch[19] > 32;
				if (carried)
					buf->scratch[0] = buf->scratch[19];
			}
		}
		/* A byte still carried after the last entry belongs to whatever
		 * follows the subsection, so push it back. */
		if (carried)
			fz_unread_byte(ctx, file);
	}

	tok = pdf_lex(ctx, file, buf);
	if (tok != PDF_TOK_TRAILER)
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer marker");

	tok = pdf_lex(ctx, file, buf);
	if (tok != PDF_TOK_OPEN_DICT)
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer dictionary");

	doc->has_old_style_xrefs = 1;

	return pdf_parse_dict(ctx, doc, file, buf);
}
954 
955 static void
pdf_read_new_xref_section(fz_context * ctx,pdf_document * doc,fz_stream * stm,int i0,int i1,int w0,int w1,int w2)956 pdf_read_new_xref_section(fz_context *ctx, pdf_document *doc, fz_stream *stm, int i0, int i1, int w0, int w1, int w2)
957 {
958 	pdf_xref_entry *table;
959 	int i, n;
960 
961 	if (i0 < 0 || i0 > PDF_MAX_OBJECT_NUMBER || i1 < 0 || i1 > PDF_MAX_OBJECT_NUMBER || i0 + i1 - 1 > PDF_MAX_OBJECT_NUMBER)
962 		fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection object numbers are out of range");
963 
964 	table = pdf_xref_find_subsection(ctx, doc, i0, i1);
965 	for (i = i0; i < i0 + i1; i++)
966 	{
967 		pdf_xref_entry *entry = &table[i-i0];
968 		int a = 0;
969 		int64_t b = 0;
970 		int c = 0;
971 
972 		if (fz_is_eof(ctx, stm))
973 			fz_throw(ctx, FZ_ERROR_GENERIC, "truncated xref stream");
974 
975 		for (n = 0; n < w0; n++)
976 			a = (a << 8) + fz_read_byte(ctx, stm);
977 		for (n = 0; n < w1; n++)
978 			b = (b << 8) + fz_read_byte(ctx, stm);
979 		for (n = 0; n < w2; n++)
980 			c = (c << 8) + fz_read_byte(ctx, stm);
981 
982 		if (!entry->type)
983 		{
984 			int t = w0 ? a : 1;
985 			entry->type = t == 0 ? 'f' : t == 1 ? 'n' : t == 2 ? 'o' : 0;
986 			entry->ofs = w1 ? b : 0;
987 			entry->gen = w2 ? c : 0;
988 			entry->num = i;
989 		}
990 	}
991 
992 	doc->has_xref_streams = 1;
993 }
994 
995 /* Entered with file locked, remains locked throughout. */
996 static pdf_obj *
pdf_read_new_xref(fz_context * ctx,pdf_document * doc,pdf_lexbuf * buf)997 pdf_read_new_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
998 {
999 	fz_stream *stm = NULL;
1000 	pdf_obj *trailer = NULL;
1001 	pdf_obj *index = NULL;
1002 	pdf_obj *obj = NULL;
1003 	int gen, num = 0;
1004 	int64_t ofs, stm_ofs;
1005 	int size, w0, w1, w2;
1006 	int t;
1007 
1008 	fz_var(trailer);
1009 	fz_var(stm);
1010 
1011 	fz_try(ctx)
1012 	{
1013 		ofs = fz_tell(ctx, doc->file);
1014 		trailer = pdf_parse_ind_obj(ctx, doc, doc->file, buf, &num, &gen, &stm_ofs, NULL);
1015 	}
1016 	fz_catch(ctx)
1017 	{
1018 		pdf_drop_obj(ctx, trailer);
1019 		fz_rethrow(ctx);
1020 	}
1021 
1022 	fz_try(ctx)
1023 	{
1024 		pdf_xref_entry *entry;
1025 
1026 		obj = pdf_dict_get(ctx, trailer, PDF_NAME(Size));
1027 		if (!obj)
1028 			fz_throw(ctx, FZ_ERROR_GENERIC, "xref stream missing Size entry (%d 0 R)", num);
1029 
1030 		size = pdf_to_int(ctx, obj);
1031 
1032 		obj = pdf_dict_get(ctx, trailer, PDF_NAME(W));
1033 		if (!obj)
1034 			fz_throw(ctx, FZ_ERROR_GENERIC, "xref stream missing W entry (%d  R)", num);
1035 		w0 = pdf_array_get_int(ctx, obj, 0);
1036 		w1 = pdf_array_get_int(ctx, obj, 1);
1037 		w2 = pdf_array_get_int(ctx, obj, 2);
1038 
1039 		if (w0 < 0)
1040 			fz_warn(ctx, "xref stream objects have corrupt type");
1041 		if (w1 < 0)
1042 			fz_warn(ctx, "xref stream objects have corrupt offset");
1043 		if (w2 < 0)
1044 			fz_warn(ctx, "xref stream objects have corrupt generation");
1045 
1046 		w0 = w0 < 0 ? 0 : w0;
1047 		w1 = w1 < 0 ? 0 : w1;
1048 		w2 = w2 < 0 ? 0 : w2;
1049 
1050 		index = pdf_dict_get(ctx, trailer, PDF_NAME(Index));
1051 
1052 		stm = pdf_open_stream_with_offset(ctx, doc, num, trailer, stm_ofs);
1053 
1054 		if (!index)
1055 		{
1056 			pdf_read_new_xref_section(ctx, doc, stm, 0, size, w0, w1, w2);
1057 		}
1058 		else
1059 		{
1060 			int n = pdf_array_len(ctx, index);
1061 			for (t = 0; t < n; t += 2)
1062 			{
1063 				int i0 = pdf_array_get_int(ctx, index, t + 0);
1064 				int i1 = pdf_array_get_int(ctx, index, t + 1);
1065 				pdf_read_new_xref_section(ctx, doc, stm, i0, i1, w0, w1, w2);
1066 			}
1067 		}
1068 		entry = pdf_get_populating_xref_entry(ctx, doc, num);
1069 		entry->ofs = ofs;
1070 		entry->gen = gen;
1071 		entry->num = num;
1072 		entry->stm_ofs = stm_ofs;
1073 		pdf_drop_obj(ctx, entry->obj);
1074 		entry->obj = pdf_keep_obj(ctx, trailer);
1075 		entry->type = 'n';
1076 	}
1077 	fz_always(ctx)
1078 	{
1079 		fz_drop_stream(ctx, stm);
1080 	}
1081 	fz_catch(ctx)
1082 	{
1083 		pdf_drop_obj(ctx, trailer);
1084 		fz_rethrow(ctx);
1085 	}
1086 
1087 	return trailer;
1088 }
1089 
1090 static pdf_obj *
pdf_read_xref(fz_context * ctx,pdf_document * doc,int64_t ofs,pdf_lexbuf * buf)1091 pdf_read_xref(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf)
1092 {
1093 	pdf_obj *trailer;
1094 	int c;
1095 
1096 	fz_seek(ctx, doc->file, ofs, SEEK_SET);
1097 
1098 	while (iswhite(fz_peek_byte(ctx, doc->file)))
1099 		fz_read_byte(ctx, doc->file);
1100 
1101 	c = fz_peek_byte(ctx, doc->file);
1102 	if (c == 'x')
1103 		trailer = pdf_read_old_xref(ctx, doc, buf);
1104 	else if (isdigit(c))
1105 		trailer = pdf_read_new_xref(ctx, doc, buf);
1106 	else
1107 		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot recognize xref format");
1108 
1109 	return trailer;
1110 }
1111 
1112 static int64_t
read_xref_section(fz_context * ctx,pdf_document * doc,int64_t ofs,pdf_lexbuf * buf)1113 read_xref_section(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf)
1114 {
1115 	pdf_obj *trailer = NULL;
1116 	pdf_obj *prevobj;
1117 	int64_t xrefstmofs = 0;
1118 	int64_t prevofs = 0;
1119 
1120 	trailer = pdf_read_xref(ctx, doc, ofs, buf);
1121 	fz_try(ctx)
1122 	{
1123 		pdf_set_populating_xref_trailer(ctx, doc, trailer);
1124 
1125 		/* FIXME: do we overwrite free entries properly? */
1126 		/* FIXME: Does this work properly with progression? */
1127 		xrefstmofs = pdf_to_int64(ctx, pdf_dict_get(ctx, trailer, PDF_NAME(XRefStm)));
1128 		if (xrefstmofs)
1129 		{
1130 			if (xrefstmofs < 0)
1131 				fz_throw(ctx, FZ_ERROR_GENERIC, "negative xref stream offset");
1132 
1133 			/*
1134 				Read the XRefStm stream, but throw away the resulting trailer. We do not
1135 				follow any Prev tag therein, as specified on Page 108 of the PDF reference
1136 				1.7
1137 			*/
1138 			pdf_drop_obj(ctx, pdf_read_xref(ctx, doc, xrefstmofs, buf));
1139 		}
1140 
1141 		prevobj = pdf_dict_get(ctx, trailer, PDF_NAME(Prev));
1142 		if (pdf_is_int(ctx, prevobj))
1143 		{
1144 			prevofs = pdf_to_int64(ctx, prevobj);
1145 			if (prevofs <= 0)
1146 				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid offset for previous xref section");
1147 		}
1148 	}
1149 	fz_always(ctx)
1150 		pdf_drop_obj(ctx, trailer);
1151 	fz_catch(ctx)
1152 		fz_rethrow(ctx);
1153 
1154 	return prevofs;
1155 }
1156 
1157 static void
pdf_read_xref_sections(fz_context * ctx,pdf_document * doc,int64_t ofs,pdf_lexbuf * buf,int read_previous)1158 pdf_read_xref_sections(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf, int read_previous)
1159 {
1160 	int i, len, cap;
1161 	int64_t *offsets;
1162 
1163 	len = 0;
1164 	cap = 10;
1165 	offsets = fz_malloc_array(ctx, cap, int64_t);
1166 
1167 	fz_try(ctx)
1168 	{
1169 		while(ofs)
1170 		{
1171 			for (i = 0; i < len; i ++)
1172 			{
1173 				if (offsets[i] == ofs)
1174 					break;
1175 			}
1176 			if (i < len)
1177 			{
1178 				fz_warn(ctx, "ignoring xref section recursion at offset %d", (int)ofs);
1179 				break;
1180 			}
1181 			if (len == cap)
1182 			{
1183 				cap *= 2;
1184 				offsets = fz_realloc_array(ctx, offsets, cap, int64_t);
1185 			}
1186 			offsets[len++] = ofs;
1187 
1188 			pdf_populate_next_xref_level(ctx, doc);
1189 			ofs = read_xref_section(ctx, doc, ofs, buf);
1190 			if (!read_previous)
1191 				break;
1192 		}
1193 	}
1194 	fz_always(ctx)
1195 	{
1196 		fz_free(ctx, offsets);
1197 	}
1198 	fz_catch(ctx)
1199 	{
1200 		fz_rethrow(ctx);
1201 	}
1202 }
1203 
1204 static void
pdf_prime_xref_index(fz_context * ctx,pdf_document * doc)1205 pdf_prime_xref_index(fz_context *ctx, pdf_document *doc)
1206 {
1207 	int i, j;
1208 	int *idx = doc->xref_index;
1209 
1210 	for (i = doc->num_xref_sections-1; i >= 0; i--)
1211 	{
1212 		pdf_xref *xref = &doc->xref_sections[i];
1213 		pdf_xref_subsec *subsec = xref->subsec;
1214 		while (subsec != NULL)
1215 		{
1216 			int start = subsec->start;
1217 			int end = subsec->start + subsec->len;
1218 			for (j = start; j < end; j++)
1219 			{
1220 				char t = subsec->table[j-start].type;
1221 				if (t != 0 && t != 'f')
1222 					idx[j] = i;
1223 			}
1224 
1225 			subsec = subsec->next;
1226 		}
1227 	}
1228 }
1229 
1230 /*
1231  * load xref tables from pdf
1232  *
1233  * File locked on entry, throughout and on exit.
1234  */
1235 
1236 static void
pdf_load_xref(fz_context * ctx,pdf_document * doc,pdf_lexbuf * buf)1237 pdf_load_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
1238 {
1239 	int i;
1240 	int xref_len;
1241 	pdf_xref_entry *entry;
1242 
1243 	pdf_read_start_xref(ctx, doc);
1244 
1245 	pdf_read_xref_sections(ctx, doc, doc->startxref, buf, 1);
1246 
1247 	if (pdf_xref_len(ctx, doc) == 0)
1248 		fz_throw(ctx, FZ_ERROR_GENERIC, "found xref was empty");
1249 
1250 	pdf_prime_xref_index(ctx, doc);
1251 
1252 	entry = pdf_get_xref_entry(ctx, doc, 0);
1253 	/* broken pdfs where first object is missing */
1254 	if (!entry->type)
1255 	{
1256 		entry->type = 'f';
1257 		entry->gen = 65535;
1258 		entry->num = 0;
1259 	}
1260 	/* broken pdfs where first object is not free */
1261 	else if (entry->type != 'f')
1262 		fz_warn(ctx, "first object in xref is not free");
1263 
1264 	/* broken pdfs where object offsets are out of range */
1265 	xref_len = pdf_xref_len(ctx, doc);
1266 	for (i = 0; i < xref_len; i++)
1267 	{
1268 		entry = pdf_get_xref_entry(ctx, doc, i);
1269 		if (entry->type == 'n')
1270 		{
1271 			/* Special case code: "0000000000 * n" means free,
1272 			 * according to some producers (inc Quartz) */
1273 			if (entry->ofs == 0)
1274 				entry->type = 'f';
1275 			else if (entry->ofs <= 0 || entry->ofs >= doc->file_size)
1276 				fz_throw(ctx, FZ_ERROR_GENERIC, "object offset out of range: %d (%d 0 R)", (int)entry->ofs, i);
1277 		}
1278 		if (entry->type == 'o')
1279 		{
1280 			/* Read this into a local variable here, because pdf_get_xref_entry
1281 			 * may solidify the xref, hence invalidating "entry", meaning we
1282 			 * need a stashed value for the throw. */
1283 			int64_t ofs = entry->ofs;
1284 			if (ofs <= 0 || ofs >= xref_len || pdf_get_xref_entry(ctx, doc, ofs)->type != 'n')
1285 				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to an objstm that does not exist: %d (%d 0 R)", (int)ofs, i);
1286 		}
1287 	}
1288 }
1289 
1290 static void
pdf_check_linear(fz_context * ctx,pdf_document * doc)1291 pdf_check_linear(fz_context *ctx, pdf_document *doc)
1292 {
1293 	pdf_obj *dict = NULL;
1294 	pdf_obj *o;
1295 	int num, gen;
1296 	int64_t stmofs;
1297 
1298 	fz_var(dict);
1299 
1300 	fz_try(ctx)
1301 	{
1302 		dict = pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base, &num, &gen, &stmofs, NULL);
1303 		if (!pdf_is_dict(ctx, dict))
1304 			break;
1305 		o = pdf_dict_get(ctx, dict, PDF_NAME(Linearized));
1306 		if (o == NULL)
1307 			break;
1308 		if (pdf_to_int(ctx, o) != 1)
1309 			break;
1310 		doc->has_linearization_object = 1;
1311 	}
1312 	fz_always(ctx)
1313 		pdf_drop_obj(ctx, dict);
1314 	fz_catch(ctx)
1315 	{
1316 		/* Silently swallow this error. */
1317 	}
1318 }
1319 
1320 static void
pdf_load_linear(fz_context * ctx,pdf_document * doc)1321 pdf_load_linear(fz_context *ctx, pdf_document *doc)
1322 {
1323 	pdf_obj *dict = NULL;
1324 	pdf_obj *hint = NULL;
1325 	pdf_obj *o;
1326 	int num, gen, lin, len;
1327 	int64_t stmofs;
1328 
1329 	fz_var(dict);
1330 	fz_var(hint);
1331 
1332 	fz_try(ctx)
1333 	{
1334 		pdf_xref_entry *entry;
1335 
1336 		dict = pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base, &num, &gen, &stmofs, NULL);
1337 		if (!pdf_is_dict(ctx, dict))
1338 			fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to read linearized dictionary");
1339 		o = pdf_dict_get(ctx, dict, PDF_NAME(Linearized));
1340 		if (o == NULL)
1341 			fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to read linearized dictionary");
1342 		lin = pdf_to_int(ctx, o);
1343 		if (lin != 1)
1344 			fz_throw(ctx, FZ_ERROR_GENERIC, "Unexpected version of Linearized tag (%d)", lin);
1345 		doc->has_linearization_object = 1;
1346 		len = pdf_dict_get_int(ctx, dict, PDF_NAME(L));
1347 		if (len != doc->file_length)
1348 			fz_throw(ctx, FZ_ERROR_GENERIC, "File has been updated since linearization");
1349 
1350 		pdf_read_xref_sections(ctx, doc, fz_tell(ctx, doc->file), &doc->lexbuf.base, 0);
1351 
1352 		doc->linear_page_count = pdf_dict_get_int(ctx, dict, PDF_NAME(N));
1353 		doc->linear_page_refs = fz_realloc_array(ctx, doc->linear_page_refs, doc->linear_page_count, pdf_obj *);
1354 		memset(doc->linear_page_refs, 0, doc->linear_page_count * sizeof(pdf_obj*));
1355 		doc->linear_obj = dict;
1356 		doc->linear_pos = fz_tell(ctx, doc->file);
1357 		doc->linear_page1_obj_num = pdf_dict_get_int(ctx, dict, PDF_NAME(O));
1358 		doc->linear_page_refs[0] = pdf_new_indirect(ctx, doc, doc->linear_page1_obj_num, 0);
1359 		doc->linear_page_num = 0;
1360 		hint = pdf_dict_get(ctx, dict, PDF_NAME(H));
1361 		doc->hint_object_offset = pdf_array_get_int(ctx, hint, 0);
1362 		doc->hint_object_length = pdf_array_get_int(ctx, hint, 1);
1363 
1364 		entry = pdf_get_populating_xref_entry(ctx, doc, 0);
1365 		entry->type = 'f';
1366 	}
1367 	fz_catch(ctx)
1368 	{
1369 		pdf_drop_obj(ctx, dict);
1370 		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1371 		/* Drop back to non linearized reading mode */
1372 		doc->file_reading_linearly = 0;
1373 	}
1374 }
1375 
1376 /*
1377  * Initialize and load xref tables.
1378  * If password is not null, try to decrypt.
1379  */
1380 
1381 static void
pdf_init_document(fz_context * ctx,pdf_document * doc)1382 pdf_init_document(fz_context *ctx, pdf_document *doc)
1383 {
1384 	pdf_obj *encrypt, *id;
1385 	pdf_obj *dict = NULL;
1386 	pdf_obj *obj;
1387 	pdf_obj *nobj = NULL;
1388 	int i, repaired = 0;
1389 
1390 	fz_var(dict);
1391 	fz_var(nobj);
1392 
1393 	fz_try(ctx)
1394 	{
1395 		/* Check to see if we should work in progressive mode */
1396 		if (doc->file->progressive)
1397 		{
1398 			doc->file_reading_linearly = 1;
1399 			fz_seek(ctx, doc->file, 0, SEEK_END);
1400 			doc->file_length = fz_tell(ctx, doc->file);
1401 			if (doc->file_length < 0)
1402 				doc->file_length = 0;
1403 			fz_seek(ctx, doc->file, 0, SEEK_SET);
1404 		}
1405 
1406 		pdf_load_version(ctx, doc);
1407 
1408 		/* Try to load the linearized file if we are in progressive
1409 		 * mode. */
1410 		if (doc->file_reading_linearly)
1411 			pdf_load_linear(ctx, doc);
1412 		else
1413 			/* Even if we're not in progressive mode, check to see
1414 			 * if the file claims to be linearized. This is important
1415 			 * for checking signatures later on. */
1416 			pdf_check_linear(ctx, doc);
1417 
1418 		/* If we aren't in progressive mode (or the linear load failed
1419 		 * and has set us back to non-progressive mode), load normally.
1420 		 */
1421 		if (!doc->file_reading_linearly)
1422 			pdf_load_xref(ctx, doc, &doc->lexbuf.base);
1423 	}
1424 	fz_catch(ctx)
1425 	{
1426 		pdf_drop_xref_sections(ctx, doc);
1427 		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1428 		fz_warn(ctx, "trying to repair broken xref");
1429 		repaired = 1;
1430 	}
1431 
1432 	fz_try(ctx)
1433 	{
1434 		int hasroot, hasinfo;
1435 
1436 		if (repaired)
1437 		{
1438 			/* pdf_repair_xref may access xref_index, so reset it properly */
1439 			if (doc->xref_index)
1440 				memset(doc->xref_index, 0, sizeof(int) * doc->max_xref_len);
1441 			pdf_repair_xref(ctx, doc);
1442 			pdf_prime_xref_index(ctx, doc);
1443 		}
1444 
1445 		encrypt = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt));
1446 		id = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID));
1447 		if (pdf_is_dict(ctx, encrypt))
1448 			doc->crypt = pdf_new_crypt(ctx, encrypt, id);
1449 
1450 		/* Allow lazy clients to read encrypted files with a blank password */
1451 		pdf_authenticate_password(ctx, doc, "");
1452 
1453 		if (repaired)
1454 		{
1455 			int xref_len = pdf_xref_len(ctx, doc);
1456 			pdf_repair_obj_stms(ctx, doc);
1457 
1458 			hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL);
1459 			hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL);
1460 
1461 			for (i = 1; i < xref_len && !hasinfo && !hasroot; ++i)
1462 			{
1463 				pdf_xref_entry *entry = pdf_get_xref_entry(ctx, doc, i);
1464 				if (entry->type == 0 || entry->type == 'f')
1465 					continue;
1466 
1467 				fz_try(ctx)
1468 				{
1469 					dict = pdf_load_object(ctx, doc, i);
1470 				}
1471 				fz_catch(ctx)
1472 				{
1473 					fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1474 					fz_warn(ctx, "ignoring broken object (%d 0 R)", i);
1475 					continue;
1476 				}
1477 
1478 				if (!hasroot)
1479 				{
1480 					obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
1481 					if (pdf_name_eq(ctx, obj, PDF_NAME(Catalog)))
1482 					{
1483 						nobj = pdf_new_indirect(ctx, doc, i, 0);
1484 						pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj);
1485 						hasroot = 1;
1486 					}
1487 				}
1488 
1489 				if (!hasinfo)
1490 				{
1491 					if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer)))
1492 					{
1493 						nobj = pdf_new_indirect(ctx, doc, i, 0);
1494 						pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj);
1495 						hasinfo = 1;
1496 					}
1497 				}
1498 
1499 				pdf_drop_obj(ctx, dict);
1500 				dict = NULL;
1501 			}
1502 
1503 			/* ensure that strings are not used in their repaired, non-decrypted form */
1504 			if (doc->crypt)
1505 				pdf_clear_xref(ctx, doc);
1506 		}
1507 	}
1508 	fz_catch(ctx)
1509 	{
1510 		pdf_drop_obj(ctx, dict);
1511 		fz_rethrow(ctx);
1512 	}
1513 
1514 	fz_try(ctx)
1515 	{
1516 		pdf_read_ocg(ctx, doc);
1517 	}
1518 	fz_catch(ctx)
1519 	{
1520 		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1521 		fz_warn(ctx, "Ignoring broken Optional Content configuration");
1522 	}
1523 }
1524 
1525 void
pdf_invalidate_xfa(fz_context * ctx,pdf_document * doc)1526 pdf_invalidate_xfa(fz_context *ctx, pdf_document *doc)
1527 {
1528 	int i;
1529 
1530 	if (doc == NULL)
1531 		return;
1532 
1533 	for (i = 0; i < doc->xfa.count; i++)
1534 	{
1535 		fz_free(ctx, doc->xfa.entries[i].key);
1536 		fz_drop_xml(ctx, doc->xfa.entries[i].value);
1537 	}
1538 	doc->xfa.count = 0;
1539 	fz_free(ctx, doc->xfa.entries);
1540 	doc->xfa.entries = 0;
1541 }
1542 
1543 static void
pdf_drop_document_imp(fz_context * ctx,pdf_document * doc)1544 pdf_drop_document_imp(fz_context *ctx, pdf_document *doc)
1545 {
1546 	int i;
1547 
1548 	fz_defer_reap_start(ctx);
1549 
1550 	/* Type3 glyphs in the glyph cache can contain pdf_obj pointers
1551 	 * that we are about to destroy. Simplest solution is to bin the
1552 	 * glyph cache at this point. */
1553 	fz_try(ctx)
1554 		fz_purge_glyph_cache(ctx);
1555 	fz_catch(ctx)
1556 	{
1557 		/* Swallow error, but continue dropping */
1558 	}
1559 
1560 	pdf_drop_js(ctx, doc->js);
1561 
1562 	pdf_drop_xref_sections(ctx, doc);
1563 	fz_free(ctx, doc->xref_index);
1564 
1565 	fz_drop_stream(ctx, doc->file);
1566 	pdf_drop_crypt(ctx, doc->crypt);
1567 
1568 	pdf_drop_obj(ctx, doc->linear_obj);
1569 	if (doc->linear_page_refs)
1570 	{
1571 		for (i=0; i < doc->linear_page_count; i++)
1572 			pdf_drop_obj(ctx, doc->linear_page_refs[i]);
1573 
1574 		fz_free(ctx, doc->linear_page_refs);
1575 	}
1576 
1577 	fz_free(ctx, doc->hint_page);
1578 	fz_free(ctx, doc->hint_shared_ref);
1579 	fz_free(ctx, doc->hint_shared);
1580 	fz_free(ctx, doc->hint_obj_offsets);
1581 
1582 	for (i=0; i < doc->num_type3_fonts; i++)
1583 	{
1584 		fz_try(ctx)
1585 			fz_decouple_type3_font(ctx, doc->type3_fonts[i], (void *)doc);
1586 		fz_always(ctx)
1587 			fz_drop_font(ctx, doc->type3_fonts[i]);
1588 		fz_catch(ctx)
1589 		{
1590 			/* Swallow error, but continue dropping */
1591 		}
1592 	}
1593 
1594 	fz_free(ctx, doc->type3_fonts);
1595 
1596 	pdf_drop_ocg(ctx, doc);
1597 
1598 	pdf_empty_store(ctx, doc);
1599 
1600 	pdf_lexbuf_fin(ctx, &doc->lexbuf.base);
1601 
1602 	pdf_drop_resource_tables(ctx, doc);
1603 
1604 	fz_drop_colorspace(ctx, doc->oi);
1605 
1606 	for (i = 0; i < doc->orphans_count; i++)
1607 		pdf_drop_obj(ctx, doc->orphans[i]);
1608 
1609 	fz_free(ctx, doc->orphans);
1610 
1611 	fz_free(ctx, doc->rev_page_map);
1612 
1613 	fz_defer_reap_end(ctx);
1614 
1615 	pdf_invalidate_xfa(ctx, doc);
1616 }
1617 
1618 void
pdf_drop_document(fz_context * ctx,pdf_document * doc)1619 pdf_drop_document(fz_context *ctx, pdf_document *doc)
1620 {
1621 	fz_drop_document(ctx, &doc->super);
1622 }
1623 
1624 pdf_document *
pdf_keep_document(fz_context * ctx,pdf_document * doc)1625 pdf_keep_document(fz_context *ctx, pdf_document *doc)
1626 {
1627 	return (pdf_document *)fz_keep_document(ctx, &doc->super);
1628 }
1629 
1630 /*
1631  * compressed object streams
1632  */
1633 
1634 static pdf_xref_entry *
pdf_load_obj_stm(fz_context * ctx,pdf_document * doc,int num,pdf_lexbuf * buf,int target)1635 pdf_load_obj_stm(fz_context *ctx, pdf_document *doc, int num, pdf_lexbuf *buf, int target)
1636 {
1637 	fz_stream *stm = NULL;
1638 	pdf_obj *objstm = NULL;
1639 	int *numbuf = NULL;
1640 	int64_t *ofsbuf = NULL;
1641 
1642 	pdf_obj *obj;
1643 	int64_t first;
1644 	int count;
1645 	int i;
1646 	pdf_token tok;
1647 	pdf_xref_entry *ret_entry = NULL;
1648 	int xref_len;
1649 	int found;
1650 
1651 	fz_var(numbuf);
1652 	fz_var(ofsbuf);
1653 	fz_var(objstm);
1654 	fz_var(stm);
1655 
1656 	fz_try(ctx)
1657 	{
1658 		objstm = pdf_load_object(ctx, doc, num);
1659 
1660 		if (pdf_obj_marked(ctx, objstm))
1661 			fz_throw(ctx, FZ_ERROR_GENERIC, "recursive object stream lookup");
1662 	}
1663 	fz_catch(ctx)
1664 	{
1665 		pdf_drop_obj(ctx, objstm);
1666 		fz_rethrow(ctx);
1667 	}
1668 
1669 	fz_try(ctx)
1670 	{
1671 		pdf_mark_obj(ctx, objstm);
1672 
1673 		count = pdf_dict_get_int(ctx, objstm, PDF_NAME(N));
1674 		first = pdf_dict_get_int(ctx, objstm, PDF_NAME(First));
1675 
1676 		if (count < 0 || count > PDF_MAX_OBJECT_NUMBER)
1677 			fz_throw(ctx, FZ_ERROR_GENERIC, "number of objects in object stream out of range");
1678 		if (first < 0 || first > PDF_MAX_OBJECT_NUMBER
1679 				|| count < 0 || count > PDF_MAX_OBJECT_NUMBER
1680 				|| first + count - 1 > PDF_MAX_OBJECT_NUMBER)
1681 			fz_throw(ctx, FZ_ERROR_GENERIC, "object stream object numbers are out of range");
1682 
1683 		numbuf = fz_calloc(ctx, count, sizeof(*numbuf));
1684 		ofsbuf = fz_calloc(ctx, count, sizeof(*ofsbuf));
1685 
1686 		xref_len = pdf_xref_len(ctx, doc);
1687 
1688 		found = 0;
1689 
1690 		stm = pdf_open_stream_number(ctx, doc, num);
1691 		for (i = 0; i < count; i++)
1692 		{
1693 			tok = pdf_lex(ctx, stm, buf);
1694 			if (tok != PDF_TOK_INT)
1695 				fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", num);
1696 			numbuf[found] = buf->i;
1697 
1698 			tok = pdf_lex(ctx, stm, buf);
1699 			if (tok != PDF_TOK_INT)
1700 				fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", num);
1701 			ofsbuf[found] = buf->i;
1702 
1703 			if (numbuf[found] <= 0 || numbuf[found] >= xref_len)
1704 				fz_warn(ctx, "object stream object out of range, skipping");
1705 			else
1706 				found++;
1707 		}
1708 
1709 		for (i = 0; i < found; i++)
1710 		{
1711 			pdf_xref_entry *entry;
1712 
1713 			fz_seek(ctx, stm, first + ofsbuf[i], SEEK_SET);
1714 
1715 			obj = pdf_parse_stm_obj(ctx, doc, stm, buf);
1716 
1717 			entry = pdf_get_xref_entry(ctx, doc, numbuf[i]);
1718 
1719 			pdf_set_obj_parent(ctx, obj, numbuf[i]);
1720 
1721 			if (entry->type == 'o' && entry->ofs == num)
1722 			{
1723 				/* If we already have an entry for this object,
1724 				 * we'd like to drop it and use the new one -
1725 				 * but this means that anyone currently holding
1726 				 * a pointer to the old one will be left with a
1727 				 * stale pointer. Instead, we drop the new one
1728 				 * and trust that the old one is correct. */
1729 				if (entry->obj)
1730 				{
1731 					if (pdf_objcmp(ctx, entry->obj, obj))
1732 						fz_warn(ctx, "Encountered new definition for object %d - keeping the original one", numbuf[i]);
1733 					pdf_drop_obj(ctx, obj);
1734 				}
1735 				else
1736 				{
1737 					entry->obj = obj;
1738 					fz_drop_buffer(ctx, entry->stm_buf);
1739 					entry->stm_buf = NULL;
1740 				}
1741 				if (numbuf[i] == target)
1742 					ret_entry = entry;
1743 			}
1744 			else
1745 			{
1746 				pdf_drop_obj(ctx, obj);
1747 			}
1748 		}
1749 	}
1750 	fz_always(ctx)
1751 	{
1752 		fz_drop_stream(ctx, stm);
1753 		fz_free(ctx, ofsbuf);
1754 		fz_free(ctx, numbuf);
1755 		pdf_unmark_obj(ctx, objstm);
1756 		pdf_drop_obj(ctx, objstm);
1757 	}
1758 	fz_catch(ctx)
1759 	{
1760 		fz_rethrow(ctx);
1761 	}
1762 	return ret_entry;
1763 }
1764 
1765 /*
1766  * object loading
1767  */
1768 static int
pdf_obj_read(fz_context * ctx,pdf_document * doc,int64_t * offset,int * nump,pdf_obj ** page)1769 pdf_obj_read(fz_context *ctx, pdf_document *doc, int64_t *offset, int *nump, pdf_obj **page)
1770 {
1771 	pdf_lexbuf *buf = &doc->lexbuf.base;
1772 	int num, gen, tok;
1773 	int64_t numofs, genofs, stmofs, tmpofs, newtmpofs;
1774 	int xref_len;
1775 	pdf_xref_entry *entry;
1776 
1777 	numofs = *offset;
1778 	fz_seek(ctx, doc->file, numofs, SEEK_SET);
1779 
1780 	/* We expect to read 'num' here */
1781 	tok = pdf_lex(ctx, doc->file, buf);
1782 	genofs = fz_tell(ctx, doc->file);
1783 	if (tok != PDF_TOK_INT)
1784 	{
1785 		/* Failed! */
1786 		DEBUGMESS((ctx, "skipping unexpected data (tok=%d) at %d", tok, *offset));
1787 		*offset = genofs;
1788 		return tok == PDF_TOK_EOF;
1789 	}
1790 	*nump = num = buf->i;
1791 
1792 	/* We expect to read 'gen' here */
1793 	tok = pdf_lex(ctx, doc->file, buf);
1794 	tmpofs = fz_tell(ctx, doc->file);
1795 	if (tok != PDF_TOK_INT)
1796 	{
1797 		/* Failed! */
1798 		DEBUGMESS((ctx, "skipping unexpected data after \"%d\" (tok=%d) at %d", num, tok, *offset));
1799 		*offset = tmpofs;
1800 		return tok == PDF_TOK_EOF;
1801 	}
1802 	gen = buf->i;
1803 
1804 	/* We expect to read 'obj' here */
1805 	do
1806 	{
1807 		tmpofs = fz_tell(ctx, doc->file);
1808 		tok = pdf_lex(ctx, doc->file, buf);
1809 		if (tok == PDF_TOK_OBJ)
1810 			break;
1811 		if (tok != PDF_TOK_INT)
1812 		{
1813 			DEBUGMESS((ctx, "skipping unexpected data (tok=%d) at %d", tok, tmpofs));
1814 			*offset = fz_tell(ctx, doc->file);
1815 			return tok == PDF_TOK_EOF;
1816 		}
1817 		DEBUGMESS((ctx, "skipping unexpected int %d at %d", num, numofs));
1818 		*nump = num = gen;
1819 		numofs = genofs;
1820 		gen = buf->i;
1821 		genofs = tmpofs;
1822 	}
1823 	while (1);
1824 
1825 	/* Now we read the actual object */
1826 	xref_len = pdf_xref_len(ctx, doc);
1827 
1828 	/* When we are reading a progressive file, we typically see:
1829 	 *    File Header
1830 	 *    obj m (Linearization params)
1831 	 *    xref #1 (refers to objects m-n)
1832 	 *    obj m+1
1833 	 *    ...
1834 	 *    obj n
1835 	 *    obj 1
1836 	 *    ...
1837 	 *    obj n-1
1838 	 *    xref #2
1839 	 *
1840 	 * The linearisation params are read elsewhere, hence
1841 	 * whenever we read an object it should just go into the
1842 	 * previous xref.
1843 	 */
1844 	tok = pdf_repair_obj(ctx, doc, buf, &stmofs, NULL, NULL, NULL, page, &newtmpofs, NULL);
1845 
1846 	do /* So we can break out of it */
1847 	{
1848 		if (num <= 0 || num >= xref_len)
1849 		{
1850 			fz_warn(ctx, "Not a valid object number (%d %d obj)", num, gen);
1851 			break;
1852 		}
1853 		if (gen != 0)
1854 		{
1855 			fz_warn(ctx, "Unexpected non zero generation number in linearized file");
1856 		}
1857 		entry = pdf_get_populating_xref_entry(ctx, doc, num);
1858 		if (entry->type != 0)
1859 		{
1860 			DEBUGMESS((ctx, "Duplicate object found (%d %d obj)", num, gen));
1861 			break;
1862 		}
1863 		if (page && *page)
1864 		{
1865 			DEBUGMESS((ctx, "Successfully read object %d @ %d - and found page %d!", num, numofs, doc->linear_page_num));
1866 			if (!entry->obj)
1867 				entry->obj = pdf_keep_obj(ctx, *page);
1868 
1869 			if (doc->linear_page_refs[doc->linear_page_num] == NULL)
1870 				doc->linear_page_refs[doc->linear_page_num] = pdf_new_indirect(ctx, doc, num, gen);
1871 		}
1872 		else
1873 		{
1874 			DEBUGMESS((ctx, "Successfully read object %d @ %d", num, numofs));
1875 		}
1876 		entry->type = 'n';
1877 		entry->gen = gen; // XXX: was 0
1878 		entry->num = num;
1879 		entry->ofs = numofs;
1880 		entry->stm_ofs = stmofs;
1881 	}
1882 	while (0);
1883 	if (page && *page)
1884 		doc->linear_page_num++;
1885 
1886 	if (tok == PDF_TOK_ENDOBJ)
1887 	{
1888 		*offset = fz_tell(ctx, doc->file);
1889 	}
1890 	else
1891 	{
1892 		*offset = newtmpofs;
1893 	}
1894 	return 0;
1895 }
1896 
1897 static void
pdf_load_hinted_page(fz_context * ctx,pdf_document * doc,int pagenum)1898 pdf_load_hinted_page(fz_context *ctx, pdf_document *doc, int pagenum)
1899 {
1900 	pdf_obj *page = NULL;
1901 
1902 	if (!doc->hints_loaded || !doc->linear_page_refs)
1903 		return;
1904 
1905 	if (doc->linear_page_refs[pagenum])
1906 		return;
1907 
1908 	fz_var(page);
1909 
1910 	fz_try(ctx)
1911 	{
1912 		int num = doc->hint_page[pagenum].number;
1913 		page = pdf_load_object(ctx, doc, num);
1914 		if (pdf_name_eq(ctx, PDF_NAME(Page), pdf_dict_get(ctx, page, PDF_NAME(Type))))
1915 		{
1916 			/* We have found the page object! */
1917 			DEBUGMESS((ctx, "LoadHintedPage pagenum=%d num=%d", pagenum, num));
1918 			doc->linear_page_refs[pagenum] = pdf_new_indirect(ctx, doc, num, 0);
1919 		}
1920 	}
1921 	fz_always(ctx)
1922 		pdf_drop_obj(ctx, page);
1923 	fz_catch(ctx)
1924 	{
1925 		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1926 		/* Silently swallow the error and proceed as normal */
1927 	}
1928 }
1929 
1930 static int
read_hinted_object(fz_context * ctx,pdf_document * doc,int num)1931 read_hinted_object(fz_context *ctx, pdf_document *doc, int num)
1932 {
1933 	/* Try to find the object using our hint table. Find the closest
1934 	 * object <= the one we want that has a hint and read forward from
1935 	 * there. */
1936 	int expected = num;
1937 	int curr_pos;
1938 	int64_t start, offset;
1939 
1940 	while (doc->hint_obj_offsets[expected] == 0 && expected > 0)
1941 		expected--;
1942 	if (expected != num)
1943 		DEBUGMESS((ctx, "object %d is unhinted, will search forward from %d", expected, num));
1944 	if (expected == 0)	/* No hints found, just bail */
1945 		return 0;
1946 
1947 	curr_pos = fz_tell(ctx, doc->file);
1948 	offset = doc->hint_obj_offsets[expected];
1949 
1950 	fz_var(expected);
1951 
1952 	fz_try(ctx)
1953 	{
1954 		int found;
1955 
1956 		/* Try to read forward from there */
1957 		do
1958 		{
1959 			start = offset;
1960 			DEBUGMESS((ctx, "Searching for object %d @ %d", expected, offset));
1961 			pdf_obj_read(ctx, doc, &offset, &found, 0);
1962 			DEBUGMESS((ctx, "Found object %d - next will be @ %d", found, offset));
1963 			if (found <= expected)
1964 			{
1965 				/* We found the right one (or one earlier than
1966 				 * we expected). Update the hints. */
1967 				doc->hint_obj_offsets[expected] = offset;
1968 				doc->hint_obj_offsets[found] = start;
1969 				doc->hint_obj_offsets[found+1] = offset;
1970 				/* Retry with the next one */
1971 				expected = found+1;
1972 			}
1973 			else
1974 			{
1975 				/* We found one later than we expected. */
1976 				doc->hint_obj_offsets[expected] = 0;
1977 				doc->hint_obj_offsets[found] = start;
1978 				doc->hint_obj_offsets[found+1] = offset;
1979 				while (doc->hint_obj_offsets[expected] == 0 && expected > 0)
1980 					expected--;
1981 				if (expected == 0)	/* No hints found, we give up */
1982 					break;
1983 			}
1984 		}
1985 		while (found != num);
1986 	}
1987 	fz_always(ctx)
1988 	{
1989 		fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
1990 	}
1991 	fz_catch(ctx)
1992 	{
1993 		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1994 		/* FIXME: Currently we ignore the hint. Perhaps we should
1995 		 * drop back to non-hinted operation here. */
1996 		doc->hint_obj_offsets[expected] = 0;
1997 		fz_rethrow(ctx);
1998 	}
1999 	return expected != 0;
2000 }
2001 
2002 pdf_obj *
pdf_load_unencrypted_object(fz_context * ctx,pdf_document * doc,int num)2003 pdf_load_unencrypted_object(fz_context *ctx, pdf_document *doc, int num)
2004 {
2005 	pdf_xref_entry *x;
2006 
2007 	if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2008 		fz_throw(ctx, FZ_ERROR_GENERIC, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2009 
2010 	x = pdf_get_xref_entry(ctx, doc, num);
2011 	if (x->type == 'n')
2012 	{
2013 		fz_seek(ctx, doc->file, x->ofs, SEEK_SET);
2014 		return pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base, NULL, NULL, NULL, NULL);
2015 	}
2016 	return NULL;
2017 }
2018 
/*
	Make sure object 'num' is loaded into its xref entry and return
	that entry.

	Throws FZ_ERROR_GENERIC if 'num' is outside the xref table, if the
	object cannot be parsed/repaired, or if it cannot be found at all.
	Throws FZ_ERROR_TRYLATER when the data needed is not available yet
	(progressive loading).

	How the object is obtained depends on the entry type:
	  'f' - the entry's object becomes PDF_NULL.
	  'n' - the object is parsed from the file at the entry's offset,
	        optionally triggering an xref repair, then decrypted if
	        the document has a crypt handler.
	  'o' - the object is loaded from its object stream.
	  otherwise - a hinted load is attempted, if hints are available.
*/
pdf_xref_entry *
pdf_cache_object(fz_context *ctx, pdf_document *doc, int num)
{
	pdf_xref_entry *x;
	int rnum, rgen, try_repair;

	/* try_repair is written inside fz_try and read after fz_catch. */
	fz_var(try_repair);

	if (num <= 0 || num >= pdf_xref_len(ctx, doc))
		fz_throw(ctx, FZ_ERROR_GENERIC, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));

	/* Restart point: we come back here after a repair or a hinted
	 * load so that the (possibly changed) xref entry is looked up
	 * afresh. */
object_updated:
	try_repair = 0;
	rnum = num;

	x = pdf_get_xref_entry(ctx, doc, num);

	/* Already cached: nothing to do. */
	if (x->obj != NULL)
		return x;

	if (x->type == 'f')
	{
		x->obj = PDF_NULL;
	}
	else if (x->type == 'n')
	{
		fz_seek(ctx, doc->file, x->ofs, SEEK_SET);

		fz_try(ctx)
		{
			x->obj = pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base,
					&rnum, &rgen, &x->stm_ofs, &try_repair);
		}
		fz_catch(ctx)
		{
			/* Swallow the error only when the parser asked for a
			 * repair; TRYLATER is always passed on. */
			if (!try_repair || fz_caught(ctx) == FZ_ERROR_TRYLATER)
				fz_rethrow(ctx);
		}

		/* The parser found a different object number at this
		 * offset than the one requested: discard it, mark the
		 * entry as free, and attempt a repair unless one has
		 * already been attempted for this document. */
		if (!try_repair && rnum != num)
		{
			pdf_drop_obj(ctx, x->obj);
			x->type = 'f';
			x->ofs = -1;
			x->gen = 0;
			x->num = 0;
			x->stm_ofs = 0;
			x->obj = NULL;
			try_repair = (doc->repair_attempted == 0);
		}

		if (try_repair)
		{
			fz_try(ctx)
			{
				pdf_repair_xref(ctx, doc);
				pdf_prime_xref_index(ctx, doc);
				pdf_repair_obj_stms(ctx, doc);
			}
			fz_catch(ctx)
			{
				fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
				/* Report the original problem rather than
				 * the repair failure. */
				if (rnum == num)
					fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse object (%d 0 R)", num);
				else
					fz_throw(ctx, FZ_ERROR_GENERIC, "found object (%d 0 R) instead of (%d 0 R)", rnum, num);
			}
			/* Repair succeeded: look the entry up again. */
			goto object_updated;
		}

		if (doc->crypt)
			pdf_crypt_obj(ctx, doc->crypt, x->obj, x->num, x->gen);
	}
	else if (x->type == 'o')
	{
		/* x->obj is known to be NULL here (see the early return
		 * above), so this test always succeeds. */
		if (!x->obj)
		{
			/* For 'o' entries x->ofs is passed to
			 * pdf_load_obj_stm to identify the stream to load. */
			x = pdf_load_obj_stm(ctx, doc, x->ofs, &doc->lexbuf.base, num);
			if (x == NULL)
				fz_throw(ctx, FZ_ERROR_GENERIC, "cannot load object stream containing object (%d 0 R)", num);
			if (!x->obj)
				fz_throw(ctx, FZ_ERROR_GENERIC, "object (%d 0 R) was not found in its object stream", num);
		}
	}
	else if (doc->hint_obj_offsets && read_hinted_object(ctx, doc, num))
	{
		/* The hinted read succeeded; retry the lookup. */
		goto object_updated;
	}
	else if (doc->file_length && doc->linear_pos < doc->file_length)
	{
		/* The file has not been completely read yet, so the object
		 * may still turn up. */
		fz_throw(ctx, FZ_ERROR_TRYLATER, "cannot find object in xref (%d 0 R) - not loaded yet?", num);
	}
	else
	{
		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find object in xref (%d 0 R)", num);
	}

	pdf_set_obj_parent(ctx, x->obj, num);
	return x;
}
2119 
2120 pdf_obj *
pdf_load_object(fz_context * ctx,pdf_document * doc,int num)2121 pdf_load_object(fz_context *ctx, pdf_document *doc, int num)
2122 {
2123 	pdf_xref_entry *entry = pdf_cache_object(ctx, doc, num);
2124 	return pdf_keep_obj(ctx, entry->obj);
2125 }
2126 
2127 pdf_obj *
pdf_resolve_indirect(fz_context * ctx,pdf_obj * ref)2128 pdf_resolve_indirect(fz_context *ctx, pdf_obj *ref)
2129 {
2130 	if (pdf_is_indirect(ctx, ref))
2131 	{
2132 		pdf_document *doc = pdf_get_indirect_document(ctx, ref);
2133 		int num = pdf_to_num(ctx, ref);
2134 		pdf_xref_entry *entry;
2135 
2136 		if (!doc)
2137 			return NULL;
2138 		if (num <= 0)
2139 		{
2140 			fz_warn(ctx, "invalid indirect reference (%d 0 R)", num);
2141 			return NULL;
2142 		}
2143 
2144 		fz_try(ctx)
2145 			entry = pdf_cache_object(ctx, doc, num);
2146 		fz_catch(ctx)
2147 		{
2148 			fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2149 			fz_warn(ctx, "cannot load object (%d 0 R) into cache", num);
2150 			return NULL;
2151 		}
2152 
2153 		ref = entry->obj;
2154 	}
2155 	return ref;
2156 }
2157 
2158 pdf_obj *
pdf_resolve_indirect_chain(fz_context * ctx,pdf_obj * ref)2159 pdf_resolve_indirect_chain(fz_context *ctx, pdf_obj *ref)
2160 {
2161 	int sanity = 10;
2162 
2163 	while (pdf_is_indirect(ctx, ref))
2164 	{
2165 		if (--sanity == 0)
2166 		{
2167 			fz_warn(ctx, "too many indirections (possible indirection cycle involving %d 0 R)", pdf_to_num(ctx, ref));
2168 			return NULL;
2169 		}
2170 
2171 		ref = pdf_resolve_indirect(ctx, ref);
2172 	}
2173 
2174 	return ref;
2175 }
2176 
2177 int
pdf_count_objects(fz_context * ctx,pdf_document * doc)2178 pdf_count_objects(fz_context *ctx, pdf_document *doc)
2179 {
2180 	return pdf_xref_len(ctx, doc);
2181 }
2182 
2183 int
pdf_create_object(fz_context * ctx,pdf_document * doc)2184 pdf_create_object(fz_context *ctx, pdf_document *doc)
2185 {
2186 	/* TODO: reuse free object slots by properly linking free object chains in the ofs field */
2187 	pdf_xref_entry *entry;
2188 	int num = pdf_xref_len(ctx, doc);
2189 
2190 	if (num > PDF_MAX_OBJECT_NUMBER)
2191 		fz_throw(ctx, FZ_ERROR_GENERIC, "too many objects stored in pdf");
2192 
2193 	entry = pdf_get_incremental_xref_entry(ctx, doc, num);
2194 	entry->type = 'f';
2195 	entry->ofs = -1;
2196 	entry->gen = 0;
2197 	entry->num = num;
2198 	entry->stm_ofs = 0;
2199 	entry->stm_buf = NULL;
2200 	entry->obj = NULL;
2201 	return num;
2202 }
2203 
2204 void
pdf_delete_object(fz_context * ctx,pdf_document * doc,int num)2205 pdf_delete_object(fz_context *ctx, pdf_document *doc, int num)
2206 {
2207 	pdf_xref_entry *x;
2208 
2209 	if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2210 	{
2211 		fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2212 		return;
2213 	}
2214 
2215 	x = pdf_get_incremental_xref_entry(ctx, doc, num);
2216 
2217 	fz_drop_buffer(ctx, x->stm_buf);
2218 	pdf_drop_obj(ctx, x->obj);
2219 
2220 	x->type = 'f';
2221 	x->ofs = 0;
2222 	x->gen += 1;
2223 	x->num = 0;
2224 	x->stm_ofs = 0;
2225 	x->stm_buf = NULL;
2226 	x->obj = NULL;
2227 }
2228 
2229 void
pdf_update_object(fz_context * ctx,pdf_document * doc,int num,pdf_obj * newobj)2230 pdf_update_object(fz_context *ctx, pdf_document *doc, int num, pdf_obj *newobj)
2231 {
2232 	pdf_xref_entry *x;
2233 
2234 	if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2235 	{
2236 		fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2237 		return;
2238 	}
2239 
2240 	if (!newobj)
2241 	{
2242 		pdf_delete_object(ctx, doc, num);
2243 		return;
2244 	}
2245 
2246 	x = pdf_get_incremental_xref_entry(ctx, doc, num);
2247 
2248 	pdf_drop_obj(ctx, x->obj);
2249 
2250 	x->type = 'n';
2251 	x->ofs = 0;
2252 	x->obj = pdf_keep_obj(ctx, newobj);
2253 
2254 	pdf_set_obj_parent(ctx, newobj, num);
2255 }
2256 
2257 void
pdf_update_stream(fz_context * ctx,pdf_document * doc,pdf_obj * obj,fz_buffer * newbuf,int compressed)2258 pdf_update_stream(fz_context *ctx, pdf_document *doc, pdf_obj *obj, fz_buffer *newbuf, int compressed)
2259 {
2260 	int num;
2261 	pdf_xref_entry *x;
2262 
2263 	if (pdf_is_indirect(ctx, obj))
2264 		num = pdf_to_num(ctx, obj);
2265 	else
2266 		num = pdf_obj_parent_num(ctx, obj);
2267 	if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2268 	{
2269 		fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2270 		return;
2271 	}
2272 
2273 	x = pdf_get_xref_entry(ctx, doc, num);
2274 
2275 	fz_drop_buffer(ctx, x->stm_buf);
2276 	x->stm_buf = fz_keep_buffer(ctx, newbuf);
2277 
2278 	pdf_dict_put_int(ctx, obj, PDF_NAME(Length), (int)fz_buffer_storage(ctx, newbuf, NULL));
2279 	if (!compressed)
2280 	{
2281 		pdf_dict_del(ctx, obj, PDF_NAME(Filter));
2282 		pdf_dict_del(ctx, obj, PDF_NAME(DecodeParms));
2283 	}
2284 }
2285 
2286 int
pdf_lookup_metadata(fz_context * ctx,pdf_document * doc,const char * key,char * buf,int size)2287 pdf_lookup_metadata(fz_context *ctx, pdf_document *doc, const char *key, char *buf, int size)
2288 {
2289 	if (!strcmp(key, FZ_META_FORMAT))
2290 	{
2291 		int version = pdf_version(ctx, doc);
2292 		return 1 + (int)fz_snprintf(buf, size, "PDF %d.%d", version/10, version % 10);
2293 	}
2294 
2295 	if (!strcmp(key, FZ_META_ENCRYPTION))
2296 	{
2297 		if (doc->crypt)
2298 			return 1 + (int)fz_snprintf(buf, size, "Standard V%d R%d %d-bit %s",
2299 					pdf_crypt_version(ctx, doc->crypt),
2300 					pdf_crypt_revision(ctx, doc->crypt),
2301 					pdf_crypt_length(ctx, doc->crypt),
2302 					pdf_crypt_method(ctx, doc->crypt));
2303 		else
2304 			return 1 + (int)fz_strlcpy(buf, "None", size);
2305 	}
2306 
2307 	if (strstr(key, "info:") == key)
2308 	{
2309 		pdf_obj *info;
2310 		const char *s;
2311 		int n;
2312 
2313 		info = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info));
2314 		if (!info)
2315 			return -1;
2316 
2317 		info = pdf_dict_gets(ctx, info, key + 5);
2318 		if (!info)
2319 			return -1;
2320 
2321 		s = pdf_to_text_string(ctx, info);
2322 		n = 1 + (int)fz_strlcpy(buf, s, size);
2323 		return n;
2324 	}
2325 
2326 	return -1;
2327 }
2328 
2329 
2330 static fz_location
pdf_resolve_link_imp(fz_context * ctx,fz_document * doc_,const char * uri,float * xp,float * yp)2331 pdf_resolve_link_imp(fz_context *ctx, fz_document *doc_, const char *uri, float *xp, float *yp)
2332 {
2333 	pdf_document *doc = (pdf_document*)doc_;
2334 	return fz_make_location(0, pdf_resolve_link(ctx, doc, uri, xp, yp));
2335 }
2336 
2337 /*
2338 	Initializers for the fz_document interface.
2339 
2340 	The functions are split across two files to allow calls to a
2341 	version of the constructor that does not link in the interpreter.
2342 	The interpreter references the built-in font and cmap resources
2343 	which are quite big. Not linking those into the mutool binary
2344 	saves roughly 6MB of space.
2345 */
2346 
2347 static pdf_document *
pdf_new_document(fz_context * ctx,fz_stream * file)2348 pdf_new_document(fz_context *ctx, fz_stream *file)
2349 {
2350 	pdf_document *doc = fz_new_derived_document(ctx, pdf_document);
2351 
2352 	doc->super.drop_document = (fz_document_drop_fn*)pdf_drop_document_imp;
2353 	doc->super.get_output_intent = (fz_document_output_intent_fn*)pdf_document_output_intent;
2354 	doc->super.needs_password = (fz_document_needs_password_fn*)pdf_needs_password;
2355 	doc->super.authenticate_password = (fz_document_authenticate_password_fn*)pdf_authenticate_password;
2356 	doc->super.has_permission = (fz_document_has_permission_fn*)pdf_has_permission;
2357 	doc->super.load_outline = (fz_document_load_outline_fn*)pdf_load_outline;
2358 	doc->super.resolve_link = pdf_resolve_link_imp;
2359 	doc->super.count_pages = pdf_count_pages_imp;
2360 	doc->super.load_page = pdf_load_page_imp;
2361 	doc->super.lookup_metadata = (fz_document_lookup_metadata_fn*)pdf_lookup_metadata;
2362 
2363 	pdf_lexbuf_init(ctx, &doc->lexbuf.base, PDF_LEXBUF_LARGE);
2364 	doc->file = fz_keep_stream(ctx, file);
2365 
2366 	return doc;
2367 }
2368 
2369 pdf_document *
pdf_open_document_with_stream(fz_context * ctx,fz_stream * file)2370 pdf_open_document_with_stream(fz_context *ctx, fz_stream *file)
2371 {
2372 	pdf_document *doc = pdf_new_document(ctx, file);
2373 	fz_try(ctx)
2374 	{
2375 		pdf_init_document(ctx, doc);
2376 	}
2377 	fz_catch(ctx)
2378 	{
2379 		/* fz_drop_document may clobber our error code/message so we have to stash them temporarily. */
2380 		char message[256];
2381 		int caught = fz_caught(ctx);
2382 		fz_strlcpy(message, fz_caught_message(ctx), sizeof message);
2383 		fz_drop_document(ctx, &doc->super);
2384 		fz_throw(ctx, caught, "%s", message);
2385 	}
2386 	return doc;
2387 }
2388 
2389 pdf_document *
pdf_open_document(fz_context * ctx,const char * filename)2390 pdf_open_document(fz_context *ctx, const char *filename)
2391 {
2392 	fz_stream *file = NULL;
2393 	pdf_document *doc = NULL;
2394 
2395 	fz_var(file);
2396 	fz_var(doc);
2397 
2398 	fz_try(ctx)
2399 	{
2400 		file = fz_open_file(ctx, filename);
2401 		doc = pdf_new_document(ctx, file);
2402 		pdf_init_document(ctx, doc);
2403 	}
2404 	fz_always(ctx)
2405 	{
2406 		fz_drop_stream(ctx, file);
2407 	}
2408 	fz_catch(ctx)
2409 	{
2410 		fz_drop_document(ctx, &doc->super);
2411 		fz_rethrow(ctx);
2412 	}
2413 	return doc;
2414 }
2415 
2416 static void
pdf_load_hints(fz_context * ctx,pdf_document * doc,int objnum)2417 pdf_load_hints(fz_context *ctx, pdf_document *doc, int objnum)
2418 {
2419 	fz_stream *stream = NULL;
2420 	pdf_obj *dict;
2421 
2422 	fz_var(stream);
2423 	fz_var(dict);
2424 
2425 	fz_try(ctx)
2426 	{
2427 		int i, j, least_num_page_objs, page_obj_num_bits;
2428 		int least_page_len, page_len_num_bits, shared_hint_offset;
2429 		/* int least_page_offset, page_offset_num_bits; */
2430 		/* int least_content_stream_len, content_stream_len_num_bits; */
2431 		int num_shared_obj_num_bits, shared_obj_num_bits;
2432 		/* int numerator_bits, denominator_bits; */
2433 		int shared;
2434 		int shared_obj_num, shared_obj_offset, shared_obj_count_page1;
2435 		int shared_obj_count_total;
2436 		int least_shared_group_len, shared_group_len_num_bits;
2437 		int max_object_num = pdf_xref_len(ctx, doc);
2438 
2439 		stream = pdf_open_stream_number(ctx, doc, objnum);
2440 		dict = pdf_get_xref_entry(ctx, doc, objnum)->obj;
2441 		if (dict == NULL || !pdf_is_dict(ctx, dict))
2442 			fz_throw(ctx, FZ_ERROR_GENERIC, "malformed hint object");
2443 
2444 		shared_hint_offset = pdf_dict_get_int(ctx, dict, PDF_NAME(S));
2445 
2446 		/* Malloc the structures (use realloc to cope with the fact we
2447 		 * may try this several times before enough data is loaded) */
2448 		doc->hint_page = fz_realloc_array(ctx, doc->hint_page, doc->linear_page_count+1, pdf_hint_page);
2449 		memset(doc->hint_page, 0, sizeof(*doc->hint_page) * (doc->linear_page_count+1));
2450 		doc->hint_obj_offsets = fz_realloc_array(ctx, doc->hint_obj_offsets, max_object_num, int64_t);
2451 		memset(doc->hint_obj_offsets, 0, sizeof(*doc->hint_obj_offsets) * max_object_num);
2452 		doc->hint_obj_offsets_max = max_object_num;
2453 
2454 		/* Read the page object hints table: Header first */
2455 		least_num_page_objs = fz_read_bits(ctx, stream, 32);
2456 		/* The following is sometimes a lie, but we read this version,
2457 		 * as other table values are built from it. In
2458 		 * pdf_reference17.pdf, this points to 2 objects before the
2459 		 * first pages page object. */
2460 		doc->hint_page[0].offset = fz_read_bits(ctx, stream, 32);
2461 		if (doc->hint_page[0].offset > doc->hint_object_offset)
2462 			doc->hint_page[0].offset += doc->hint_object_length;
2463 		page_obj_num_bits = fz_read_bits(ctx, stream, 16);
2464 		least_page_len = fz_read_bits(ctx, stream, 32);
2465 		page_len_num_bits = fz_read_bits(ctx, stream, 16);
2466 		/* least_page_offset = */ (void) fz_read_bits(ctx, stream, 32);
2467 		/* page_offset_num_bits = */ (void) fz_read_bits(ctx, stream, 16);
2468 		/* least_content_stream_len = */ (void) fz_read_bits(ctx, stream, 32);
2469 		/* content_stream_len_num_bits = */ (void) fz_read_bits(ctx, stream, 16);
2470 		num_shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
2471 		shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
2472 		/* numerator_bits = */ (void) fz_read_bits(ctx, stream, 16);
2473 		/* denominator_bits = */ (void) fz_read_bits(ctx, stream, 16);
2474 
2475 		/* Item 1: Page object numbers */
2476 		doc->hint_page[0].number = doc->linear_page1_obj_num;
2477 		/* We don't care about the number of objects in the first page */
2478 		(void)fz_read_bits(ctx, stream, page_obj_num_bits);
2479 		j = 1;
2480 		for (i = 1; i < doc->linear_page_count; i++)
2481 		{
2482 			int delta_page_objs = fz_read_bits(ctx, stream, page_obj_num_bits);
2483 
2484 			doc->hint_page[i].number = j;
2485 			j += least_num_page_objs + delta_page_objs;
2486 		}
2487 		doc->hint_page[i].number = j; /* Not a real page object */
2488 		fz_sync_bits(ctx, stream);
2489 		/* Item 2: Page lengths */
2490 		j = doc->hint_page[0].offset;
2491 		for (i = 0; i < doc->linear_page_count; i++)
2492 		{
2493 			int delta_page_len = fz_read_bits(ctx, stream, page_len_num_bits);
2494 			int old = j;
2495 
2496 			doc->hint_page[i].offset = j;
2497 			j += least_page_len + delta_page_len;
2498 			if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
2499 				j += doc->hint_object_length;
2500 		}
2501 		doc->hint_page[i].offset = j;
2502 		fz_sync_bits(ctx, stream);
2503 		/* Item 3: Shared references */
2504 		shared = 0;
2505 		for (i = 0; i < doc->linear_page_count; i++)
2506 		{
2507 			int num_shared_objs = fz_read_bits(ctx, stream, num_shared_obj_num_bits);
2508 			doc->hint_page[i].index = shared;
2509 			shared += num_shared_objs;
2510 		}
2511 		doc->hint_page[i].index = shared;
2512 		doc->hint_shared_ref = fz_realloc_array(ctx, doc->hint_shared_ref, shared, int);
2513 		memset(doc->hint_shared_ref, 0, sizeof(*doc->hint_shared_ref) * shared);
2514 		fz_sync_bits(ctx, stream);
2515 		/* Item 4: Shared references */
2516 		for (i = 0; i < shared; i++)
2517 		{
2518 			int ref = fz_read_bits(ctx, stream, shared_obj_num_bits);
2519 			doc->hint_shared_ref[i] = ref;
2520 		}
2521 		/* Skip items 5,6,7 as we don't use them */
2522 
2523 		fz_seek(ctx, stream, shared_hint_offset, SEEK_SET);
2524 
2525 		/* Read the shared object hints table: Header first */
2526 		shared_obj_num = fz_read_bits(ctx, stream, 32);
2527 		shared_obj_offset = fz_read_bits(ctx, stream, 32);
2528 		if (shared_obj_offset > doc->hint_object_offset)
2529 			shared_obj_offset += doc->hint_object_length;
2530 		shared_obj_count_page1 = fz_read_bits(ctx, stream, 32);
2531 		shared_obj_count_total = fz_read_bits(ctx, stream, 32);
2532 		shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
2533 		least_shared_group_len = fz_read_bits(ctx, stream, 32);
2534 		shared_group_len_num_bits = fz_read_bits(ctx, stream, 16);
2535 
2536 		/* Sanity check the references in Item 4 above to ensure we
2537 		 * don't access out of range with malicious files. */
2538 		for (i = 0; i < shared; i++)
2539 		{
2540 			if (doc->hint_shared_ref[i] >= shared_obj_count_total)
2541 			{
2542 				fz_throw(ctx, FZ_ERROR_GENERIC, "malformed hint stream (shared refs)");
2543 			}
2544 		}
2545 
2546 		doc->hint_shared = fz_realloc_array(ctx, doc->hint_shared, shared_obj_count_total+1, pdf_hint_shared);
2547 		memset(doc->hint_shared, 0, sizeof(*doc->hint_shared) * (shared_obj_count_total+1));
2548 
2549 		/* Item 1: Shared references */
2550 		j = doc->hint_page[0].offset;
2551 		for (i = 0; i < shared_obj_count_page1; i++)
2552 		{
2553 			int off = fz_read_bits(ctx, stream, shared_group_len_num_bits);
2554 			int old = j;
2555 			doc->hint_shared[i].offset = j;
2556 			j += off + least_shared_group_len;
2557 			if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
2558 				j += doc->hint_object_length;
2559 		}
2560 		/* FIXME: We would have problems recreating the length of the
2561 		 * last page 1 shared reference group. But we'll never need
2562 		 * to, so ignore it. */
2563 		j = shared_obj_offset;
2564 		for (; i < shared_obj_count_total; i++)
2565 		{
2566 			int off = fz_read_bits(ctx, stream, shared_group_len_num_bits);
2567 			int old = j;
2568 			doc->hint_shared[i].offset = j;
2569 			j += off + least_shared_group_len;
2570 			if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
2571 				j += doc->hint_object_length;
2572 		}
2573 		doc->hint_shared[i].offset = j;
2574 		fz_sync_bits(ctx, stream);
2575 		/* Item 2: Signature flags: read these just so we can skip */
2576 		for (i = 0; i < shared_obj_count_total; i++)
2577 		{
2578 			doc->hint_shared[i].number = fz_read_bits(ctx, stream, 1);
2579 		}
2580 		fz_sync_bits(ctx, stream);
2581 		/* Item 3: Signatures: just skip */
2582 		for (i = 0; i < shared_obj_count_total; i++)
2583 		{
2584 			if (doc->hint_shared[i].number)
2585 			{
2586 				(void) fz_read_bits(ctx, stream, 128);
2587 			}
2588 		}
2589 		fz_sync_bits(ctx, stream);
2590 		/* Item 4: Shared object object numbers */
2591 		j = doc->linear_page1_obj_num; /* FIXME: This is a lie! */
2592 		for (i = 0; i < shared_obj_count_page1; i++)
2593 		{
2594 			doc->hint_shared[i].number = j;
2595 			j += fz_read_bits(ctx, stream, shared_obj_num_bits) + 1;
2596 		}
2597 		j = shared_obj_num;
2598 		for (; i < shared_obj_count_total; i++)
2599 		{
2600 			doc->hint_shared[i].number = j;
2601 			j += fz_read_bits(ctx, stream, shared_obj_num_bits) + 1;
2602 		}
2603 		doc->hint_shared[i].number = j;
2604 
2605 		/* Now, actually use the data we have gathered. */
2606 		for (i = 0 /*shared_obj_count_page1*/; i < shared_obj_count_total; i++)
2607 		{
2608 			doc->hint_obj_offsets[doc->hint_shared[i].number] = doc->hint_shared[i].offset;
2609 		}
2610 		for (i = 0; i < doc->linear_page_count; i++)
2611 		{
2612 			doc->hint_obj_offsets[doc->hint_page[i].number] = doc->hint_page[i].offset;
2613 		}
2614 	}
2615 	fz_always(ctx)
2616 	{
2617 		fz_drop_stream(ctx, stream);
2618 	}
2619 	fz_catch(ctx)
2620 	{
2621 		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2622 		/* Don't try to load hints again */
2623 		doc->hints_loaded = 1;
2624 		/* We won't use the linearized object anymore. */
2625 		doc->file_reading_linearly = 0;
2626 		/* Any other error becomes a TRYLATER */
2627 		fz_throw(ctx, FZ_ERROR_TRYLATER, "malformed hints object");
2628 	}
2629 	doc->hints_loaded = 1;
2630 }
2631 
2632 static void
pdf_load_hint_object(fz_context * ctx,pdf_document * doc)2633 pdf_load_hint_object(fz_context *ctx, pdf_document *doc)
2634 {
2635 	pdf_lexbuf *buf = &doc->lexbuf.base;
2636 	int64_t curr_pos;
2637 
2638 	curr_pos = fz_tell(ctx, doc->file);
2639 	fz_seek(ctx, doc->file, doc->hint_object_offset, SEEK_SET);
2640 	fz_try(ctx)
2641 	{
2642 		while (1)
2643 		{
2644 			pdf_obj *page = NULL;
2645 			int64_t tmpofs;
2646 			int num, tok;
2647 
2648 			tok = pdf_lex(ctx, doc->file, buf);
2649 			if (tok != PDF_TOK_INT)
2650 				break;
2651 			num = buf->i;
2652 			tok = pdf_lex(ctx, doc->file, buf);
2653 			if (tok != PDF_TOK_INT)
2654 				break;
2655 			/* Ignore gen = buf->i */
2656 			tok = pdf_lex(ctx, doc->file, buf);
2657 			if (tok != PDF_TOK_OBJ)
2658 				break;
2659 			(void)pdf_repair_obj(ctx, doc, buf, &tmpofs, NULL, NULL, NULL, &page, &tmpofs, NULL);
2660 			pdf_load_hints(ctx, doc, num);
2661 		}
2662 	}
2663 	fz_always(ctx)
2664 	{
2665 		fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
2666 	}
2667 	fz_catch(ctx)
2668 	{
2669 		fz_rethrow(ctx);
2670 	}
2671 }
2672 
pdf_progressive_advance(fz_context * ctx,pdf_document * doc,int pagenum)2673 pdf_obj *pdf_progressive_advance(fz_context *ctx, pdf_document *doc, int pagenum)
2674 {
2675 	pdf_lexbuf *buf = &doc->lexbuf.base;
2676 	int curr_pos;
2677 	pdf_obj *page = NULL;
2678 
2679 	pdf_load_hinted_page(ctx, doc, pagenum);
2680 
2681 	if (pagenum < 0 || pagenum >= doc->linear_page_count)
2682 		fz_throw(ctx, FZ_ERROR_GENERIC, "page load out of range (%d of %d)", pagenum, doc->linear_page_count);
2683 
2684 	if (doc->linear_pos == doc->file_length)
2685 		return doc->linear_page_refs[pagenum];
2686 
2687 	/* Only load hints once, and then only after we have got page 0 */
2688 	if (pagenum > 0 && !doc->hints_loaded && doc->hint_object_offset > 0 && doc->linear_pos >= doc->hint_object_offset)
2689 	{
2690 		/* Found hint object */
2691 		pdf_load_hint_object(ctx, doc);
2692 	}
2693 
2694 	DEBUGMESS((ctx, "continuing to try to advance from %d", doc->linear_pos));
2695 	curr_pos = fz_tell(ctx, doc->file);
2696 
2697 	fz_var(page);
2698 
2699 	fz_try(ctx)
2700 	{
2701 		int eof;
2702 		do
2703 		{
2704 			int num;
2705 			eof = pdf_obj_read(ctx, doc, &doc->linear_pos, &num, &page);
2706 			pdf_drop_obj(ctx, page);
2707 			page = NULL;
2708 		}
2709 		while (!eof);
2710 
2711 		{
2712 			pdf_obj *catalog;
2713 			pdf_obj *pages;
2714 			doc->linear_pos = doc->file_length;
2715 			pdf_load_xref(ctx, doc, buf);
2716 			catalog = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root));
2717 			pages = pdf_dict_get(ctx, catalog, PDF_NAME(Pages));
2718 
2719 			if (!pdf_is_dict(ctx, pages))
2720 				fz_throw(ctx, FZ_ERROR_GENERIC, "missing page tree");
2721 			break;
2722 		}
2723 	}
2724 	fz_always(ctx)
2725 	{
2726 		fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
2727 	}
2728 	fz_catch(ctx)
2729 	{
2730 		pdf_drop_obj(ctx, page);
2731 		if (fz_caught(ctx) == FZ_ERROR_TRYLATER)
2732 		{
2733 			if (doc->linear_page_refs[pagenum] == NULL)
2734 			{
2735 				/* Still not got a page */
2736 				fz_rethrow(ctx);
2737 			}
2738 		}
2739 		else
2740 			fz_rethrow(ctx);
2741 	}
2742 
2743 	return doc->linear_page_refs[pagenum];
2744 }
2745 
pdf_document_from_fz_document(fz_context * ctx,fz_document * ptr)2746 pdf_document *pdf_document_from_fz_document(fz_context *ctx, fz_document *ptr)
2747 {
2748 	return (pdf_document *)((ptr && ptr->count_pages == pdf_count_pages_imp) ? ptr : NULL);
2749 }
2750 
pdf_page_from_fz_page(fz_context * ctx,fz_page * ptr)2751 pdf_page *pdf_page_from_fz_page(fz_context *ctx, fz_page *ptr)
2752 {
2753 	return (pdf_page *)((ptr && ptr->bound_page == (fz_page_bound_page_fn*)pdf_bound_page) ? ptr : NULL);
2754 }
2755 
pdf_specifics(fz_context * ctx,fz_document * doc)2756 pdf_document *pdf_specifics(fz_context *ctx, fz_document *doc)
2757 {
2758 	return pdf_document_from_fz_document(ctx, doc);
2759 }
2760 
2761 pdf_obj *
pdf_add_object(fz_context * ctx,pdf_document * doc,pdf_obj * obj)2762 pdf_add_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
2763 {
2764 	pdf_document *orig_doc;
2765 	int num;
2766 
2767 	orig_doc = pdf_get_bound_document(ctx, obj);
2768 	if (orig_doc && orig_doc != doc)
2769 		fz_throw(ctx, FZ_ERROR_GENERIC, "tried to add an object belonging to a different document");
2770 	if (pdf_is_indirect(ctx, obj))
2771 		return pdf_keep_obj(ctx, obj);
2772 	num = pdf_create_object(ctx, doc);
2773 	pdf_update_object(ctx, doc, num, obj);
2774 	return pdf_new_indirect(ctx, doc, num, 0);
2775 }
2776 
2777 pdf_obj *
pdf_add_object_drop(fz_context * ctx,pdf_document * doc,pdf_obj * obj)2778 pdf_add_object_drop(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
2779 {
2780 	pdf_obj *ind = NULL;
2781 	fz_try(ctx)
2782 		ind = pdf_add_object(ctx, doc, obj);
2783 	fz_always(ctx)
2784 		pdf_drop_obj(ctx, obj);
2785 	fz_catch(ctx)
2786 		fz_rethrow(ctx);
2787 	return ind;
2788 }
2789 
2790 pdf_obj *
pdf_add_new_dict(fz_context * ctx,pdf_document * doc,int initial)2791 pdf_add_new_dict(fz_context *ctx, pdf_document *doc, int initial)
2792 {
2793 	return pdf_add_object_drop(ctx, doc, pdf_new_dict(ctx, doc, initial));
2794 }
2795 
2796 pdf_obj *
pdf_add_new_array(fz_context * ctx,pdf_document * doc,int initial)2797 pdf_add_new_array(fz_context *ctx, pdf_document *doc, int initial)
2798 {
2799 	return pdf_add_object_drop(ctx, doc, pdf_new_array(ctx, doc, initial));
2800 }
2801 
2802 pdf_obj *
pdf_add_stream(fz_context * ctx,pdf_document * doc,fz_buffer * buf,pdf_obj * obj,int compressed)2803 pdf_add_stream(fz_context *ctx, pdf_document *doc, fz_buffer *buf, pdf_obj *obj, int compressed)
2804 {
2805 	pdf_obj *ind;
2806 	if (!obj)
2807 		ind = pdf_add_new_dict(ctx, doc, 4);
2808 	else
2809 		ind = pdf_add_object(ctx, doc, obj);
2810 	fz_try(ctx)
2811 		pdf_update_stream(ctx, doc, ind, buf, compressed);
2812 	fz_catch(ctx)
2813 	{
2814 		pdf_drop_obj(ctx, ind);
2815 		fz_rethrow(ctx);
2816 	}
2817 	return ind;
2818 }
2819 
pdf_create_document(fz_context * ctx)2820 pdf_document *pdf_create_document(fz_context *ctx)
2821 {
2822 	pdf_document *doc;
2823 	pdf_obj *root;
2824 	pdf_obj *pages;
2825 	pdf_obj *trailer = NULL;
2826 
2827 	fz_var(trailer);
2828 
2829 	doc = pdf_new_document(ctx, NULL);
2830 	fz_try(ctx)
2831 	{
2832 		doc->version = 17;
2833 		doc->file_size = 0;
2834 		doc->startxref = 0;
2835 		doc->num_xref_sections = 0;
2836 		doc->num_incremental_sections = 0;
2837 		doc->xref_base = 0;
2838 		doc->disallow_new_increments = 0;
2839 		pdf_get_populating_xref_entry(ctx, doc, 0);
2840 
2841 		trailer = pdf_new_dict(ctx, doc, 2);
2842 		pdf_dict_put_int(ctx, trailer, PDF_NAME(Size), 3);
2843 		pdf_dict_put_drop(ctx, trailer, PDF_NAME(Root), root = pdf_add_new_dict(ctx, doc, 2));
2844 		pdf_dict_put(ctx, root, PDF_NAME(Type), PDF_NAME(Catalog));
2845 		pdf_dict_put_drop(ctx, root, PDF_NAME(Pages), pages = pdf_add_new_dict(ctx, doc, 3));
2846 		pdf_dict_put(ctx, pages, PDF_NAME(Type), PDF_NAME(Pages));
2847 		pdf_dict_put_int(ctx, pages, PDF_NAME(Count), 0);
2848 		pdf_dict_put_array(ctx, pages, PDF_NAME(Kids), 1);
2849 
2850 		/* Set the trailer of the final xref section. */
2851 		doc->xref_sections[0].trailer = trailer;
2852 	}
2853 	fz_catch(ctx)
2854 	{
2855 		pdf_drop_obj(ctx, trailer);
2856 		fz_drop_document(ctx, &doc->super);
2857 		fz_rethrow(ctx);
2858 	}
2859 	return doc;
2860 }
2861 
/* NULL-terminated list of file extensions, referenced by
 * pdf_document_handler. */
static const char *pdf_extensions[] =
{
	"pdf",
	"pclm",
	"ai",
	NULL
};
2869 
/* NULL-terminated list of MIME types, referenced by
 * pdf_document_handler. */
static const char *pdf_mimetypes[] =
{
	"application/pdf",
	"application/PCLm",
	NULL
};
2876 
/*
	Document handler for PDF files.

	NOTE(review): the initialiser is positional, so it must match the
	member order of fz_document_handler, which is declared outside
	this file; confirm the meaning of the NULL members there.
*/
fz_document_handler pdf_document_handler =
{
	NULL,
	(fz_document_open_fn*)pdf_open_document, /* open by file name */
	(fz_document_open_with_stream_fn*)pdf_open_document_with_stream, /* open from a stream */
	pdf_extensions,
	pdf_mimetypes,
	NULL,
	NULL
};
2887 
/*
	Set the 'marked' flag on every xref entry, in every xref section,
	that currently has an object loaded.
*/
void pdf_mark_xref(fz_context *ctx, pdf_document *doc)
{
	int s;

	for (s = 0; s < doc->num_xref_sections; s++)
	{
		pdf_xref_subsec *sub = doc->xref_sections[s].subsec;

		while (sub != NULL)
		{
			int k;

			for (k = 0; k < sub->len; k++)
			{
				if (sub->table[k].obj)
					sub->table[k].marked = 1;
			}
			sub = sub->next;
		}
	}
}
2910 
/*
	Drop every loaded object that is referenced only by its xref
	entry (reference count of exactly 1).

	Entries that carry an updated stream buffer are left alone.
*/
void pdf_clear_xref(fz_context *ctx, pdf_document *doc)
{
	int s;

	for (s = 0; s < doc->num_xref_sections; s++)
	{
		pdf_xref_subsec *sub = doc->xref_sections[s].subsec;

		while (sub != NULL)
		{
			int k;

			for (k = 0; k < sub->len; k++)
			{
				pdf_xref_entry *entry = &sub->table[k];

				/* We cannot drop objects if the stream
				 * buffer has been updated */
				if (entry->obj == NULL || entry->stm_buf != NULL)
					continue;

				/* Somebody else still holds a reference. */
				if (pdf_obj_refs(ctx, entry->obj) != 1)
					continue;

				pdf_drop_obj(ctx, entry->obj);
				entry->obj = NULL;
			}
			sub = sub->next;
		}
	}
}
2939 
/*
	Drop every loaded object that is not marked and is referenced
	only by its xref entry (reference count of exactly 1).

	Entries that carry an updated stream buffer are left alone.
*/
void pdf_clear_xref_to_mark(fz_context *ctx, pdf_document *doc)
{
	int s;

	for (s = 0; s < doc->num_xref_sections; s++)
	{
		pdf_xref_subsec *sub = doc->xref_sections[s].subsec;

		while (sub != NULL)
		{
			int k;

			for (k = 0; k < sub->len; k++)
			{
				pdf_xref_entry *entry = &sub->table[k];

				/* We cannot drop objects if the stream buffer has
				 * been updated */
				if (entry->obj == NULL || entry->stm_buf != NULL)
					continue;

				/* Marked entries are kept. */
				if (entry->marked)
					continue;

				/* Somebody else still holds a reference. */
				if (pdf_obj_refs(ctx, entry->obj) != 1)
					continue;

				pdf_drop_obj(ctx, entry->obj);
				entry->obj = NULL;
			}
			sub = sub->next;
		}
	}
}
2969 
2970 int
pdf_count_versions(fz_context * ctx,pdf_document * doc)2971 pdf_count_versions(fz_context *ctx, pdf_document *doc)
2972 {
2973 	return doc->num_xref_sections-doc->num_incremental_sections-doc->has_linearization_object;
2974 }
2975 
2976 int
pdf_count_unsaved_versions(fz_context * ctx,pdf_document * doc)2977 pdf_count_unsaved_versions(fz_context *ctx, pdf_document *doc)
2978 {
2979 	return doc->num_incremental_sections;
2980 }
2981 
2982 int
pdf_doc_was_linearized(fz_context * ctx,pdf_document * doc)2983 pdf_doc_was_linearized(fz_context *ctx, pdf_document *doc)
2984 {
2985 	return doc->has_linearization_object;
2986 }
2987 
/*
	Return 1 if object number 'i' has an entry with a non-zero type in
	any xref section from the starting section onwards, 0 otherwise.

	The starting section is taken from doc->xref_index[i] (0 when 'i'
	is beyond doc->max_xref_len), but never lies before
	doc->xref_base. Throws if 'i' is negative.
*/
static int pdf_obj_exists(fz_context *ctx, pdf_document *doc, int i)
{
	int sec;

	if (i < 0)
		fz_throw(ctx, FZ_ERROR_GENERIC, "Negative object number requested");

	sec = (i <= doc->max_xref_len) ? doc->xref_index[i] : 0;

	/* We may be accessing an earlier version of the document using xref_base
	 * and the index may point into a later xref section */
	if (sec < doc->xref_base)
		sec = doc->xref_base;

	/* Find the first xref section where the entry is defined. */
	while (sec < doc->num_xref_sections)
	{
		pdf_xref *xref = &doc->xref_sections[sec++];
		pdf_xref_subsec *sub;

		if (i >= xref->num_objects)
			continue;

		for (sub = xref->subsec; sub != NULL; sub = sub->next)
		{
			if (i >= sub->start && i < sub->start + sub->len &&
				sub->table[i - sub->start].type)
				return 1;
		}
	}

	return 0;
}
3026 
/* Bit flags accumulated per object number in pdf_changes.obj_changes[]. */
enum {
	/* Set by validate_locked_fields for each object that
	 * pdf_obj_changed_in_version reports for the version under test. */
	FIELD_CHANGED = 1,
	/* The change to this object has been recognised as acceptable. */
	FIELD_CHANGE_VALID = 2,
	/* A change to this object has been recognised as not acceptable. */
	FIELD_CHANGE_INVALID = 4
};
3032 
/* Table of FIELD_* flags, indexed by object number. */
typedef struct
{
	/* Number of entries in obj_changes. */
	int num_obj;
	/* Declared with one element; validate_locked_fields allocates the
	 * struct with room for num_obj entries in total. */
	int obj_changes[1];
} pdf_changes;
3038 
/*
 * Recursively compare nobj (as seen with the current doc->xref_base)
 * against oobj (resolved with doc->xref_base+1).
 *
 * Returns non-zero if any difference was found, 0 if the two are
 * equivalent. Where a difference is detected, FIELD_CHANGE_INVALID is
 * ORed into changes->obj_changes[] at index pdf_to_num(ctx, nobj).
 *
 * Indirect objects are marked while they are being compared so that
 * reference cycles terminate; the marks are removed before returning
 * or rethrowing.
 */
static int
check_unchanged_between(fz_context *ctx, pdf_document *doc, pdf_changes *changes, pdf_obj *nobj, pdf_obj *oobj)
{
	int marked = 0;
	int changed = 0;

	/* Trivially identical => trivially unchanged. */
	if (nobj == oobj)
		return 0;

	/* Strictly speaking we shouldn't need to call fz_var,
	 * but I suspect static analysis tools are not smart
	 * enough to figure that out. */
	fz_var(marked);

	if (pdf_is_indirect(ctx, nobj))
	{
		int o_xref_base = doc->xref_base;

		/* Both must be indirect if one is. */
		if (!pdf_is_indirect(ctx, oobj))
		{
			changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
			return 1;
		}

		/* Handle recursing back into ourselves. */
		if (pdf_obj_marked(ctx, nobj))
		{
			/* Both sides already being visited: treat as unchanged here;
			 * the outer visit does the actual comparison. */
			if (pdf_obj_marked(ctx, oobj))
				return 0;
			changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
			return 1;
		}
		else if (pdf_obj_marked(ctx, oobj))
		{
			changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
			return 1;
		}

		/* Resolve the new object with the current xref_base, and the
		 * old object with xref_base advanced by one. */
		nobj = pdf_resolve_indirect_chain(ctx, nobj);
		doc->xref_base = o_xref_base+1;
		fz_try(ctx)
		{
			oobj = pdf_resolve_indirect_chain(ctx, oobj);
			if (oobj != nobj)
			{
				/* Different objects, so lock them */
				if (!pdf_obj_marked(ctx, nobj) && !pdf_obj_marked(ctx, oobj))
				{
					pdf_mark_obj(ctx, nobj);
					pdf_mark_obj(ctx, oobj);
					marked = 1;
				}
			}
		}
		fz_always(ctx)
			doc->xref_base = o_xref_base;
		fz_catch(ctx)
			fz_rethrow(ctx);

		if (nobj == oobj)
			return 0; /* Trivially identical */
	}

	fz_var(changed);

	/* NOTE(review): from here on nobj may be the resolved object rather
	 * than the original reference; confirm pdf_to_num(ctx, nobj) still
	 * yields the intended index into obj_changes[]. */
	fz_try(ctx)
	{
		if (pdf_is_dict(ctx, nobj))
		{
			int i, n = pdf_dict_len(ctx, nobj);

			if (!pdf_is_dict(ctx, oobj) || n != pdf_dict_len(ctx, oobj))
			{
				/* Shared exit for every "these differ" case below: flag
				 * the object, then leave the fz_try block via break. */
change_found:
				changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
				changed = 1;
				break;
			}

			/* Same number of keys: compare each value of nobj against
			 * the value stored under the same key in oobj. */
			for (i = 0; i < n; i++)
			{
				pdf_obj *key = pdf_dict_get_key(ctx, nobj, i);
				pdf_obj *nval = pdf_dict_get(ctx, nobj, key);
				pdf_obj *oval = pdf_dict_get(ctx, oobj, key);

				changed |= check_unchanged_between(ctx, doc, changes, nval, oval);
			}
		}
		else if (pdf_is_array(ctx, nobj))
		{
			int i, n = pdf_array_len(ctx, nobj);

			if (!pdf_is_array(ctx, oobj) || n != pdf_array_len(ctx, oobj))
				goto change_found;

			/* Same length: compare element by element. */
			for (i = 0; i < n; i++)
			{
				pdf_obj *nval = pdf_array_get(ctx, nobj, i);
				pdf_obj *oval = pdf_array_get(ctx, oobj, i);

				changed |= check_unchanged_between(ctx, doc, changes, nval, oval);
			}
		}
		/* Neither dict nor array: a non-zero pdf_objcmp result is
		 * treated as a difference. */
		else if (pdf_objcmp(ctx, nobj, oobj))
			goto change_found;
	}
	fz_always(ctx)
	{
		/* Undo the marks taken above, on both normal and error exits. */
		if (marked)
		{
			pdf_unmark_obj(ctx, nobj);
			pdf_unmark_obj(ctx, oobj);
		}
	}
	fz_catch(ctx)
		fz_rethrow(ctx);

	return changed;
}
3160 
/* Growable array of heap-allocated strings (see char_list_append). */
typedef struct
{
	/* Number of slots allocated in list. */
	int max;
	/* Number of slots in use. */
	int len;
	/* The strings; each one is a copy made with fz_strdup. */
	char **list;
} char_list;
3167 
/* This structure is used to hold the definition of which fields
 * are locked. */
struct pdf_locked_fields
{
	/* Lowest DocMDP /P value seen by find_locked_fields_value, or 0 if
	 * none has been seen. A value of 1 makes pdf_is_field_locked report
	 * every field as locked. */
	int p;
	/* Non-zero: every field is locked except those named in excludes.
	 * Zero: only the fields named in includes are locked. */
	int all;
	/* Names of locked fields; consulted when all is zero. */
	char_list includes;
	/* Names of fields exempt from locking; consulted when all is non-zero. */
	char_list excludes;
};
3177 
3178 static void
free_char_list(fz_context * ctx,char_list * c)3179 free_char_list(fz_context *ctx, char_list *c)
3180 {
3181 	int i;
3182 
3183 	if (c == NULL)
3184 		return;
3185 
3186 	for (i = c->len-1; i >= 0; i--)
3187 		fz_free(ctx, c->list[i]);
3188 	fz_free(ctx, c->list);
3189 	c->len = 0;
3190 	c->max = 0;
3191 }
3192 
3193 void
pdf_drop_locked_fields(fz_context * ctx,pdf_locked_fields * fl)3194 pdf_drop_locked_fields(fz_context *ctx, pdf_locked_fields *fl)
3195 {
3196 	if (fl == NULL)
3197 		return;
3198 
3199 	free_char_list(ctx, &fl->includes);
3200 	free_char_list(ctx, &fl->excludes);
3201 	fz_free(ctx, fl);
3202 }
3203 
3204 static void
char_list_append(fz_context * ctx,char_list * list,const char * s)3205 char_list_append(fz_context *ctx, char_list *list, const char *s)
3206 {
3207 	if (list->len == list->max)
3208 	{
3209 		int n = list->max * 2;
3210 		if (n == 0) n = 4;
3211 
3212 		list->list = fz_realloc_array(ctx, list->list, n, char *);
3213 		list->max = n;
3214 	}
3215 	list->list[list->len] = fz_strdup(ctx, s);
3216 	list->len++;
3217 }
3218 
3219 int
pdf_is_field_locked(fz_context * ctx,pdf_locked_fields * locked,const char * name)3220 pdf_is_field_locked(fz_context *ctx, pdf_locked_fields *locked, const char *name)
3221 {
3222 	int i;
3223 
3224 	if (locked->p == 1)
3225 	{
3226 		/* Permissions were set, and say that field changes are not to be allowed. */
3227 		return 1; /* Locked */
3228 	}
3229 
3230 	if(locked->all)
3231 	{
3232 		/* The only way we might not be unlocked is if
3233 		 * we are listed in the excludes. */
3234 		for (i = 0; i < locked->excludes.len; i++)
3235 			if (!strcmp(locked->excludes.list[i], name))
3236 				return 0;
3237 		return 1;
3238 	}
3239 
3240 	/* The only way we can be locked is for us to be in the includes. */
3241 	for (i = 0; i < locked->includes.len; i++)
3242 		if (strcmp(locked->includes.list[i], name) == 0)
3243 			return 1;
3244 
3245 	/* Anything else is unlocked */
3246 	return 0;
3247 }
3248 
/* Unfortunately, in C, there is no legal way to define a function
 * type that returns itself. We therefore have to use a struct
 * wrapper. */
/* A filter is called with a dictionary and one of its keys. The
 * returned wrapper holds the filter to use for the value stored under
 * that key, or NULL if changes under that key are not accepted (see
 * filter_changes_accepted). */
typedef struct filter_wrap
{
	struct filter_wrap (*func)(fz_context *ctx, pdf_obj *dict, pdf_obj *key);
} filter_wrap;

/* Plain function-pointer type matching filter_wrap.func. */
typedef struct filter_wrap (*filter_fn)(fz_context *ctx, pdf_obj *dict, pdf_obj *key);

/* Wrap f in a filter_wrap and return it from the enclosing function.
 * Expands to a braced block, not a do/while, so it must not be used
 * as the body of an if that has an else. */
#define RETURN_FILTER(f) { filter_wrap rf; rf.func = (f); return rf; }
3260 
3261 static filter_wrap filter_simple(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3262 {
3263 	RETURN_FILTER(NULL);
3264 }
3265 
filter_transformparams(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3266 static filter_wrap filter_transformparams(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3267 {
3268 	if (pdf_name_eq(ctx, key, PDF_NAME(Type)) ||
3269 		pdf_name_eq(ctx, key, PDF_NAME(P)) ||
3270 		pdf_name_eq(ctx, key, PDF_NAME(V)) ||
3271 		pdf_name_eq(ctx, key, PDF_NAME(Document)) ||
3272 		pdf_name_eq(ctx, key, PDF_NAME(Msg)) ||
3273 		pdf_name_eq(ctx, key, PDF_NAME(V)) ||
3274 		pdf_name_eq(ctx, key, PDF_NAME(Annots)) ||
3275 		pdf_name_eq(ctx, key, PDF_NAME(Form)) ||
3276 		pdf_name_eq(ctx, key, PDF_NAME(FormEx)) ||
3277 		pdf_name_eq(ctx, key, PDF_NAME(EF)) ||
3278 		pdf_name_eq(ctx, key, PDF_NAME(P)) ||
3279 		pdf_name_eq(ctx, key, PDF_NAME(Action)) ||
3280 		pdf_name_eq(ctx, key, PDF_NAME(Fields)))
3281 		RETURN_FILTER(&filter_simple);
3282 	RETURN_FILTER(NULL);
3283 }
3284 
filter_reference(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3285 static filter_wrap filter_reference(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3286 {
3287 	if (pdf_name_eq(ctx, key, PDF_NAME(Type)) ||
3288 		pdf_name_eq(ctx, key, PDF_NAME(TransformMethod)) ||
3289 		pdf_name_eq(ctx, key, PDF_NAME(DigestMethod)) ||
3290 		pdf_name_eq(ctx, key, PDF_NAME(DigestValue)) ||
3291 		pdf_name_eq(ctx, key, PDF_NAME(DigestLocation)))
3292 		RETURN_FILTER(&filter_simple);
3293 	if (pdf_name_eq(ctx, key, PDF_NAME(TransformParams)))
3294 		RETURN_FILTER(&filter_transformparams);
3295 	RETURN_FILTER(NULL);
3296 }
3297 
filter_prop_build_sub(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3298 static filter_wrap filter_prop_build_sub(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3299 {
3300 	if (pdf_name_eq(ctx, key, PDF_NAME(Name)) ||
3301 		pdf_name_eq(ctx, key, PDF_NAME(Date)) ||
3302 		pdf_name_eq(ctx, key, PDF_NAME(R)) ||
3303 		pdf_name_eq(ctx, key, PDF_NAME(PreRelease)) ||
3304 		pdf_name_eq(ctx, key, PDF_NAME(OS)) ||
3305 		pdf_name_eq(ctx, key, PDF_NAME(NonEFontNoWarn)) ||
3306 		pdf_name_eq(ctx, key, PDF_NAME(TrustedMode)) ||
3307 		pdf_name_eq(ctx, key, PDF_NAME(V)) ||
3308 		pdf_name_eq(ctx, key, PDF_NAME(REx)) ||
3309 		pdf_name_eq(ctx, key, PDF_NAME(Preview)))
3310 		RETURN_FILTER(&filter_simple);
3311 	RETURN_FILTER(NULL);
3312 }
3313 
filter_prop_build(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3314 static filter_wrap filter_prop_build(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3315 {
3316 	if (pdf_name_eq(ctx, key, PDF_NAME(Filter)) ||
3317 		pdf_name_eq(ctx, key, PDF_NAME(PubSec)) ||
3318 		pdf_name_eq(ctx, key, PDF_NAME(App)) ||
3319 		pdf_name_eq(ctx, key, PDF_NAME(SigQ)))
3320 		RETURN_FILTER(&filter_prop_build_sub);
3321 	RETURN_FILTER(NULL);
3322 }
3323 
filter_v(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3324 static filter_wrap filter_v(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3325 {
3326 	/* Text can point to a stream object */
3327 	if (pdf_name_eq(ctx, key, PDF_NAME(Length)) && pdf_is_stream(ctx, dict))
3328 		RETURN_FILTER(&filter_simple);
3329 	/* Sigs point to a dict. */
3330 	if (pdf_name_eq(ctx, key, PDF_NAME(Type)) ||
3331 		pdf_name_eq(ctx, key, PDF_NAME(Filter)) ||
3332 		pdf_name_eq(ctx, key, PDF_NAME(SubFilter)) ||
3333 		pdf_name_eq(ctx, key, PDF_NAME(Contents)) ||
3334 		pdf_name_eq(ctx, key, PDF_NAME(Cert)) ||
3335 		pdf_name_eq(ctx, key, PDF_NAME(ByteRange)) ||
3336 		pdf_name_eq(ctx, key, PDF_NAME(Changes)) ||
3337 		pdf_name_eq(ctx, key, PDF_NAME(Name)) ||
3338 		pdf_name_eq(ctx, key, PDF_NAME(M)) ||
3339 		pdf_name_eq(ctx, key, PDF_NAME(Location)) ||
3340 		pdf_name_eq(ctx, key, PDF_NAME(Reason)) ||
3341 		pdf_name_eq(ctx, key, PDF_NAME(ContactInfo)) ||
3342 		pdf_name_eq(ctx, key, PDF_NAME(R)) ||
3343 		pdf_name_eq(ctx, key, PDF_NAME(V)) ||
3344 		pdf_name_eq(ctx, key, PDF_NAME(Prop_AuthTime)) ||
3345 		pdf_name_eq(ctx, key, PDF_NAME(Prop_AuthType)))
3346 	RETURN_FILTER(&filter_simple);
3347 	if (pdf_name_eq(ctx, key, PDF_NAME(Reference)))
3348 		RETURN_FILTER(filter_reference);
3349 	if (pdf_name_eq(ctx, key, PDF_NAME(Prop_Build)))
3350 		RETURN_FILTER(filter_prop_build);
3351 	RETURN_FILTER(NULL);
3352 }
3353 
3354 static filter_wrap filter_appearance(fz_context *ctx, pdf_obj *dict, pdf_obj *key);
3355 
filter_xobject_list(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3356 static filter_wrap filter_xobject_list(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3357 {
3358 	/* FIXME: Infinite recursion possible here? */
3359 	RETURN_FILTER(&filter_appearance);
3360 }
3361 
filter_font(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3362 static filter_wrap filter_font(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3363 {
3364 	/* In the example I've seen the /Name field was dropped, so we'll allow
3365 	 * local changes, but none that follow an indirection. */
3366 	RETURN_FILTER(NULL);
3367 }
3368 
3369 /* FIXME: One idea here is to make filter_font_list and filter_xobject_list
3370  * only accept NEW objects as changes. Will think about this. */
filter_font_list(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3371 static filter_wrap filter_font_list(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3372 {
3373 	RETURN_FILTER(&filter_font);
3374 }
3375 
filter_resources(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3376 static filter_wrap filter_resources(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3377 {
3378 	if (pdf_name_eq(ctx, key, PDF_NAME(XObject)))
3379 		RETURN_FILTER(&filter_xobject_list);
3380 	if (pdf_name_eq(ctx, key, PDF_NAME(Font)))
3381 		RETURN_FILTER(&filter_font_list);
3382 	RETURN_FILTER(NULL);
3383 }
3384 
filter_appearance(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3385 static filter_wrap filter_appearance(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3386 {
3387 	if (pdf_name_eq(ctx, key, PDF_NAME(Resources)))
3388 		RETURN_FILTER(&filter_resources);
3389 	RETURN_FILTER(NULL);
3390 }
3391 
filter_ap(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3392 static filter_wrap filter_ap(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3393 {
3394 	/* Just the /N entry for now. May need to add more later. */
3395 	if (pdf_name_eq(ctx, key, PDF_NAME(N)) && pdf_is_stream(ctx, pdf_dict_get(ctx, dict, key)))
3396 		RETURN_FILTER(&filter_appearance);
3397 	RETURN_FILTER(NULL);
3398 }
3399 
filter_xfa(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3400 static filter_wrap filter_xfa(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3401 {
3402 	/* Text can point to a stream object */
3403 	if (pdf_is_stream(ctx, dict))
3404 		RETURN_FILTER(&filter_simple);
3405 	RETURN_FILTER(NULL);
3406 }
3407 
3408 static void
filter_changes_accepted(fz_context * ctx,pdf_changes * changes,pdf_obj * obj,filter_fn filter)3409 filter_changes_accepted(fz_context *ctx, pdf_changes *changes, pdf_obj *obj, filter_fn filter)
3410 {
3411 	int obj_num;
3412 
3413 	if (obj == NULL || pdf_obj_marked(ctx, obj))
3414 		return;
3415 
3416 	obj_num = pdf_to_num(ctx, obj);
3417 
3418 	fz_try(ctx)
3419 	{
3420 		if (obj_num != 0)
3421 		{
3422 			pdf_mark_obj(ctx, obj);
3423 			changes->obj_changes[obj_num] |= FIELD_CHANGE_VALID;
3424 		}
3425 		if (filter == NULL)
3426 			break;
3427 		if (pdf_is_dict(ctx, obj))
3428 		{
3429 			int i, n = pdf_dict_len(ctx, obj);
3430 
3431 			for (i = 0; i < n; i++)
3432 			{
3433 				pdf_obj *key = pdf_dict_get_key(ctx, obj, i);
3434 				pdf_obj *val = pdf_dict_get_val(ctx, obj, i);
3435 				filter_fn f = (filter(ctx, obj, key)).func;
3436 				if (f != NULL)
3437 					filter_changes_accepted(ctx, changes, val, f);
3438 			}
3439 		}
3440 		else if (pdf_is_array(ctx, obj))
3441 		{
3442 			int i, n = pdf_array_len(ctx, obj);
3443 
3444 			for (i = 0; i < n; i++)
3445 			{
3446 				pdf_obj *val = pdf_array_get(ctx, obj, i);
3447 				filter_changes_accepted(ctx, changes, val, filter);
3448 			}
3449 		}
3450 	}
3451 	fz_always(ctx)
3452 		if (obj_num != 0)
3453 			pdf_unmark_obj(ctx, obj);
3454 	fz_catch(ctx)
3455 		fz_rethrow(ctx);
3456 }
3457 
3458 static void
check_field(fz_context * ctx,pdf_document * doc,pdf_changes * changes,pdf_obj * obj,pdf_locked_fields * locked,const char * name_prefix,pdf_obj * new_v,pdf_obj * old_v)3459 check_field(fz_context *ctx, pdf_document *doc, pdf_changes *changes, pdf_obj *obj, pdf_locked_fields *locked, const char *name_prefix, pdf_obj *new_v, pdf_obj *old_v)
3460 {
3461 	pdf_obj *old_obj, *new_obj, *n_v, *o_v;
3462 	int o_xref_base;
3463 	int obj_num;
3464 	char *field_name = NULL;
3465 
3466 	/* All fields MUST be indirections, either in the Fields array
3467 	 * or AcroForms, or in the Kids array of other Fields. */
3468 	if (!pdf_is_indirect(ctx, obj))
3469 		return;
3470 
3471 	obj_num = pdf_to_num(ctx, obj);
3472 	o_xref_base = doc->xref_base;
3473 	new_obj = pdf_resolve_indirect_chain(ctx, obj);
3474 
3475 	/* Similarly, all fields must be dicts */
3476 	if (!pdf_is_dict(ctx, new_obj))
3477 		return;
3478 
3479 	if (pdf_obj_marked(ctx, obj))
3480 		return;
3481 
3482 	fz_var(field_name);
3483 
3484 	fz_try(ctx)
3485 	{
3486 		int i, len;
3487 		const char *name;
3488 		size_t n;
3489 		pdf_obj *t;
3490 		int is_locked;
3491 
3492 		pdf_mark_obj(ctx, obj);
3493 
3494 		/* Do this within the try, so we can catch any problems */
3495 		doc->xref_base = o_xref_base+1;
3496 		old_obj = pdf_resolve_indirect_chain(ctx, obj);
3497 
3498 		t = pdf_dict_get(ctx, old_obj, PDF_NAME(T));
3499 		if (t != NULL)
3500 		{
3501 			name = pdf_to_text_string(ctx, pdf_dict_get(ctx, old_obj, PDF_NAME(T)));
3502 			n = strlen(name)+1;
3503 			if (*name_prefix)
3504 				n += 1 + strlen(name_prefix);
3505 			field_name = fz_malloc(ctx, n);
3506 			if (*name_prefix)
3507 			{
3508 				strcpy(field_name, name_prefix);
3509 				strcat(field_name, ".");
3510 			}
3511 			else
3512 				*field_name = 0;
3513 			strcat(field_name, name);
3514 			name_prefix = field_name;
3515 		}
3516 
3517 		doc->xref_base = o_xref_base;
3518 
3519 		if (!pdf_is_dict(ctx, old_obj))
3520 			break;
3521 
3522 		/* Check V explicitly, allowing for it being inherited. */
3523 		n_v = pdf_dict_get(ctx, new_obj, PDF_NAME(V));
3524 		if (n_v == NULL)
3525 			n_v = new_v;
3526 		o_v = pdf_dict_get(ctx, old_obj, PDF_NAME(V));
3527 		if (o_v == NULL)
3528 			o_v = old_v;
3529 
3530 		is_locked = pdf_is_field_locked(ctx, locked, name_prefix);
3531 		if (pdf_name_eq(ctx, pdf_dict_get(ctx, new_obj, PDF_NAME(Type)), PDF_NAME(Annot)) &&
3532 			pdf_name_eq(ctx, pdf_dict_get(ctx, new_obj, PDF_NAME(Subtype)), PDF_NAME(Widget)))
3533 		{
3534 			if (is_locked)
3535 			{
3536 				/* If locked, V must not change! */
3537 				if (check_unchanged_between(ctx, doc, changes, n_v, o_v))
3538 					changes->obj_changes[obj_num] |= FIELD_CHANGE_INVALID;
3539 			}
3540 			else
3541 			{
3542 				/* If not locked, V can change to be filled in! */
3543 				filter_changes_accepted(ctx, changes, n_v, &filter_v);
3544 				changes->obj_changes[obj_num] |= FIELD_CHANGE_VALID;
3545 			}
3546 		}
3547 
3548 		/* Check all the fields in the new object are
3549 		 * either the same as the old object, or are
3550 		 * expected changes. */
3551 		len = pdf_dict_len(ctx, new_obj);
3552 		for (i = 0; i < len; i++)
3553 		{
3554 			pdf_obj *key = pdf_dict_get_key(ctx, new_obj, i);
3555 			pdf_obj *nval = pdf_dict_get(ctx, new_obj, key);
3556 			pdf_obj *oval = pdf_dict_get(ctx, old_obj, key);
3557 
3558 			/* Kids arrays shouldn't change. */
3559 			if (pdf_name_eq(ctx, key, PDF_NAME(Kids)))
3560 			{
3561 				int j, m;
3562 
3563 				/* Kids must be an array. If it's not, count it as a difference. */
3564 				if (!pdf_is_array(ctx, nval) || !pdf_is_array(ctx, oval))
3565 				{
3566 change_found:
3567 					changes->obj_changes[obj_num] |= FIELD_CHANGE_INVALID;
3568 					break;
3569 				}
3570 				m = pdf_array_len(ctx, nval);
3571 				/* Any change in length counts as a difference */
3572 				if (m != pdf_array_len(ctx, oval))
3573 					goto change_found;
3574 				for (j = 0; j < m; j++)
3575 				{
3576 					pdf_obj *nkid = pdf_array_get(ctx, nval, j);
3577 					pdf_obj *okid = pdf_array_get(ctx, oval, j);
3578 					/* Kids arrays are supposed to all be indirect. If they aren't,
3579 					 * count it as a difference. */
3580 					if (!pdf_is_indirect(ctx, nkid) || !pdf_is_indirect(ctx, okid))
3581 						goto change_found;
3582 					/* For now at least, we'll count any change in number as a difference. */
3583 					if (pdf_to_num(ctx, nkid) != pdf_to_num(ctx, okid))
3584 						goto change_found;
3585 					check_field(ctx, doc, changes, nkid, locked, name_prefix, n_v, o_v);
3586 				}
3587 			}
3588 			else if (pdf_name_eq(ctx, key, PDF_NAME(V)))
3589 			{
3590 				/* V is checked above */
3591 			}
3592 			else if (pdf_name_eq(ctx, key, PDF_NAME(AP)))
3593 			{
3594 				/* If we're locked, then nothing can change. If not,
3595 				 * we can change to be filled in. */
3596 				if (is_locked)
3597 					check_unchanged_between(ctx, doc, changes, nval, oval);
3598 				else
3599 					filter_changes_accepted(ctx, changes, nval, &filter_ap);
3600 			}
3601 			/* All other fields can't change */
3602 			else
3603 				check_unchanged_between(ctx, doc, changes, nval, oval);
3604 		}
3605 
3606 		/* Now check all the fields in the old object to
3607 		 * make sure none were dropped. */
3608 		len = pdf_dict_len(ctx, old_obj);
3609 		for (i = 0; i < len; i++)
3610 		{
3611 			pdf_obj *key = pdf_dict_get_key(ctx, old_obj, i);
3612 			pdf_obj *nval, *oval;
3613 
3614 			/* V is checked above */
3615 			if (pdf_name_eq(ctx, key, PDF_NAME(V)))
3616 				continue;
3617 
3618 			nval = pdf_dict_get(ctx, new_obj, key);
3619 			oval = pdf_dict_get(ctx, old_obj, key);
3620 
3621 			if (nval == NULL && oval != NULL)
3622 				changes->obj_changes[pdf_to_num(ctx, nval)] |= FIELD_CHANGE_INVALID;
3623 		}
3624 		changes->obj_changes[obj_num] |= FIELD_CHANGE_VALID;
3625 
3626 	}
3627 	fz_always(ctx)
3628 	{
3629 		pdf_unmark_obj(ctx, obj);
3630 		fz_free(ctx, field_name);
3631 		doc->xref_base = o_xref_base;
3632 	}
3633 	fz_catch(ctx)
3634 		fz_rethrow(ctx);
3635 }
3636 
3637 static int
pdf_obj_changed_in_version(fz_context * ctx,pdf_document * doc,int num,int version)3638 pdf_obj_changed_in_version(fz_context *ctx, pdf_document *doc, int num, int version)
3639 {
3640 	if (num < 0 || num > doc->max_xref_len)
3641 		fz_throw(ctx, FZ_ERROR_GENERIC, "Invalid object number requested");
3642 
3643 	return version == doc->xref_index[num];
3644 }
3645 
3646 static void
merge_lock_specification(fz_context * ctx,pdf_locked_fields * fields,pdf_obj * lock)3647 merge_lock_specification(fz_context *ctx, pdf_locked_fields *fields, pdf_obj *lock)
3648 {
3649 	pdf_obj *action;
3650 	int i, r, w;
3651 
3652 	if (lock == NULL)
3653 		return;
3654 
3655 	action = pdf_dict_get(ctx, lock, PDF_NAME(Action));
3656 
3657 	if (pdf_name_eq(ctx, action, PDF_NAME(All)))
3658 	{
3659 		/* All fields locked means we don't need any stored
3660 		 * includes/excludes. */
3661 		fields->all = 1;
3662 		free_char_list(ctx, &fields->includes);
3663 		free_char_list(ctx, &fields->excludes);
3664 	}
3665 	else
3666 	{
3667 		pdf_obj *f = pdf_dict_get(ctx, lock, PDF_NAME(Fields));
3668 		int len = pdf_array_len(ctx, f);
3669 
3670 		if (pdf_name_eq(ctx, action, PDF_NAME(Include)))
3671 		{
3672 			if (fields->all)
3673 			{
3674 				/* Current state = "All except <excludes> are locked".
3675 				 * We need to remove <Fields> from <excludes>. */
3676 				for (i = 0; i < len; i++)
3677 				{
3678 					const char *s = pdf_to_text_string(ctx, pdf_array_get(ctx, f, i));
3679 					int r, w;
3680 
3681 					for (r = w = 0; r < fields->excludes.len; r++)
3682 					{
3683 						if (strcmp(s, fields->excludes.list[r]))
3684 							fields->excludes.list[w++] = fields->excludes.list[r];
3685 					}
3686 					fields->excludes.len = w;
3687 				}
3688 			}
3689 			else
3690 			{
3691 				/* Current state = <includes> are locked.
3692 				 * We need to add <Fields> to <include> (avoiding repetition). */
3693 				for (i = 0; i < len; i++)
3694 				{
3695 					const char *s = pdf_to_text_string(ctx, pdf_array_get(ctx, f, i));
3696 
3697 					for (r = 0; r < fields->includes.len; r++)
3698 					{
3699 						if (!strcmp(s, fields->includes.list[r]))
3700 							break;
3701 					}
3702 					if (r == fields->includes.len)
3703 						char_list_append(ctx, &fields->includes, s);
3704 				}
3705 			}
3706 		}
3707 		else if (pdf_name_eq(ctx, action, PDF_NAME(Exclude)))
3708 		{
3709 			if (fields->all)
3710 			{
3711 				/* Current state = "All except <excludes> are locked.
3712 				 * We need to remove anything from <excludes> that isn't in <Fields>. */
3713 				for (r = w = 0; r < fields->excludes.len; r++)
3714 				{
3715 					for (i = 0; i < len; i++)
3716 					{
3717 						const char *s = pdf_to_text_string(ctx, pdf_array_get(ctx, f, i));
3718 						if (!strcmp(s, fields->excludes.list[r]))
3719 							break;
3720 					}
3721 					if (i != len) /* we found a match */
3722 						fields->excludes.list[w++] = fields->excludes.list[r];
3723 				}
3724 				fields->excludes.len = w;
3725 			}
3726 			else
3727 			{
3728 				/* Current state = <includes> are locked.
3729 				 * Set all. <excludes> becomes <Fields> less <includes>. Remove <includes>. */
3730 				fields->all = 1;
3731 				for (i = 0; i < len; i++)
3732 				{
3733 					const char *s = pdf_to_text_string(ctx, pdf_array_get(ctx, f, i));
3734 					for (r = 0; r < fields->includes.len; r++)
3735 					{
3736 						if (!strcmp(s, fields->includes.list[r]))
3737 							break;
3738 					}
3739 					if (r == fields->includes.len)
3740 						char_list_append(ctx, &fields->excludes, s);
3741 				}
3742 				free_char_list(ctx, &fields->includes);
3743 			}
3744 		}
3745 	}
3746 }
3747 
3748 static void
find_locked_fields_value(fz_context * ctx,pdf_locked_fields * fields,pdf_obj * v)3749 find_locked_fields_value(fz_context *ctx, pdf_locked_fields *fields, pdf_obj *v)
3750 {
3751 	pdf_obj *ref = pdf_dict_get(ctx, v, PDF_NAME(Reference));
3752 	int i, n;
3753 
3754 	if (!ref)
3755 		return;
3756 
3757 	n = pdf_array_len(ctx, ref);
3758 	for (i = 0; i < n; i++)
3759 	{
3760 		pdf_obj *sr = pdf_array_get(ctx, ref, i);
3761 		pdf_obj *tm, *tp, *type;
3762 
3763 		/* Type is optional, but if it exists, it'd better be SigRef. */
3764 		type = pdf_dict_get(ctx, sr, PDF_NAME(Type));
3765 		if (type != NULL && !pdf_name_eq(ctx, type, PDF_NAME(SigRef)))
3766 			continue;
3767 		tm = pdf_dict_get(ctx, sr, PDF_NAME(TransformMethod));
3768 		tp = pdf_dict_get(ctx, sr, PDF_NAME(TransformParams));
3769 		if (pdf_name_eq(ctx, tm, PDF_NAME(DocMDP)))
3770 		{
3771 			int p = pdf_to_int(ctx, pdf_dict_get(ctx, tp, PDF_NAME(P)));
3772 
3773 			if (p == 0)
3774 				p = 2;
3775 			if (fields->p == 0)
3776 				fields->p = p;
3777 			else
3778 				fields->p = fz_mini(fields->p, p);
3779 		}
3780 		else if (pdf_name_eq(ctx, tm, PDF_NAME(FieldMDP)))
3781 			merge_lock_specification(ctx, fields, tp);
3782 	}
3783 }
3784 
3785 static void
find_locked_fields_aux(fz_context * ctx,pdf_obj * field,pdf_locked_fields * fields,pdf_obj * inherit_v,pdf_obj * inherit_ft)3786 find_locked_fields_aux(fz_context *ctx, pdf_obj *field, pdf_locked_fields *fields, pdf_obj *inherit_v, pdf_obj *inherit_ft)
3787 {
3788 	int i, n;
3789 
3790 	if (!pdf_name_eq(ctx, pdf_dict_get(ctx, field, PDF_NAME(Type)), PDF_NAME(Annot)))
3791 		return;
3792 
3793 	if (pdf_obj_marked(ctx, field))
3794 		return;
3795 
3796 	fz_try(ctx)
3797 	{
3798 		pdf_obj *kids, *v, *ft;
3799 
3800 		pdf_mark_obj(ctx, field);
3801 
3802 		v = pdf_dict_get(ctx, field, PDF_NAME(V));
3803 		if (v == NULL)
3804 			v = inherit_v;
3805 		ft = pdf_dict_get(ctx, field, PDF_NAME(FT));
3806 		if (ft == NULL)
3807 			ft = inherit_ft;
3808 
3809 		/* We are looking for Widget annotations of type Sig that are
3810 		 * signed (i.e. have a 'V' field). */
3811 		if (pdf_name_eq(ctx, pdf_dict_get(ctx, field, PDF_NAME(Subtype)), PDF_NAME(Widget)) &&
3812 			pdf_name_eq(ctx, ft, PDF_NAME(Sig)) &&
3813 			pdf_name_eq(ctx, pdf_dict_get(ctx, v, PDF_NAME(Type)), PDF_NAME(Sig)))
3814 		{
3815 			/* Signed Sig Widgets (i.e. ones with a 'V' field) need
3816 			 * to have their lock field respected. */
3817 			merge_lock_specification(ctx, fields, pdf_dict_get(ctx, field, PDF_NAME(Lock)));
3818 
3819 			/* Look for DocMDP and FieldMDP entries to see what
3820 			 * flavours of alterations are allowed. */
3821 			find_locked_fields_value(ctx, fields, v);
3822 		}
3823 
3824 		/* Recurse as required */
3825 		kids = pdf_dict_get(ctx, field, PDF_NAME(Kids));
3826 		if (kids)
3827 		{
3828 			n = pdf_array_len(ctx, kids);
3829 			for (i = 0; i < n; i++)
3830 				find_locked_fields_aux(ctx, pdf_array_get(ctx, kids, i), fields, v, ft);
3831 		}
3832 	}
3833 	fz_always(ctx)
3834 		pdf_unmark_obj(ctx, field);
3835 	fz_catch(ctx)
3836 		fz_rethrow(ctx);
3837 }
3838 
3839 pdf_locked_fields *
pdf_find_locked_fields(fz_context * ctx,pdf_document * doc,int version)3840 pdf_find_locked_fields(fz_context *ctx, pdf_document *doc, int version)
3841 {
3842 	pdf_locked_fields *fields = fz_malloc_struct(ctx, pdf_locked_fields);
3843 	int o_xref_base = doc->xref_base;
3844 	doc->xref_base = version;
3845 
3846 	fz_var(fields);
3847 
3848 	fz_try(ctx)
3849 	{
3850 		pdf_obj *fobj = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm/Fields");
3851 		int i, len = pdf_array_len(ctx, fobj);
3852 
3853 		if (len == 0)
3854 			break;
3855 
3856 		for (i = 0; i < len; i++)
3857 			find_locked_fields_aux(ctx, pdf_array_get(ctx, fobj, i), fields, NULL, NULL);
3858 
3859 		/* Add in any DocMDP referenced directly from the Perms dict. */
3860 		find_locked_fields_value(ctx, fields, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/Perms/DocMDP"));
3861 	}
3862 	fz_always(ctx)
3863 		doc->xref_base = o_xref_base;
3864 	fz_catch(ctx)
3865 	{
3866 		pdf_drop_locked_fields(ctx, fields);
3867 		fz_rethrow(ctx);
3868 	}
3869 
3870 	return fields;
3871 }
3872 
3873 pdf_locked_fields *
pdf_find_locked_fields_for_sig(fz_context * ctx,pdf_document * doc,pdf_obj * sig)3874 pdf_find_locked_fields_for_sig(fz_context *ctx, pdf_document *doc, pdf_obj *sig)
3875 {
3876 	pdf_locked_fields *fields = fz_malloc_struct(ctx, pdf_locked_fields);
3877 
3878 	fz_var(fields);
3879 
3880 	fz_try(ctx)
3881 	{
3882 		pdf_obj *ref;
3883 		int i, len;
3884 
3885 		/* Ensure it really is a sig */
3886 		if (!pdf_name_eq(ctx, pdf_dict_get(ctx, sig, PDF_NAME(Subtype)), PDF_NAME(Widget)) ||
3887 			!pdf_name_eq(ctx, pdf_dict_get_inheritable(ctx, sig, PDF_NAME(FT)), PDF_NAME(Sig)))
3888 			break;
3889 
3890 		/* Check the locking details given in the V (i.e. what the signature value
3891 		 * claims to lock). */
3892 		ref = pdf_dict_getp(ctx, sig, "V/Reference");
3893 		len = pdf_array_len(ctx, ref);
3894 		for (i = 0; i < len; i++)
3895 		{
3896 			pdf_obj *tp = pdf_dict_get(ctx, pdf_array_get(ctx, ref, i), PDF_NAME(TransformParams));
3897 			merge_lock_specification(ctx, fields, tp);
3898 		}
3899 
3900 		/* Also, check the locking details given in the Signature definition. This may
3901 		 * not strictly be necessary as it's supposed to be "what the form author told
3902 		 * the signature that it should lock". A well-formed signature should lock
3903 		 * at least that much (possibly with extra fields locked from the XFA). If the
3904 		 * signature doesn't lock as much as it was told to, we should be suspicious
3905 		 * of the signing application. It is not clear that this test is actually
3906 		 * necessary, or in keeping with what Acrobat does. */
3907 		merge_lock_specification(ctx, fields, pdf_dict_get(ctx, sig, PDF_NAME(Lock)));
3908 	}
3909 	fz_catch(ctx)
3910 	{
3911 		pdf_drop_locked_fields(ctx, fields);
3912 		fz_rethrow(ctx);
3913 	}
3914 
3915 	return fields;
3916 }
3917 
3918 static int
validate_locked_fields(fz_context * ctx,pdf_document * doc,int version,pdf_locked_fields * locked)3919 validate_locked_fields(fz_context *ctx, pdf_document *doc, int version, pdf_locked_fields *locked)
3920 {
3921 	int o_xref_base = doc->xref_base;
3922 	pdf_changes *changes;
3923 	int num_objs;
3924 	int i, n;
3925 	int all_indirects = 1;
3926 
3927 	num_objs = doc->max_xref_len;
3928 	changes = Memento_label(fz_calloc(ctx, 1, sizeof(*changes) + sizeof(int)*(num_objs-1)), "pdf_changes");
3929 	changes->num_obj = num_objs;
3930 
3931 	fz_try(ctx)
3932 	{
3933 		pdf_obj *acroform, *new_acroform, *old_acroform;
3934 		int len, acroform_num;
3935 
3936 		doc->xref_base = version;
3937 
3938 		/* Detect every object that has changed */
3939 		for (i = 1; i < num_objs; i++)
3940 		{
3941 			if (pdf_obj_changed_in_version(ctx, doc, i, version))
3942 				changes->obj_changes[i] = FIELD_CHANGED;
3943 		}
3944 
3945 		/* FIXME: Compare PageTrees and NumberTrees (just to allow for them being regenerated
3946 		 * and having produced stuff that represents the same stuff). */
3947 
3948 		/* The metadata of a document may be regenerated. Allow for that. */
3949 		filter_changes_accepted(ctx, changes, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/Metadata"), &filter_simple);
3950 
3951 		/* The ModDate of document info may be regenerated. Allow for that. */
3952 		/* FIXME: We accept all changes in document info, when maybe we ought to just
3953 		 * accept ModDate? */
3954 		filter_changes_accepted(ctx, changes, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Info"), &filter_simple);
3955 
3956 		/* The Encryption dict may be rewritten for the new Xref. */
3957 		filter_changes_accepted(ctx, changes, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Encrypt"), &filter_simple);
3958 
3959 		/* We have to accept certain changes in the top level AcroForms dict,
3960 		 * so get the 2 versions... */
3961 		acroform = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm");
3962 		acroform_num = pdf_to_num(ctx, acroform);
3963 		new_acroform = pdf_resolve_indirect_chain(ctx, acroform);
3964 		doc->xref_base = version+1;
3965 		old_acroform = pdf_resolve_indirect_chain(ctx, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm"));
3966 		doc->xref_base = version;
3967 		n = pdf_dict_len(ctx, new_acroform);
3968 		for (i = 0; i < n; i++)
3969 		{
3970 			pdf_obj *key = pdf_dict_get_key(ctx, new_acroform, i);
3971 			pdf_obj *nval = pdf_dict_get(ctx, new_acroform, key);
3972 			pdf_obj *oval = pdf_dict_get(ctx, old_acroform, key);
3973 
3974 			if (pdf_name_eq(ctx, key, PDF_NAME(Fields)))
3975 			{
3976 				int j;
3977 
3978 				len = pdf_array_len(ctx, nval);
3979 				for (j = 0; j < len; j++)
3980 				{
3981 					pdf_obj *field = pdf_array_get(ctx, nval, j);
3982 					if (!pdf_is_indirect(ctx, field))
3983 						all_indirects = 0;
3984 					check_field(ctx, doc, changes, field, locked, "", NULL, NULL);
3985 				}
3986 			}
3987 			else if (pdf_name_eq(ctx, key, PDF_NAME(SigFlags)))
3988 			{
3989 				/* Accept this */
3990 				changes->obj_changes[acroform_num] |= FIELD_CHANGE_VALID;
3991 			}
3992 			else if (pdf_name_eq(ctx, key, PDF_NAME(DR)))
3993 			{
3994 				/* Accept any changes from within the Document Resources */
3995 				filter_changes_accepted(ctx, changes, nval, &filter_resources);
3996 			}
3997 			else if (pdf_name_eq(ctx, key, PDF_NAME(XFA)))
3998 			{
3999 				/* Allow any changes within the XFA streams. */
4000 				filter_changes_accepted(ctx, changes, nval, &filter_xfa);
4001 			}
4002 			else if (pdf_objcmp(ctx, nval, oval))
4003 			{
4004 				changes->obj_changes[acroform_num] |= FIELD_CHANGE_INVALID;
4005 			}
4006 		}
4007 
4008 		/* Allow for any object streams/XRefs to be changed. */
4009 		doc->xref_base = version+1;
4010 		for (i = 1; i < num_objs; i++)
4011 		{
4012 			pdf_obj *oobj, *otype;
4013 			if (changes->obj_changes[i] != FIELD_CHANGED)
4014 				continue;
4015 			if (!pdf_obj_exists(ctx, doc, i))
4016 			{
4017 				/* Not present this version - must be newly created, can't be a change. */
4018 				changes->obj_changes[i] |= FIELD_CHANGE_VALID;
4019 				continue;
4020 			}
4021 			oobj = pdf_load_object(ctx, doc, i);
4022 			otype = pdf_dict_get(ctx, oobj, PDF_NAME(Type));
4023 			if (pdf_name_eq(ctx, otype, PDF_NAME(ObjStm)) ||
4024 				pdf_name_eq(ctx, otype, PDF_NAME(XRef)))
4025 			{
4026 				changes->obj_changes[i] |= FIELD_CHANGE_VALID;
4027 			}
4028 			pdf_drop_obj(ctx, oobj);
4029 		}
4030 	}
4031 	fz_always(ctx)
4032 		doc->xref_base = o_xref_base;
4033 	fz_catch(ctx)
4034 		fz_rethrow(ctx);
4035 
4036 	for (i = 1; i < num_objs; i++)
4037 	{
4038 		if (changes->obj_changes[i] == FIELD_CHANGED)
4039 			/* Change with no reason */
4040 			break;
4041 		if (changes->obj_changes[i] & FIELD_CHANGE_INVALID)
4042 			/* Illegal Change */
4043 			break;
4044 	}
4045 
4046 	fz_free(ctx, changes);
4047 
4048 	return (i == num_objs) && all_indirects;
4049 }
4050 
4051 int
pdf_validate_changes(fz_context * ctx,pdf_document * doc,int version)4052 pdf_validate_changes(fz_context *ctx, pdf_document *doc, int version)
4053 {
4054 	int unsaved_versions = pdf_count_unsaved_versions(ctx, doc);
4055 	int n = pdf_count_versions(ctx, doc);
4056 	pdf_locked_fields *locked = NULL;
4057 	int result;
4058 
4059 	if (version < 0 || version >= n)
4060 		fz_throw(ctx, FZ_ERROR_GENERIC, "There aren't that many changes to find in this document!");
4061 
4062 	/* We are wanting to compare version+1 with version to make sure
4063 	 * that the only changes made in going to version are conformant
4064 	 * with what was allowed in version+1. The production of version
4065 	 * might have involved signing a signature field and locking down
4066 	 * more fields - this means that taking the list of locked things
4067 	 * from version rather than version+1 will give us bad results! */
4068 	locked = pdf_find_locked_fields(ctx, doc, unsaved_versions+version+1);
4069 
4070 	if (!locked->all && locked->includes.len == 0 && locked->p == 0)
4071 	{
4072 		/* If nothing is locked at all, then all changes are permissible. */
4073 		result = 1;
4074 	}
4075 	else
4076 		result = validate_locked_fields(ctx, doc, unsaved_versions+version, locked);
4077 
4078 	pdf_drop_locked_fields(ctx, locked);
4079 
4080 	return result;
4081 }
4082 
4083 int
pdf_validate_change_history(fz_context * ctx,pdf_document * doc)4084 pdf_validate_change_history(fz_context *ctx, pdf_document *doc)
4085 {
4086 	int num_versions = pdf_count_versions(ctx, doc);
4087 	int v;
4088 
4089 	if (num_versions < 2)
4090 		return 0; /* Unless there are at least 2 versions, there have been no updates. */
4091 
4092 	for(v = num_versions - 2; v >= 0; v--)
4093 	{
4094 		if (!pdf_validate_changes(ctx, doc, v))
4095 			return v+1;
4096 	}
4097 	return 0;
4098 }
4099 
4100 /* Return the version that obj appears in, or -1 for not found. */
4101 static int
pdf_find_incremental_update_num_for_obj(fz_context * ctx,pdf_document * doc,pdf_obj * obj)4102 pdf_find_incremental_update_num_for_obj(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
4103 {
4104 	pdf_xref *xref = NULL;
4105 	pdf_xref_subsec *sub;
4106 	int i, j;
4107 
4108 	if (obj == NULL)
4109 		return -1;
4110 
4111 	/* obj needs to be indirect for us to get a num out of it. */
4112 	i = pdf_to_num(ctx, obj);
4113 	if (i <= 0)
4114 		return -1;
4115 
4116 	/* obj can't be indirect below, so resolve it here. */
4117 	obj = pdf_resolve_indirect_chain(ctx, obj);
4118 
4119 	/* Find the first xref section where the entry is defined. */
4120 	for (j = 0; j < doc->num_xref_sections; j++)
4121 	{
4122 		xref = &doc->xref_sections[j];
4123 
4124 		if (i < xref->num_objects)
4125 		{
4126 			for (sub = xref->subsec; sub != NULL; sub = sub->next)
4127 			{
4128 				pdf_xref_entry *entry;
4129 
4130 				if (i < sub->start || i >= sub->start + sub->len)
4131 					continue;
4132 
4133 				entry = &sub->table[i - sub->start];
4134 				if (entry->obj == obj)
4135 					return j;
4136 			}
4137 		}
4138 	}
4139 	return -1;
4140 }
4141 
/*
 * Return the xref section index in which obj is found, clamped so it
 * never exceeds the total number of saved plus unsaved versions.
 * Returns -1 if obj cannot be located.
 */
int pdf_find_version_for_obj(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
{
	int found = pdf_find_incremental_update_num_for_obj(ctx, doc, obj);
	int limit;

	if (found == -1)
		return -1;

	limit = pdf_count_versions(ctx, doc) + pdf_count_unsaved_versions(ctx, doc);

	return (found > limit) ? limit : found;
}
4156 
/*
 * Validate the changes made to the document since the signature in
 * 'widget' was applied, using the locked fields that the signature
 * itself specifies.
 *
 * Starting at the version just after the one holding the signature,
 * each version down to the first saved one is checked with
 * validate_locked_fields() until one fails.
 *
 * Returns (last version index examined)+1, made relative to the number
 * of unsaved versions; this is 0 when the loop runs through every
 * version without a failure. doc->xref_base is restored on exit, and
 * any error raised is rethrown.
 */
int pdf_validate_signature(fz_context *ctx, pdf_widget *widget)
{
	pdf_document *doc = widget->page->doc;
	int unsaved = pdf_count_unsaved_versions(ctx, doc);
	int total = pdf_count_versions(ctx, doc) + unsaved;
	int sig_version = pdf_find_version_for_obj(ctx, doc, widget->obj);
	pdf_locked_fields *locked = NULL;
	int saved_xref_base;
	int v;

	/* Clamp to the last available version index. */
	if (sig_version >= total)
		sig_version = total - 1;

	/* Get the locked definition from the object when it was signed. */
	saved_xref_base = doc->xref_base;
	doc->xref_base = sig_version;

	fz_var(locked); /* Not really needed, but it stops warnings */

	fz_try(ctx)
	{
		locked = pdf_find_locked_fields_for_sig(ctx, doc, widget->obj);

		v = sig_version - 1;
		while (v >= unsaved)
		{
			doc->xref_base = v;
			if (!validate_locked_fields(ctx, doc, v, locked))
				break;
			v--;
		}
	}
	fz_always(ctx)
	{
		doc->xref_base = saved_xref_base;
		pdf_drop_locked_fields(ctx, locked);
	}
	fz_catch(ctx)
		fz_rethrow(ctx);

	return v + 1 - unsaved;
}
4196 
/*
 * Report whether any version of the document looks like a pure XFA
 * form: one whose Root/AcroForm has an XFA entry but an empty (or
 * absent) Fields array.
 *
 * Versions are examined from the highest index down to the first
 * saved one, stopping at the first match. doc->xref_base is restored
 * on exit, and any error raised is rethrown.
 *
 * Returns 1 if such a version is found, 0 otherwise.
 */
int pdf_was_pure_xfa(fz_context *ctx, pdf_document *doc)
{
	int unsaved = pdf_count_unsaved_versions(ctx, doc);
	int saved = pdf_count_versions(ctx, doc);
	int saved_xref_base = doc->xref_base;
	int pure_xfa = 0;
	int v;

	fz_var(pure_xfa);

	fz_try(ctx)
	{
		for (v = saved + unsaved; v >= unsaved; v--)
		{
			pdf_obj *form;

			doc->xref_base = v;
			form = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm");

			/* If we find a version that had an empty Root/AcroForm/Fields, but had a
			 * Root/AcroForm/XFA entry, then we deduce that this was at one time a
			 * pure XFA form. */
			if (pdf_array_len(ctx, pdf_dict_get(ctx, form, PDF_NAME(Fields))) != 0)
				continue;
			if (pdf_dict_get(ctx, form, PDF_NAME(XFA)) == NULL)
				continue;

			pure_xfa = 1;
			break;
		}
	}
	fz_always(ctx)
		doc->xref_base = saved_xref_base;
	fz_catch(ctx)
		fz_rethrow(ctx);

	return pure_xfa;
}
4229