1 #include "mupdf/fitz.h"
2 #include "mupdf/pdf.h"
3
4 #include <assert.h>
5 #include <limits.h>
6 #include <string.h>
7
8 #undef DEBUG_PROGESSIVE_ADVANCE
9
10 #ifdef DEBUG_PROGESSIVE_ADVANCE
11 #define DEBUGMESS(A) do { fz_warn A; } while (0)
12 #else
13 #define DEBUGMESS(A) do { } while (0)
14 #endif
15
/* True when c is an ASCII decimal digit ('0'..'9').
 * The argument and the whole expansion are parenthesised so that arguments
 * containing low-precedence operators (e.g. `x & 0xff`) expand correctly.
 * c is still evaluated twice, so it must not have side effects. */
#define isdigit(c) ((c) >= '0' && (c) <= '9')
17
/* Return 1 when ch is one of the bytes this parser treats as white space
 * (NUL, TAB, LF, FF, CR, SPACE), and 0 for anything else. */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case '\000':
	case '\011':
	case '\012':
	case '\014':
	case '\015':
	case '\040':
		return 1;
	default:
		return 0;
	}
}
24
25 /*
26 * xref tables
27 */
28
/*
 * Release everything owned by an array of xref sections, then free the
 * array itself.
 *
 * For every section: each subsection's entries have their object dropped
 * (and, for entries that hold an object, their stream buffer), the
 * subsection table and subsection are freed, both trailers are dropped and
 * the list of unsaved signatures is emptied.
 *
 * doc is accepted for symmetry with the callers but is not used here.
 */
static void pdf_drop_xref_sections_imp(fz_context *ctx, pdf_document *doc, pdf_xref *xref_sections, int num_xref_sections)
{
	int s;

	for (s = 0; s < num_xref_sections; s++)
	{
		pdf_xref *section = &xref_sections[s];
		pdf_xref_subsec *sub = section->subsec;
		pdf_unsaved_sig *sig;

		while (sub != NULL)
		{
			/* Grab the successor before sub is freed. */
			pdf_xref_subsec *following = sub->next;
			int k;

			for (k = 0; k < sub->len; k++)
			{
				pdf_xref_entry *entry = &sub->table[k];

				/* The stream buffer is only released together with an object. */
				if (!entry->obj)
					continue;
				pdf_drop_obj(ctx, entry->obj);
				fz_drop_buffer(ctx, entry->stm_buf);
			}
			fz_free(ctx, sub->table);
			fz_free(ctx, sub);
			sub = following;
		}

		pdf_drop_obj(ctx, section->pre_repair_trailer);
		pdf_drop_obj(ctx, section->trailer);

		/* Pop unsaved signatures off the head of the list one by one. */
		for (sig = section->unsaved_sigs; sig != NULL; sig = section->unsaved_sigs)
		{
			section->unsaved_sigs = sig->next;
			pdf_drop_obj(ctx, sig->field);
			pdf_drop_signer(ctx, sig->signer);
			fz_free(ctx, sig);
		}
	}

	fz_free(ctx, xref_sections);
}
70
/*
 * Drop both the saved and the current xref section arrays of doc and
 * reset the related bookkeeping fields to the empty state.
 */
static void pdf_drop_xref_sections(fz_context *ctx, pdf_document *doc)
{
	/* Saved (pre-repair) sections first ... */
	pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections);
	doc->saved_xref_sections = NULL;
	doc->saved_num_xref_sections = 0;

	/* ... then the live ones. */
	pdf_drop_xref_sections_imp(ctx, doc, doc->xref_sections, doc->num_xref_sections);
	doc->xref_sections = NULL;
	doc->num_xref_sections = 0;
	doc->num_incremental_sections = 0;
}
82
83 static void
extend_xref_index(fz_context * ctx,pdf_document * doc,int newlen)84 extend_xref_index(fz_context *ctx, pdf_document *doc, int newlen)
85 {
86 int i;
87
88 doc->xref_index = fz_realloc_array(ctx, doc->xref_index, newlen, int);
89 for (i = doc->max_xref_len; i < newlen; i++)
90 {
91 doc->xref_index[i] = 0;
92 }
93 doc->max_xref_len = newlen;
94 }
95
96 /* This is only ever called when we already have an incremental
97 * xref. This means there will only be 1 subsec, and it will be
98 * a complete subsec. */
pdf_resize_xref(fz_context * ctx,pdf_document * doc,int newlen)99 static void pdf_resize_xref(fz_context *ctx, pdf_document *doc, int newlen)
100 {
101 int i;
102 pdf_xref *xref = &doc->xref_sections[doc->xref_base];
103 pdf_xref_subsec *sub;
104
105 assert(xref != NULL);
106 sub = xref->subsec;
107 assert(sub->next == NULL && sub->start == 0 && sub->len == xref->num_objects);
108 assert(newlen > xref->num_objects);
109
110 sub->table = fz_realloc_array(ctx, sub->table, newlen, pdf_xref_entry);
111 for (i = xref->num_objects; i < newlen; i++)
112 {
113 sub->table[i].type = 0;
114 sub->table[i].ofs = 0;
115 sub->table[i].gen = 0;
116 sub->table[i].num = 0;
117 sub->table[i].stm_ofs = 0;
118 sub->table[i].stm_buf = NULL;
119 sub->table[i].obj = NULL;
120 }
121 xref->num_objects = newlen;
122 sub->len = newlen;
123 if (doc->max_xref_len < newlen)
124 extend_xref_index(ctx, doc, newlen);
125 }
126
/*
 * Append one empty xref section to the end of doc->xref_sections and
 * bump doc->num_xref_sections accordingly.
 */
static void pdf_populate_next_xref_level(fz_context *ctx, pdf_document *doc)
{
	int slot = doc->num_xref_sections;
	pdf_xref *fresh;

	doc->xref_sections = fz_realloc_array(ctx, doc->xref_sections, slot + 1, pdf_xref);
	doc->num_xref_sections = slot + 1;

	/* Start the new section with no subsections, trailers or signatures. */
	fresh = &doc->xref_sections[slot];
	fresh->subsec = NULL;
	fresh->num_objects = 0;
	fresh->trailer = NULL;
	fresh->pre_repair_trailer = NULL;
	fresh->unsaved_sigs = NULL;
	fresh->unsaved_sigs_end = NULL;
}
141
pdf_trailer(fz_context * ctx,pdf_document * doc)142 pdf_obj *pdf_trailer(fz_context *ctx, pdf_document *doc)
143 {
144 /* Return the document's trailer (of the appopriate vintage) */
145 pdf_xref *xref = &doc->xref_sections[doc->xref_base];
146
147 return xref ? xref->trailer : NULL;
148 }
149
/*
 * Install trailer (a new reference is taken) as the trailer of the xref
 * section currently being populated, i.e. the last one. If that section
 * already had a trailer, it is moved to pre_repair_trailer, replacing and
 * dropping whatever was stored there before.
 */
void pdf_set_populating_xref_trailer(fz_context *ctx, pdf_document *doc, pdf_obj *trailer)
{
	pdf_xref *populating = &doc->xref_sections[doc->num_xref_sections - 1];
	pdf_obj *previous = populating->trailer;

	if (previous)
	{
		pdf_drop_obj(ctx, populating->pre_repair_trailer);
		populating->pre_repair_trailer = previous;
	}
	populating->trailer = pdf_keep_obj(ctx, trailer);
}
161
/* Return the length recorded for the document's xref index
 * (doc->max_xref_len). */
int pdf_xref_len(fz_context *ctx, pdf_document *doc)
{
	int len = doc->max_xref_len;

	return len;
}
166
167 /* Ensure that the given xref has a single subsection
168 * that covers the entire range. */
169 static void
ensure_solid_xref(fz_context * ctx,pdf_document * doc,int num,int which)170 ensure_solid_xref(fz_context *ctx, pdf_document *doc, int num, int which)
171 {
172 pdf_xref *xref = &doc->xref_sections[which];
173 pdf_xref_subsec *sub = xref->subsec;
174 pdf_xref_subsec *new_sub;
175
176 if (num < xref->num_objects)
177 num = xref->num_objects;
178
179 if (sub != NULL && sub->next == NULL && sub->start == 0 && sub->len >= num)
180 return;
181
182 new_sub = fz_malloc_struct(ctx, pdf_xref_subsec);
183 fz_try(ctx)
184 {
185 new_sub->table = fz_calloc(ctx, num, sizeof(pdf_xref_entry));
186 new_sub->start = 0;
187 new_sub->len = num;
188 new_sub->next = NULL;
189 }
190 fz_catch(ctx)
191 {
192 fz_free(ctx, new_sub);
193 fz_rethrow(ctx);
194 }
195
196 /* Move objects over to the new subsection and destroy the old
197 * ones */
198 sub = xref->subsec;
199 while (sub != NULL)
200 {
201 pdf_xref_subsec *next = sub->next;
202 int i;
203
204 for (i = 0; i < sub->len; i++)
205 {
206 new_sub->table[i+sub->start] = sub->table[i];
207 }
208 fz_free(ctx, sub->table);
209 fz_free(ctx, sub);
210 sub = next;
211 }
212 xref->num_objects = num;
213 xref->subsec = new_sub;
214 if (doc->max_xref_len < num)
215 extend_xref_index(ctx, doc, num);
216 }
217
pdf_get_populating_xref_entry(fz_context * ctx,pdf_document * doc,int num)218 pdf_xref_entry *pdf_get_populating_xref_entry(fz_context *ctx, pdf_document *doc, int num)
219 {
220 /* Return an entry within the xref currently being populated */
221 pdf_xref *xref;
222 pdf_xref_subsec *sub;
223
224 if (doc->num_xref_sections == 0)
225 {
226 doc->xref_sections = fz_malloc_struct(ctx, pdf_xref);
227 doc->num_xref_sections = 1;
228 }
229
230 /* Prevent accidental heap underflow */
231 if (num < 0 || num > PDF_MAX_OBJECT_NUMBER)
232 fz_throw(ctx, FZ_ERROR_GENERIC, "object number out of range (%d)", num);
233
234 /* Return the pointer to the entry in the last section. */
235 xref = &doc->xref_sections[doc->num_xref_sections-1];
236
237 for (sub = xref->subsec; sub != NULL; sub = sub->next)
238 {
239 if (num >= sub->start && num < sub->start + sub->len)
240 return &sub->table[num-sub->start];
241 }
242
243 /* We've been asked for an object that's not in a subsec. */
244 ensure_solid_xref(ctx, doc, num+1, doc->num_xref_sections-1);
245 xref = &doc->xref_sections[doc->num_xref_sections-1];
246 sub = xref->subsec;
247
248 return &sub->table[num-sub->start];
249 }
250
pdf_get_xref_entry(fz_context * ctx,pdf_document * doc,int i)251 pdf_xref_entry *pdf_get_xref_entry(fz_context *ctx, pdf_document *doc, int i)
252 {
253 pdf_xref *xref = NULL;
254 pdf_xref_subsec *sub;
255 int j;
256
257 if (i < 0)
258 fz_throw(ctx, FZ_ERROR_GENERIC, "Negative object number requested");
259
260 if (i <= doc->max_xref_len)
261 j = doc->xref_index[i];
262 else
263 j = 0;
264
265 /* We may be accessing an earlier version of the document using xref_base
266 * and j may be an index into a later xref section */
267 if (doc->xref_base > j)
268 j = doc->xref_base;
269
270 /* Find the first xref section where the entry is defined. */
271 for (; j < doc->num_xref_sections; j++)
272 {
273 xref = &doc->xref_sections[j];
274
275 if (i < xref->num_objects)
276 {
277 for (sub = xref->subsec; sub != NULL; sub = sub->next)
278 {
279 pdf_xref_entry *entry;
280
281 if (i < sub->start || i >= sub->start + sub->len)
282 continue;
283
284 entry = &sub->table[i - sub->start];
285 if (entry->type)
286 {
287 /* Don't update xref_index if xref_base may have
288 * influenced the value of j */
289 if (doc->xref_base == 0)
290 doc->xref_index[i] = j;
291 return entry;
292 }
293 }
294 }
295 }
296
297 /* Didn't find the entry in any section. Return the entry from
298 * the final section. */
299 doc->xref_index[i] = 0;
300 if (xref == NULL || i < xref->num_objects)
301 {
302 xref = &doc->xref_sections[doc->xref_base];
303 for (sub = xref->subsec; sub != NULL; sub = sub->next)
304 {
305 if (i >= sub->start && i < sub->start + sub->len)
306 return &sub->table[i - sub->start];
307 }
308 }
309
310 /* At this point, we solidify the xref. This ensures that we
311 * can return a pointer. This is the only case where this function
312 * might throw an exception, and it will never happen when we are
313 * working within a 'solid' xref. */
314 ensure_solid_xref(ctx, doc, i+1, 0);
315 xref = &doc->xref_sections[0];
316 sub = xref->subsec;
317 return &sub->table[i - sub->start];
318 }
319
320 /*
321 Ensure we have an incremental xref section where we can store
322 updated versions of indirect objects. This is a new xref section
323 consisting of a single xref subsection.
324 */
ensure_incremental_xref(fz_context * ctx,pdf_document * doc)325 static void ensure_incremental_xref(fz_context *ctx, pdf_document *doc)
326 {
327 /* If there are as yet no incremental sections, or if the most recent
328 * one has been used to sign a signature field, then we need a new one.
329 * After a signing, any further document changes require a new increment */
330 if ((doc->num_incremental_sections == 0 || doc->xref_sections[0].unsaved_sigs != NULL)
331 && !doc->disallow_new_increments)
332 {
333 pdf_xref *xref = &doc->xref_sections[0];
334 pdf_xref *pxref;
335 pdf_xref_entry *new_table = fz_calloc(ctx, xref->num_objects, sizeof(pdf_xref_entry));
336 pdf_xref_subsec *sub = NULL;
337 pdf_obj *trailer = NULL;
338 int i;
339
340 fz_var(trailer);
341 fz_var(sub);
342 fz_try(ctx)
343 {
344 sub = fz_malloc_struct(ctx, pdf_xref_subsec);
345 trailer = xref->trailer ? pdf_copy_dict(ctx, xref->trailer) : NULL;
346 doc->xref_sections = fz_realloc_array(ctx, doc->xref_sections, doc->num_xref_sections + 1, pdf_xref);
347 xref = &doc->xref_sections[0];
348 pxref = &doc->xref_sections[1];
349 memmove(pxref, xref, doc->num_xref_sections * sizeof(pdf_xref));
350 /* xref->num_objects is already correct */
351 xref->subsec = sub;
352 sub = NULL;
353 xref->trailer = trailer;
354 xref->pre_repair_trailer = NULL;
355 xref->unsaved_sigs = NULL;
356 xref->unsaved_sigs_end = NULL;
357 xref->subsec->next = NULL;
358 xref->subsec->len = xref->num_objects;
359 xref->subsec->start = 0;
360 xref->subsec->table = new_table;
361 doc->num_xref_sections++;
362 doc->num_incremental_sections++;
363 }
364 fz_catch(ctx)
365 {
366 fz_free(ctx, sub);
367 fz_free(ctx, new_table);
368 pdf_drop_obj(ctx, trailer);
369 fz_rethrow(ctx);
370 }
371
372 /* Update the xref_index */
373 for (i = 0; i < doc->max_xref_len; i++)
374 {
375 doc->xref_index[i]++;
376 }
377 }
378 }
379
380 /* Used when altering a document */
pdf_get_incremental_xref_entry(fz_context * ctx,pdf_document * doc,int i)381 static pdf_xref_entry *pdf_get_incremental_xref_entry(fz_context *ctx, pdf_document *doc, int i)
382 {
383 pdf_xref *xref;
384 pdf_xref_subsec *sub;
385
386 /* Make a new final xref section if we haven't already */
387 ensure_incremental_xref(ctx, doc);
388
389 xref = &doc->xref_sections[doc->xref_base];
390 if (i >= xref->num_objects)
391 pdf_resize_xref(ctx, doc, i + 1);
392
393 sub = xref->subsec;
394 assert(sub != NULL && sub->next == NULL);
395 assert(i >= sub->start && i < sub->start + sub->len);
396 doc->xref_index[i] = 0;
397 return &sub->table[i - sub->start];
398 }
399
/*
 * Return 1 if object num has a defined (non-zero type) entry in the xref
 * section at doc->xref_base, 0 otherwise. That section is expected to be a
 * single subsection starting at 0 and covering all its objects (asserted).
 */
int pdf_xref_is_incremental(fz_context *ctx, pdf_document *doc, int num)
{
	pdf_xref *xref = &doc->xref_sections[doc->xref_base];
	pdf_xref_subsec *sub = xref->subsec;

	assert(sub != NULL && sub->next == NULL && sub->len == xref->num_objects && sub->start == 0);

	if (num >= xref->num_objects)
		return 0;
	return sub->table[num].type ? 1 : 0;
}
409
/*
 * Append a record for a signature that has not been saved yet to the
 * unsaved-signature list of xref section 0.
 *
 * References to both field and signer are taken. The record keeps the
 * details needed so that contents and byte_range can be updated with
 * their correct values at saving time.
 */
void pdf_xref_store_unsaved_signature(fz_context *ctx, pdf_document *doc, pdf_obj *field, pdf_pkcs7_signer *signer)
{
	pdf_xref *xref = &doc->xref_sections[0];
	pdf_unsaved_sig *record;

	record = fz_malloc_struct(ctx, pdf_unsaved_sig);
	record->field = pdf_keep_obj(ctx, field);
	record->signer = signer->keep(ctx, signer);
	record->next = NULL;

	/* An unset tail pointer means the list is empty: start at the head. */
	if (xref->unsaved_sigs_end == NULL)
		xref->unsaved_sigs_end = &xref->unsaved_sigs;

	/* Link the record in and advance the tail to its next pointer. */
	*xref->unsaved_sigs_end = record;
	xref->unsaved_sigs_end = &record->next;
}
428
/*
 * Return 1 if obj is the field of an unsaved signature recorded in any of
 * the document's incremental xref sections, 0 otherwise. The comparison is
 * by pointer identity.
 */
int pdf_xref_obj_is_unsaved_signature(pdf_document *doc, pdf_obj *obj)
{
	int s = 0;

	while (s < doc->num_incremental_sections)
	{
		pdf_unsaved_sig *sig = doc->xref_sections[s].unsaved_sigs;

		while (sig != NULL)
		{
			if (sig->field == obj)
				return 1;
			sig = sig->next;
		}
		s++;
	}

	return 0;
}
446
/*
 * Make the last xref section of doc a single subsection covering at least
 * num objects, creating a first section when the document has none.
 */
void pdf_ensure_solid_xref(fz_context *ctx, pdf_document *doc, int num)
{
	int last;

	if (doc->num_xref_sections == 0)
		pdf_populate_next_xref_level(ctx, doc);

	last = doc->num_xref_sections - 1;
	ensure_solid_xref(ctx, doc, num, last);
}
454
/*
 * Make sure object num lives in the incremental xref section, moving its
 * entry there from the older section that currently defines it.
 *
 * An incremental section is created first if required. Nothing further is
 * done when num is outside the xref index, when no section defines the
 * object, or when it is already defined in section 0.
 *
 * When the source section is itself incremental, the old entry keeps a
 * deep copy of the object; otherwise the old entry's object pointer is
 * cleared. The old entry's stream buffer pointer is always cleared, the
 * buffer now belonging to the new entry.
 */
void pdf_xref_ensure_incremental_object(fz_context *ctx, pdf_document *doc, int num)
{
	pdf_xref_entry *new_entry, *old_entry;
	pdf_xref_subsec *sub = NULL;
	int i;

	/* Make sure we have created an xref section for incremental updates */
	ensure_incremental_xref(ctx, doc);

	/* doc->xref_index has doc->max_xref_len slots, so an object number
	 * outside [0, max_xref_len) must not be used to index it. Such an
	 * object is treated as not found: there is nothing to move. (The
	 * former in-loop test `num < 0 && num >= xref->num_objects` could
	 * never be true and therefore guarded nothing.) */
	if (num < 0 || num >= doc->max_xref_len)
		return;

	/* Search for the section that contains this object. The range test
	 * on each subsection skips sections that do not cover num. */
	for (i = doc->xref_index[num]; i < doc->num_xref_sections; i++)
	{
		pdf_xref *xref = &doc->xref_sections[i];

		for (sub = xref->subsec; sub != NULL; sub = sub->next)
		{
			if (sub->start <= num && num < sub->start + sub->len && sub->table[num - sub->start].type)
				break;
		}
		if (sub != NULL)
			break;
	}
	/* sub == NULL implies we did not find it */

	/* If we don't find it, or it's already in the incremental section, return */
	if (i == 0 || sub == NULL)
		return;

	/* Move the object to the incremental section */
	doc->xref_index[num] = 0;
	old_entry = &sub->table[num - sub->start];
	new_entry = pdf_get_incremental_xref_entry(ctx, doc, num);
	*new_entry = *old_entry;
	if (i < doc->num_incremental_sections)
	{
		/* old entry is incremental and may have changes.
		 * Better keep a copy. We must override the old entry with
		 * the copy because the caller may be holding a reference to
		 * the original and expect it to end up in the new entry */
		old_entry->obj = pdf_deep_copy_obj(ctx, old_entry->obj);
	}
	else
	{
		old_entry->obj = NULL;
	}
	old_entry->stm_buf = NULL;
}
504
/*
 * Replace all xref sections of doc with one section made of a single
 * subsection whose table is `entries` (n entries, starting at object 0).
 *
 * The current trailer is kept as the new section's trailer, all previous
 * sections (saved and live) are dropped, a fresh zeroed xref index of n
 * slots is installed, and the incremental-update state is reset.
 * If any allocation fails, nothing in doc is modified and the error is
 * rethrown; `entries` is not freed in that case.
 */
void pdf_replace_xref(fz_context *ctx, pdf_document *doc, pdf_xref_entry *entries, int n)
{
	int *fresh_index = NULL;
	pdf_xref *section = NULL;
	pdf_xref_subsec *solid;

	fz_var(fresh_index);
	fz_var(section);

	/* Do all allocation up front so failure leaves doc untouched. */
	fz_try(ctx)
	{
		fresh_index = fz_calloc(ctx, n, sizeof(int));
		section = fz_malloc_struct(ctx, pdf_xref);
		solid = fz_malloc_struct(ctx, pdf_xref_subsec);
	}
	fz_catch(ctx)
	{
		fz_free(ctx, section);
		fz_free(ctx, fresh_index);
		fz_rethrow(ctx);
	}

	solid->table = entries;
	solid->start = 0;
	solid->len = n;

	section->subsec = solid;
	section->num_objects = n;
	/* Must be read before the old sections are dropped. */
	section->trailer = pdf_keep_obj(ctx, pdf_trailer(ctx, doc));

	/* The new table completely replaces the previous separate sections */
	pdf_drop_xref_sections(ctx, doc);

	fz_free(ctx, doc->xref_index);
	doc->xref_index = fresh_index;
	doc->max_xref_len = n;

	doc->xref_sections = section;
	doc->num_xref_sections = 1;
	doc->num_incremental_sections = 0;
	doc->xref_base = 0;
	doc->disallow_new_increments = 0;
}
548
/*
 * Set the current xref sections aside as the "saved" sections (dropping
 * any previously saved ones) and start again with a single fresh section
 * that has an entry for object 0 and carries the old trailer.
 *
 * startxref and the incremental-update state are reset. Throws if
 * creating the fresh section fails; the kept trailer reference is released
 * in that case.
 */
void pdf_forget_xref(fz_context *ctx, pdf_document *doc)
{
	pdf_obj *trailer = pdf_keep_obj(ctx, pdf_trailer(ctx, doc));

	if (doc->saved_xref_sections)
		pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections);

	doc->saved_xref_sections = doc->xref_sections;
	doc->saved_num_xref_sections = doc->num_xref_sections;

	/* Ownership of the array has moved to saved_xref_sections. Clear the
	 * live pointer so the two never alias: if the allocation below throws,
	 * pdf_drop_xref_sections would otherwise free the same array twice. */
	doc->xref_sections = NULL;

	doc->startxref = 0;
	doc->num_xref_sections = 0;
	doc->num_incremental_sections = 0;
	doc->xref_base = 0;
	doc->disallow_new_increments = 0;

	fz_try(ctx)
	{
		/* With no sections left, this allocates the first section and
		 * makes sure it has an entry for object 0. */
		pdf_get_populating_xref_entry(ctx, doc, 0);
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(ctx, trailer);
		fz_rethrow(ctx);
	}

	/* Set the trailer of the final xref section. */
	doc->xref_sections[0].trailer = trailer;
}
578
579 /*
580 * magic version tag and startxref
581 */
582
583 int
pdf_version(fz_context * ctx,pdf_document * doc)584 pdf_version(fz_context *ctx, pdf_document *doc)
585 {
586 int version = doc->version;
587 fz_try(ctx)
588 {
589 pdf_obj *obj = pdf_dict_getl(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), PDF_NAME(Version), NULL);
590 const char *str = pdf_to_name(ctx, obj);
591 if (*str)
592 version = 10 * (fz_atof(str) + 0.05f);
593 }
594 fz_catch(ctx)
595 {
596 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
597 fz_warn(ctx, "Ignoring broken Root/Version number.");
598 }
599 return version;
600 }
601
602 static void
pdf_load_version(fz_context * ctx,pdf_document * doc)603 pdf_load_version(fz_context *ctx, pdf_document *doc)
604 {
605 char buf[20];
606
607 fz_seek(ctx, doc->file, 0, SEEK_SET);
608 fz_read_line(ctx, doc->file, buf, sizeof buf);
609 if (strlen(buf) < 5 || memcmp(buf, "%PDF-", 5) != 0)
610 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot recognize version marker");
611
612 doc->version = 10 * (fz_atof(buf+5) + 0.05f);
613 if (doc->version < 10 || doc->version > 17)
614 if (doc->version != 20)
615 fz_warn(ctx, "unknown PDF version: %d.%d", doc->version / 10, doc->version % 10);
616 }
617
618 static void
pdf_read_start_xref(fz_context * ctx,pdf_document * doc)619 pdf_read_start_xref(fz_context *ctx, pdf_document *doc)
620 {
621 unsigned char buf[1024];
622 size_t i, n;
623 int64_t t;
624
625 fz_seek(ctx, doc->file, 0, SEEK_END);
626
627 doc->file_size = fz_tell(ctx, doc->file);
628
629 t = fz_maxi64(0, doc->file_size - (int64_t)sizeof buf);
630 fz_seek(ctx, doc->file, t, SEEK_SET);
631
632 n = fz_read(ctx, doc->file, buf, sizeof buf);
633 if (n < 9)
634 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find startxref");
635
636 i = n - 9;
637 do
638 {
639 if (memcmp(buf + i, "startxref", 9) == 0)
640 {
641 i += 9;
642 while (i < n && iswhite(buf[i]))
643 i ++;
644 doc->startxref = 0;
645 while (i < n && isdigit(buf[i]))
646 {
647 if (doc->startxref >= INT64_MAX/10)
648 fz_throw(ctx, FZ_ERROR_GENERIC, "startxref too large");
649 doc->startxref = doc->startxref * 10 + (buf[i++] - '0');
650 }
651 if (doc->startxref != 0)
652 return;
653 break;
654 }
655 } while (i-- > 0);
656
657 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find startxref");
658 }
659
660 static void
fz_skip_space(fz_context * ctx,fz_stream * stm)661 fz_skip_space(fz_context *ctx, fz_stream *stm)
662 {
663 do
664 {
665 int c = fz_peek_byte(ctx, stm);
666 if (c == EOF || c > 32)
667 return;
668 (void)fz_read_byte(ctx, stm);
669 }
670 while (1);
671 }
672
/*
 * Consume str from the stream if the upcoming bytes match it.
 * Returns 0 when the whole string matched and was consumed, 1 on the first
 * mismatch or EOF. Bytes that matched before a mismatch stay consumed; the
 * mismatching byte itself is left in the stream.
 */
static int fz_skip_string(fz_context *ctx, fz_stream *stm, const char *str)
{
	const char *p;

	for (p = str; *p != '\0'; p++)
	{
		int c = fz_peek_byte(ctx, stm);

		if (c == EOF || c != *p)
			return 1;
		(void)fz_read_byte(ctx, stm);
	}
	return 0;
}
684
685 /*
686 * trailer dictionary
687 */
688
689 static int
pdf_xref_size_from_old_trailer(fz_context * ctx,pdf_document * doc,pdf_lexbuf * buf)690 pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
691 {
692 int len;
693 char *s;
694 int64_t t;
695 pdf_token tok;
696 int c;
697 int size = 0;
698 int64_t ofs;
699 pdf_obj *trailer = NULL;
700 size_t n;
701
702 fz_var(trailer);
703
704 /* Record the current file read offset so that we can reinstate it */
705 ofs = fz_tell(ctx, doc->file);
706
707 fz_skip_space(ctx, doc->file);
708 if (fz_skip_string(ctx, doc->file, "xref"))
709 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find xref marker");
710 fz_skip_space(ctx, doc->file);
711
712 while (1)
713 {
714 c = fz_peek_byte(ctx, doc->file);
715 if (!isdigit(c))
716 break;
717
718 fz_read_line(ctx, doc->file, buf->scratch, buf->size);
719 s = buf->scratch;
720 fz_strsep(&s, " "); /* ignore start */
721 if (!s)
722 fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length missing");
723 len = fz_atoi(fz_strsep(&s, " "));
724 if (len < 0)
725 fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length must be positive");
726
727 /* broken pdfs where the section is not on a separate line */
728 if (s && *s != '\0')
729 fz_seek(ctx, doc->file, -(2 + (int)strlen(s)), SEEK_CUR);
730
731 t = fz_tell(ctx, doc->file);
732 if (t < 0)
733 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
734
735 /* Spec says xref entries should be 20 bytes, but it's not infrequent
736 * to see 19, in particular for some PCLm drivers. Cope. */
737 if (len > 0)
738 {
739 n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, 20);
740 if (n < 19)
741 fz_throw(ctx, FZ_ERROR_GENERIC, "malformed xref table");
742 if (n == 20 && buf->scratch[19] > 32)
743 n = 19;
744 }
745 else
746 n = 20;
747
748 if (len > (int64_t)((INT64_MAX - t) / n))
749 fz_throw(ctx, FZ_ERROR_GENERIC, "xref has too many entries");
750
751 fz_seek(ctx, doc->file, t + n * (int64_t)len, SEEK_SET);
752 }
753
754 fz_try(ctx)
755 {
756 tok = pdf_lex(ctx, doc->file, buf);
757 if (tok != PDF_TOK_TRAILER)
758 fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer marker");
759
760 tok = pdf_lex(ctx, doc->file, buf);
761 if (tok != PDF_TOK_OPEN_DICT)
762 fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer dictionary");
763
764 trailer = pdf_parse_dict(ctx, doc, doc->file, buf);
765
766 size = pdf_dict_get_int(ctx, trailer, PDF_NAME(Size));
767 if (size < 0 || size > PDF_MAX_OBJECT_NUMBER + 1)
768 fz_throw(ctx, FZ_ERROR_GENERIC, "trailer Size entry out of range");
769 }
770 fz_always(ctx)
771 {
772 pdf_drop_obj(ctx, trailer);
773 }
774 fz_catch(ctx)
775 {
776 fz_rethrow(ctx);
777 }
778
779 fz_seek(ctx, doc->file, ofs, SEEK_SET);
780
781 return size;
782 }
783
784 static pdf_xref_entry *
pdf_xref_find_subsection(fz_context * ctx,pdf_document * doc,int start,int len)785 pdf_xref_find_subsection(fz_context *ctx, pdf_document *doc, int start, int len)
786 {
787 pdf_xref *xref = &doc->xref_sections[doc->num_xref_sections-1];
788 pdf_xref_subsec *sub;
789 int num_objects;
790
791 /* Different cases here. Case 1) We might be asking for a
792 * subsection (or a subset of a subsection) that we already
793 * have - Just return it. Case 2) We might be asking for a
794 * completely new subsection - Create it and return it.
795 * Case 3) We might have an overlapping one - Create a 'solid'
796 * subsection and return that. */
797
798 /* Sanity check */
799 for (sub = xref->subsec; sub != NULL; sub = sub->next)
800 {
801 if (start >= sub->start && start + len <= sub->start + sub->len)
802 return &sub->table[start-sub->start]; /* Case 1 */
803 if (start + len > sub->start && start <= sub->start + sub->len)
804 break; /* Case 3 */
805 }
806
807 num_objects = xref->num_objects;
808 if (num_objects < start + len)
809 num_objects = start + len;
810
811 if (sub == NULL)
812 {
813 /* Case 2 */
814 sub = fz_malloc_struct(ctx, pdf_xref_subsec);
815 fz_try(ctx)
816 {
817 sub->table = fz_calloc(ctx, len, sizeof(pdf_xref_entry));
818 sub->start = start;
819 sub->len = len;
820 sub->next = xref->subsec;
821 xref->subsec = sub;
822 }
823 fz_catch(ctx)
824 {
825 fz_free(ctx, sub);
826 fz_rethrow(ctx);
827 }
828 xref->num_objects = num_objects;
829 if (doc->max_xref_len < num_objects)
830 extend_xref_index(ctx, doc, num_objects);
831 }
832 else
833 {
834 /* Case 3 */
835 ensure_solid_xref(ctx, doc, num_objects, doc->num_xref_sections-1);
836 xref = &doc->xref_sections[doc->num_xref_sections-1];
837 sub = xref->subsec;
838 }
839 return &sub->table[start-sub->start];
840 }
841
842 static pdf_obj *
pdf_read_old_xref(fz_context * ctx,pdf_document * doc,pdf_lexbuf * buf)843 pdf_read_old_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
844 {
845 int start, len, c, i, xref_len, carried;
846 fz_stream *file = doc->file;
847 pdf_xref_entry *table;
848 pdf_token tok;
849 size_t n;
850 char *s, *e;
851
852 xref_len = pdf_xref_size_from_old_trailer(ctx, doc, buf);
853
854 fz_skip_space(ctx, doc->file);
855 if (fz_skip_string(ctx, doc->file, "xref"))
856 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find xref marker");
857 fz_skip_space(ctx, doc->file);
858
859 while (1)
860 {
861 c = fz_peek_byte(ctx, file);
862 if (!isdigit(c))
863 break;
864
865 fz_read_line(ctx, file, buf->scratch, buf->size);
866 s = buf->scratch;
867 start = fz_atoi(fz_strsep(&s, " "));
868 len = fz_atoi(fz_strsep(&s, " "));
869
870 /* broken pdfs where the section is not on a separate line */
871 if (s && *s != '\0')
872 {
873 fz_warn(ctx, "broken xref subsection. proceeding anyway.");
874 fz_seek(ctx, file, -(2 + (int)strlen(s)), SEEK_CUR);
875 }
876
877 if (start < 0 || start > PDF_MAX_OBJECT_NUMBER
878 || len < 0 || len > PDF_MAX_OBJECT_NUMBER
879 || start + len - 1 > PDF_MAX_OBJECT_NUMBER)
880 {
881 fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection object numbers are out of range");
882 }
883 /* broken pdfs where size in trailer undershoots entries in xref sections */
884 if (start + len > xref_len)
885 {
886 fz_warn(ctx, "broken xref subsection, proceeding anyway.");
887 }
888
889 table = pdf_xref_find_subsection(ctx, doc, start, len);
890
891 /* Xref entries SHOULD be 20 bytes long, but we see 19 byte
892 * ones more frequently than we'd like (e.g. PCLm drivers).
893 * Cope with this by 'carrying' data forward. */
894 carried = 0;
895 for (i = 0; i < len; i++)
896 {
897 pdf_xref_entry *entry = &table[i];
898 n = fz_read(ctx, file, (unsigned char *) buf->scratch + carried, 20-carried);
899 if (n != (size_t)(20-carried))
900 fz_throw(ctx, FZ_ERROR_GENERIC, "unexpected EOF in xref table");
901 n += carried;
902 buf->scratch[n] = '\0';
903 if (!entry->type)
904 {
905 s = buf->scratch;
906 e = s + n;
907
908 entry->num = start + i;
909
910 /* broken pdfs where line start with white space */
911 while (s < e && iswhite(*s))
912 s++;
913
914 if (s == e || !isdigit(*s))
915 fz_throw(ctx, FZ_ERROR_GENERIC, "xref offset missing");
916 while (s < e && isdigit(*s))
917 entry->ofs = entry->ofs * 10 + *s++ - '0';
918
919 while (s < e && iswhite(*s))
920 s++;
921 if (s == e || !isdigit(*s))
922 fz_throw(ctx, FZ_ERROR_GENERIC, "xref generation number missing");
923 while (s < e && isdigit(*s))
924 entry->gen = entry->gen * 10 + *s++ - '0';
925
926 while (s < e && iswhite(*s))
927 s++;
928 if (s == e || (*s != 'f' && *s != 'n' && *s != 'o'))
929 fz_throw(ctx, FZ_ERROR_GENERIC, "unexpected xref type: 0x%x (%d %d R)", s == e ? 0 : *s, entry->num, entry->gen);
930 entry->type = *s++;
931
932 /* If the last byte of our buffer isn't an EOL (or space), carry one byte forward */
933 carried = buf->scratch[19] > 32;
934 if (carried)
935 buf->scratch[0] = buf->scratch[19];
936 }
937 }
938 if (carried)
939 fz_unread_byte(ctx, file);
940 }
941
942 tok = pdf_lex(ctx, file, buf);
943 if (tok != PDF_TOK_TRAILER)
944 fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer marker");
945
946 tok = pdf_lex(ctx, file, buf);
947 if (tok != PDF_TOK_OPEN_DICT)
948 fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer dictionary");
949
950 doc->has_old_style_xrefs = 1;
951
952 return pdf_parse_dict(ctx, doc, file, buf);
953 }
954
955 static void
pdf_read_new_xref_section(fz_context * ctx,pdf_document * doc,fz_stream * stm,int i0,int i1,int w0,int w1,int w2)956 pdf_read_new_xref_section(fz_context *ctx, pdf_document *doc, fz_stream *stm, int i0, int i1, int w0, int w1, int w2)
957 {
958 pdf_xref_entry *table;
959 int i, n;
960
961 if (i0 < 0 || i0 > PDF_MAX_OBJECT_NUMBER || i1 < 0 || i1 > PDF_MAX_OBJECT_NUMBER || i0 + i1 - 1 > PDF_MAX_OBJECT_NUMBER)
962 fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection object numbers are out of range");
963
964 table = pdf_xref_find_subsection(ctx, doc, i0, i1);
965 for (i = i0; i < i0 + i1; i++)
966 {
967 pdf_xref_entry *entry = &table[i-i0];
968 int a = 0;
969 int64_t b = 0;
970 int c = 0;
971
972 if (fz_is_eof(ctx, stm))
973 fz_throw(ctx, FZ_ERROR_GENERIC, "truncated xref stream");
974
975 for (n = 0; n < w0; n++)
976 a = (a << 8) + fz_read_byte(ctx, stm);
977 for (n = 0; n < w1; n++)
978 b = (b << 8) + fz_read_byte(ctx, stm);
979 for (n = 0; n < w2; n++)
980 c = (c << 8) + fz_read_byte(ctx, stm);
981
982 if (!entry->type)
983 {
984 int t = w0 ? a : 1;
985 entry->type = t == 0 ? 'f' : t == 1 ? 'n' : t == 2 ? 'o' : 0;
986 entry->ofs = w1 ? b : 0;
987 entry->gen = w2 ? c : 0;
988 entry->num = i;
989 }
990 }
991
992 doc->has_xref_streams = 1;
993 }
994
995 /* Entered with file locked, remains locked throughout. */
996 static pdf_obj *
pdf_read_new_xref(fz_context * ctx,pdf_document * doc,pdf_lexbuf * buf)997 pdf_read_new_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
998 {
999 fz_stream *stm = NULL;
1000 pdf_obj *trailer = NULL;
1001 pdf_obj *index = NULL;
1002 pdf_obj *obj = NULL;
1003 int gen, num = 0;
1004 int64_t ofs, stm_ofs;
1005 int size, w0, w1, w2;
1006 int t;
1007
1008 fz_var(trailer);
1009 fz_var(stm);
1010
1011 fz_try(ctx)
1012 {
1013 ofs = fz_tell(ctx, doc->file);
1014 trailer = pdf_parse_ind_obj(ctx, doc, doc->file, buf, &num, &gen, &stm_ofs, NULL);
1015 }
1016 fz_catch(ctx)
1017 {
1018 pdf_drop_obj(ctx, trailer);
1019 fz_rethrow(ctx);
1020 }
1021
1022 fz_try(ctx)
1023 {
1024 pdf_xref_entry *entry;
1025
1026 obj = pdf_dict_get(ctx, trailer, PDF_NAME(Size));
1027 if (!obj)
1028 fz_throw(ctx, FZ_ERROR_GENERIC, "xref stream missing Size entry (%d 0 R)", num);
1029
1030 size = pdf_to_int(ctx, obj);
1031
1032 obj = pdf_dict_get(ctx, trailer, PDF_NAME(W));
1033 if (!obj)
1034 fz_throw(ctx, FZ_ERROR_GENERIC, "xref stream missing W entry (%d R)", num);
1035 w0 = pdf_array_get_int(ctx, obj, 0);
1036 w1 = pdf_array_get_int(ctx, obj, 1);
1037 w2 = pdf_array_get_int(ctx, obj, 2);
1038
1039 if (w0 < 0)
1040 fz_warn(ctx, "xref stream objects have corrupt type");
1041 if (w1 < 0)
1042 fz_warn(ctx, "xref stream objects have corrupt offset");
1043 if (w2 < 0)
1044 fz_warn(ctx, "xref stream objects have corrupt generation");
1045
1046 w0 = w0 < 0 ? 0 : w0;
1047 w1 = w1 < 0 ? 0 : w1;
1048 w2 = w2 < 0 ? 0 : w2;
1049
1050 index = pdf_dict_get(ctx, trailer, PDF_NAME(Index));
1051
1052 stm = pdf_open_stream_with_offset(ctx, doc, num, trailer, stm_ofs);
1053
1054 if (!index)
1055 {
1056 pdf_read_new_xref_section(ctx, doc, stm, 0, size, w0, w1, w2);
1057 }
1058 else
1059 {
1060 int n = pdf_array_len(ctx, index);
1061 for (t = 0; t < n; t += 2)
1062 {
1063 int i0 = pdf_array_get_int(ctx, index, t + 0);
1064 int i1 = pdf_array_get_int(ctx, index, t + 1);
1065 pdf_read_new_xref_section(ctx, doc, stm, i0, i1, w0, w1, w2);
1066 }
1067 }
1068 entry = pdf_get_populating_xref_entry(ctx, doc, num);
1069 entry->ofs = ofs;
1070 entry->gen = gen;
1071 entry->num = num;
1072 entry->stm_ofs = stm_ofs;
1073 pdf_drop_obj(ctx, entry->obj);
1074 entry->obj = pdf_keep_obj(ctx, trailer);
1075 entry->type = 'n';
1076 }
1077 fz_always(ctx)
1078 {
1079 fz_drop_stream(ctx, stm);
1080 }
1081 fz_catch(ctx)
1082 {
1083 pdf_drop_obj(ctx, trailer);
1084 fz_rethrow(ctx);
1085 }
1086
1087 return trailer;
1088 }
1089
1090 static pdf_obj *
pdf_read_xref(fz_context * ctx,pdf_document * doc,int64_t ofs,pdf_lexbuf * buf)1091 pdf_read_xref(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf)
1092 {
1093 pdf_obj *trailer;
1094 int c;
1095
1096 fz_seek(ctx, doc->file, ofs, SEEK_SET);
1097
1098 while (iswhite(fz_peek_byte(ctx, doc->file)))
1099 fz_read_byte(ctx, doc->file);
1100
1101 c = fz_peek_byte(ctx, doc->file);
1102 if (c == 'x')
1103 trailer = pdf_read_old_xref(ctx, doc, buf);
1104 else if (isdigit(c))
1105 trailer = pdf_read_new_xref(ctx, doc, buf);
1106 else
1107 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot recognize xref format");
1108
1109 return trailer;
1110 }
1111
1112 static int64_t
read_xref_section(fz_context * ctx,pdf_document * doc,int64_t ofs,pdf_lexbuf * buf)1113 read_xref_section(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf)
1114 {
1115 pdf_obj *trailer = NULL;
1116 pdf_obj *prevobj;
1117 int64_t xrefstmofs = 0;
1118 int64_t prevofs = 0;
1119
1120 trailer = pdf_read_xref(ctx, doc, ofs, buf);
1121 fz_try(ctx)
1122 {
1123 pdf_set_populating_xref_trailer(ctx, doc, trailer);
1124
1125 /* FIXME: do we overwrite free entries properly? */
1126 /* FIXME: Does this work properly with progression? */
1127 xrefstmofs = pdf_to_int64(ctx, pdf_dict_get(ctx, trailer, PDF_NAME(XRefStm)));
1128 if (xrefstmofs)
1129 {
1130 if (xrefstmofs < 0)
1131 fz_throw(ctx, FZ_ERROR_GENERIC, "negative xref stream offset");
1132
1133 /*
1134 Read the XRefStm stream, but throw away the resulting trailer. We do not
1135 follow any Prev tag therein, as specified on Page 108 of the PDF reference
1136 1.7
1137 */
1138 pdf_drop_obj(ctx, pdf_read_xref(ctx, doc, xrefstmofs, buf));
1139 }
1140
1141 prevobj = pdf_dict_get(ctx, trailer, PDF_NAME(Prev));
1142 if (pdf_is_int(ctx, prevobj))
1143 {
1144 prevofs = pdf_to_int64(ctx, prevobj);
1145 if (prevofs <= 0)
1146 fz_throw(ctx, FZ_ERROR_GENERIC, "invalid offset for previous xref section");
1147 }
1148 }
1149 fz_always(ctx)
1150 pdf_drop_obj(ctx, trailer);
1151 fz_catch(ctx)
1152 fz_rethrow(ctx);
1153
1154 return prevofs;
1155 }
1156
1157 static void
pdf_read_xref_sections(fz_context * ctx,pdf_document * doc,int64_t ofs,pdf_lexbuf * buf,int read_previous)1158 pdf_read_xref_sections(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf, int read_previous)
1159 {
1160 int i, len, cap;
1161 int64_t *offsets;
1162
1163 len = 0;
1164 cap = 10;
1165 offsets = fz_malloc_array(ctx, cap, int64_t);
1166
1167 fz_try(ctx)
1168 {
1169 while(ofs)
1170 {
1171 for (i = 0; i < len; i ++)
1172 {
1173 if (offsets[i] == ofs)
1174 break;
1175 }
1176 if (i < len)
1177 {
1178 fz_warn(ctx, "ignoring xref section recursion at offset %d", (int)ofs);
1179 break;
1180 }
1181 if (len == cap)
1182 {
1183 cap *= 2;
1184 offsets = fz_realloc_array(ctx, offsets, cap, int64_t);
1185 }
1186 offsets[len++] = ofs;
1187
1188 pdf_populate_next_xref_level(ctx, doc);
1189 ofs = read_xref_section(ctx, doc, ofs, buf);
1190 if (!read_previous)
1191 break;
1192 }
1193 }
1194 fz_always(ctx)
1195 {
1196 fz_free(ctx, offsets);
1197 }
1198 fz_catch(ctx)
1199 {
1200 fz_rethrow(ctx);
1201 }
1202 }
1203
1204 static void
pdf_prime_xref_index(fz_context * ctx,pdf_document * doc)1205 pdf_prime_xref_index(fz_context *ctx, pdf_document *doc)
1206 {
1207 int i, j;
1208 int *idx = doc->xref_index;
1209
1210 for (i = doc->num_xref_sections-1; i >= 0; i--)
1211 {
1212 pdf_xref *xref = &doc->xref_sections[i];
1213 pdf_xref_subsec *subsec = xref->subsec;
1214 while (subsec != NULL)
1215 {
1216 int start = subsec->start;
1217 int end = subsec->start + subsec->len;
1218 for (j = start; j < end; j++)
1219 {
1220 char t = subsec->table[j-start].type;
1221 if (t != 0 && t != 'f')
1222 idx[j] = i;
1223 }
1224
1225 subsec = subsec->next;
1226 }
1227 }
1228 }
1229
1230 /*
1231 * load xref tables from pdf
1232 *
1233 * File locked on entry, throughout and on exit.
1234 */
1235
1236 static void
pdf_load_xref(fz_context * ctx,pdf_document * doc,pdf_lexbuf * buf)1237 pdf_load_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
1238 {
1239 int i;
1240 int xref_len;
1241 pdf_xref_entry *entry;
1242
1243 pdf_read_start_xref(ctx, doc);
1244
1245 pdf_read_xref_sections(ctx, doc, doc->startxref, buf, 1);
1246
1247 if (pdf_xref_len(ctx, doc) == 0)
1248 fz_throw(ctx, FZ_ERROR_GENERIC, "found xref was empty");
1249
1250 pdf_prime_xref_index(ctx, doc);
1251
1252 entry = pdf_get_xref_entry(ctx, doc, 0);
1253 /* broken pdfs where first object is missing */
1254 if (!entry->type)
1255 {
1256 entry->type = 'f';
1257 entry->gen = 65535;
1258 entry->num = 0;
1259 }
1260 /* broken pdfs where first object is not free */
1261 else if (entry->type != 'f')
1262 fz_warn(ctx, "first object in xref is not free");
1263
1264 /* broken pdfs where object offsets are out of range */
1265 xref_len = pdf_xref_len(ctx, doc);
1266 for (i = 0; i < xref_len; i++)
1267 {
1268 entry = pdf_get_xref_entry(ctx, doc, i);
1269 if (entry->type == 'n')
1270 {
1271 /* Special case code: "0000000000 * n" means free,
1272 * according to some producers (inc Quartz) */
1273 if (entry->ofs == 0)
1274 entry->type = 'f';
1275 else if (entry->ofs <= 0 || entry->ofs >= doc->file_size)
1276 fz_throw(ctx, FZ_ERROR_GENERIC, "object offset out of range: %d (%d 0 R)", (int)entry->ofs, i);
1277 }
1278 if (entry->type == 'o')
1279 {
1280 /* Read this into a local variable here, because pdf_get_xref_entry
1281 * may solidify the xref, hence invalidating "entry", meaning we
1282 * need a stashed value for the throw. */
1283 int64_t ofs = entry->ofs;
1284 if (ofs <= 0 || ofs >= xref_len || pdf_get_xref_entry(ctx, doc, ofs)->type != 'n')
1285 fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to an objstm that does not exist: %d (%d 0 R)", (int)ofs, i);
1286 }
1287 }
1288 }
1289
1290 static void
pdf_check_linear(fz_context * ctx,pdf_document * doc)1291 pdf_check_linear(fz_context *ctx, pdf_document *doc)
1292 {
1293 pdf_obj *dict = NULL;
1294 pdf_obj *o;
1295 int num, gen;
1296 int64_t stmofs;
1297
1298 fz_var(dict);
1299
1300 fz_try(ctx)
1301 {
1302 dict = pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base, &num, &gen, &stmofs, NULL);
1303 if (!pdf_is_dict(ctx, dict))
1304 break;
1305 o = pdf_dict_get(ctx, dict, PDF_NAME(Linearized));
1306 if (o == NULL)
1307 break;
1308 if (pdf_to_int(ctx, o) != 1)
1309 break;
1310 doc->has_linearization_object = 1;
1311 }
1312 fz_always(ctx)
1313 pdf_drop_obj(ctx, dict);
1314 fz_catch(ctx)
1315 {
1316 /* Silently swallow this error. */
1317 }
1318 }
1319
1320 static void
pdf_load_linear(fz_context * ctx,pdf_document * doc)1321 pdf_load_linear(fz_context *ctx, pdf_document *doc)
1322 {
1323 pdf_obj *dict = NULL;
1324 pdf_obj *hint = NULL;
1325 pdf_obj *o;
1326 int num, gen, lin, len;
1327 int64_t stmofs;
1328
1329 fz_var(dict);
1330 fz_var(hint);
1331
1332 fz_try(ctx)
1333 {
1334 pdf_xref_entry *entry;
1335
1336 dict = pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base, &num, &gen, &stmofs, NULL);
1337 if (!pdf_is_dict(ctx, dict))
1338 fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to read linearized dictionary");
1339 o = pdf_dict_get(ctx, dict, PDF_NAME(Linearized));
1340 if (o == NULL)
1341 fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to read linearized dictionary");
1342 lin = pdf_to_int(ctx, o);
1343 if (lin != 1)
1344 fz_throw(ctx, FZ_ERROR_GENERIC, "Unexpected version of Linearized tag (%d)", lin);
1345 doc->has_linearization_object = 1;
1346 len = pdf_dict_get_int(ctx, dict, PDF_NAME(L));
1347 if (len != doc->file_length)
1348 fz_throw(ctx, FZ_ERROR_GENERIC, "File has been updated since linearization");
1349
1350 pdf_read_xref_sections(ctx, doc, fz_tell(ctx, doc->file), &doc->lexbuf.base, 0);
1351
1352 doc->linear_page_count = pdf_dict_get_int(ctx, dict, PDF_NAME(N));
1353 doc->linear_page_refs = fz_realloc_array(ctx, doc->linear_page_refs, doc->linear_page_count, pdf_obj *);
1354 memset(doc->linear_page_refs, 0, doc->linear_page_count * sizeof(pdf_obj*));
1355 doc->linear_obj = dict;
1356 doc->linear_pos = fz_tell(ctx, doc->file);
1357 doc->linear_page1_obj_num = pdf_dict_get_int(ctx, dict, PDF_NAME(O));
1358 doc->linear_page_refs[0] = pdf_new_indirect(ctx, doc, doc->linear_page1_obj_num, 0);
1359 doc->linear_page_num = 0;
1360 hint = pdf_dict_get(ctx, dict, PDF_NAME(H));
1361 doc->hint_object_offset = pdf_array_get_int(ctx, hint, 0);
1362 doc->hint_object_length = pdf_array_get_int(ctx, hint, 1);
1363
1364 entry = pdf_get_populating_xref_entry(ctx, doc, 0);
1365 entry->type = 'f';
1366 }
1367 fz_catch(ctx)
1368 {
1369 pdf_drop_obj(ctx, dict);
1370 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1371 /* Drop back to non linearized reading mode */
1372 doc->file_reading_linearly = 0;
1373 }
1374 }
1375
/*
 * Initialize and load xref tables.
 *
 * Loads the xref (linearized or normal), falling back to xref repair if
 * that fails. Then sets up decryption when the trailer has an Encrypt
 * dictionary and tries to authenticate with a blank password. After a
 * repair it also tries to locate missing Root / Info objects. Finally it
 * reads the optional content configuration, ignoring errors there.
 *
 * FZ_ERROR_TRYLATER is always propagated to the caller.
 */

static void
pdf_init_document(fz_context *ctx, pdf_document *doc)
{
	pdf_obj *encrypt, *id;
	pdf_obj *dict = NULL;
	pdf_obj *obj;
	pdf_obj *nobj = NULL;
	int i, repaired = 0;

	fz_var(dict);
	fz_var(nobj);

	fz_try(ctx)
	{
		/* Check to see if we should work in progressive mode */
		if (doc->file->progressive)
		{
			doc->file_reading_linearly = 1;
			/* Measure the file by seeking to its end and back. */
			fz_seek(ctx, doc->file, 0, SEEK_END);
			doc->file_length = fz_tell(ctx, doc->file);
			if (doc->file_length < 0)
				doc->file_length = 0;
			fz_seek(ctx, doc->file, 0, SEEK_SET);
		}

		pdf_load_version(ctx, doc);

		/* Try to load the linearized file if we are in progressive
		 * mode. */
		if (doc->file_reading_linearly)
			pdf_load_linear(ctx, doc);
		else
			/* Even if we're not in progressive mode, check to see
			 * if the file claims to be linearized. This is important
			 * for checking signatures later on. */
			pdf_check_linear(ctx, doc);

		/* If we aren't in progressive mode (or the linear load failed
		 * and has set us back to non-progressive mode), load normally.
		 */
		if (!doc->file_reading_linearly)
			pdf_load_xref(ctx, doc, &doc->lexbuf.base);
	}
	fz_catch(ctx)
	{
		/* Throw away whatever was loaded so the repair starts clean. */
		pdf_drop_xref_sections(ctx, doc);
		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
		fz_warn(ctx, "trying to repair broken xref");
		repaired = 1;
	}

	fz_try(ctx)
	{
		int hasroot, hasinfo;

		if (repaired)
		{
			/* pdf_repair_xref may access xref_index, so reset it properly */
			if (doc->xref_index)
				memset(doc->xref_index, 0, sizeof(int) * doc->max_xref_len);
			pdf_repair_xref(ctx, doc);
			pdf_prime_xref_index(ctx, doc);
		}

		encrypt = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt));
		id = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID));
		if (pdf_is_dict(ctx, encrypt))
			doc->crypt = pdf_new_crypt(ctx, encrypt, id);

		/* Allow lazy clients to read encrypted files with a blank password */
		pdf_authenticate_password(ctx, doc, "");

		if (repaired)
		{
			int xref_len = pdf_xref_len(ctx, doc);
			pdf_repair_obj_stms(ctx, doc);

			hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL);
			hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL);

			/* Scan the objects for a Catalog (Root) and for something
			 * that looks like an Info dictionary.
			 * NOTE(review): as written the scan only runs while BOTH
			 * are missing and stops as soon as either is found, which
			 * makes the per-key "if (!hasroot)" / "if (!hasinfo)"
			 * checks below redundant. Confirm whether
			 * "(!hasinfo || !hasroot)" was intended. */
			for (i = 1; i < xref_len && !hasinfo && !hasroot; ++i)
			{
				pdf_xref_entry *entry = pdf_get_xref_entry(ctx, doc, i);
				if (entry->type == 0 || entry->type == 'f')
					continue;

				fz_try(ctx)
				{
					dict = pdf_load_object(ctx, doc, i);
				}
				fz_catch(ctx)
				{
					fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
					fz_warn(ctx, "ignoring broken object (%d 0 R)", i);
					continue;
				}

				if (!hasroot)
				{
					obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
					if (pdf_name_eq(ctx, obj, PDF_NAME(Catalog)))
					{
						nobj = pdf_new_indirect(ctx, doc, i, 0);
						pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj);
						hasroot = 1;
					}
				}

				if (!hasinfo)
				{
					/* Heuristic: a dictionary with Creator or Producer
					 * is taken to be the Info dictionary. */
					if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer)))
					{
						nobj = pdf_new_indirect(ctx, doc, i, 0);
						pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj);
						hasinfo = 1;
					}
				}

				/* Reset to NULL so the outer catch cannot drop it twice. */
				pdf_drop_obj(ctx, dict);
				dict = NULL;
			}

			/* ensure that strings are not used in their repaired, non-decrypted form */
			if (doc->crypt)
				pdf_clear_xref(ctx, doc);
		}
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(ctx, dict);
		fz_rethrow(ctx);
	}

	fz_try(ctx)
	{
		pdf_read_ocg(ctx, doc);
	}
	fz_catch(ctx)
	{
		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
		fz_warn(ctx, "Ignoring broken Optional Content configuration");
	}
}
1524
1525 void
pdf_invalidate_xfa(fz_context * ctx,pdf_document * doc)1526 pdf_invalidate_xfa(fz_context *ctx, pdf_document *doc)
1527 {
1528 int i;
1529
1530 if (doc == NULL)
1531 return;
1532
1533 for (i = 0; i < doc->xfa.count; i++)
1534 {
1535 fz_free(ctx, doc->xfa.entries[i].key);
1536 fz_drop_xml(ctx, doc->xfa.entries[i].value);
1537 }
1538 doc->xfa.count = 0;
1539 fz_free(ctx, doc->xfa.entries);
1540 doc->xfa.entries = 0;
1541 }
1542
/*
 * Release everything owned by a pdf_document: the glyph cache contents,
 * javascript state, xref sections and index, the file stream, crypt
 * state, linearization data, hint tables, type3 fonts, optional content,
 * the store, the lex buffer, resource tables, output intent colorspace,
 * orphaned objects, the reverse page map and the XFA cache.
 *
 * Errors raised by the glyph cache purge and by decoupling type3 fonts
 * are swallowed so that the rest of the teardown still runs.
 */
static void
pdf_drop_document_imp(fz_context *ctx, pdf_document *doc)
{
	int i;

	/* Bracket most of the teardown with fz_defer_reap_start/end. */
	fz_defer_reap_start(ctx);

	/* Type3 glyphs in the glyph cache can contain pdf_obj pointers
	 * that we are about to destroy. Simplest solution is to bin the
	 * glyph cache at this point. */
	fz_try(ctx)
		fz_purge_glyph_cache(ctx);
	fz_catch(ctx)
	{
		/* Swallow error, but continue dropping */
	}

	pdf_drop_js(ctx, doc->js);

	pdf_drop_xref_sections(ctx, doc);
	fz_free(ctx, doc->xref_index);

	fz_drop_stream(ctx, doc->file);
	pdf_drop_crypt(ctx, doc->crypt);

	/* Linearization state: the dictionary and one reference per page. */
	pdf_drop_obj(ctx, doc->linear_obj);
	if (doc->linear_page_refs)
	{
		for (i=0; i < doc->linear_page_count; i++)
			pdf_drop_obj(ctx, doc->linear_page_refs[i]);

		fz_free(ctx, doc->linear_page_refs);
	}

	/* Hint tables. */
	fz_free(ctx, doc->hint_page);
	fz_free(ctx, doc->hint_shared_ref);
	fz_free(ctx, doc->hint_shared);
	fz_free(ctx, doc->hint_obj_offsets);

	for (i=0; i < doc->num_type3_fonts; i++)
	{
		/* The font is dropped in fz_always, so it is released even if
		 * decoupling it from this document throws. */
		fz_try(ctx)
			fz_decouple_type3_font(ctx, doc->type3_fonts[i], (void *)doc);
		fz_always(ctx)
			fz_drop_font(ctx, doc->type3_fonts[i]);
		fz_catch(ctx)
		{
			/* Swallow error, but continue dropping */
		}
	}

	fz_free(ctx, doc->type3_fonts);

	pdf_drop_ocg(ctx, doc);

	pdf_empty_store(ctx, doc);

	pdf_lexbuf_fin(ctx, &doc->lexbuf.base);

	pdf_drop_resource_tables(ctx, doc);

	fz_drop_colorspace(ctx, doc->oi);

	for (i = 0; i < doc->orphans_count; i++)
		pdf_drop_obj(ctx, doc->orphans[i]);

	fz_free(ctx, doc->orphans);

	fz_free(ctx, doc->rev_page_map);

	fz_defer_reap_end(ctx);

	pdf_invalidate_xfa(ctx, doc);
}
1617
1618 void
pdf_drop_document(fz_context * ctx,pdf_document * doc)1619 pdf_drop_document(fz_context *ctx, pdf_document *doc)
1620 {
1621 fz_drop_document(ctx, &doc->super);
1622 }
1623
1624 pdf_document *
pdf_keep_document(fz_context * ctx,pdf_document * doc)1625 pdf_keep_document(fz_context *ctx, pdf_document *doc)
1626 {
1627 return (pdf_document *)fz_keep_document(ctx, &doc->super);
1628 }
1629
1630 /*
1631 * compressed object streams
1632 */
1633
1634 static pdf_xref_entry *
pdf_load_obj_stm(fz_context * ctx,pdf_document * doc,int num,pdf_lexbuf * buf,int target)1635 pdf_load_obj_stm(fz_context *ctx, pdf_document *doc, int num, pdf_lexbuf *buf, int target)
1636 {
1637 fz_stream *stm = NULL;
1638 pdf_obj *objstm = NULL;
1639 int *numbuf = NULL;
1640 int64_t *ofsbuf = NULL;
1641
1642 pdf_obj *obj;
1643 int64_t first;
1644 int count;
1645 int i;
1646 pdf_token tok;
1647 pdf_xref_entry *ret_entry = NULL;
1648 int xref_len;
1649 int found;
1650
1651 fz_var(numbuf);
1652 fz_var(ofsbuf);
1653 fz_var(objstm);
1654 fz_var(stm);
1655
1656 fz_try(ctx)
1657 {
1658 objstm = pdf_load_object(ctx, doc, num);
1659
1660 if (pdf_obj_marked(ctx, objstm))
1661 fz_throw(ctx, FZ_ERROR_GENERIC, "recursive object stream lookup");
1662 }
1663 fz_catch(ctx)
1664 {
1665 pdf_drop_obj(ctx, objstm);
1666 fz_rethrow(ctx);
1667 }
1668
1669 fz_try(ctx)
1670 {
1671 pdf_mark_obj(ctx, objstm);
1672
1673 count = pdf_dict_get_int(ctx, objstm, PDF_NAME(N));
1674 first = pdf_dict_get_int(ctx, objstm, PDF_NAME(First));
1675
1676 if (count < 0 || count > PDF_MAX_OBJECT_NUMBER)
1677 fz_throw(ctx, FZ_ERROR_GENERIC, "number of objects in object stream out of range");
1678 if (first < 0 || first > PDF_MAX_OBJECT_NUMBER
1679 || count < 0 || count > PDF_MAX_OBJECT_NUMBER
1680 || first + count - 1 > PDF_MAX_OBJECT_NUMBER)
1681 fz_throw(ctx, FZ_ERROR_GENERIC, "object stream object numbers are out of range");
1682
1683 numbuf = fz_calloc(ctx, count, sizeof(*numbuf));
1684 ofsbuf = fz_calloc(ctx, count, sizeof(*ofsbuf));
1685
1686 xref_len = pdf_xref_len(ctx, doc);
1687
1688 found = 0;
1689
1690 stm = pdf_open_stream_number(ctx, doc, num);
1691 for (i = 0; i < count; i++)
1692 {
1693 tok = pdf_lex(ctx, stm, buf);
1694 if (tok != PDF_TOK_INT)
1695 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", num);
1696 numbuf[found] = buf->i;
1697
1698 tok = pdf_lex(ctx, stm, buf);
1699 if (tok != PDF_TOK_INT)
1700 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", num);
1701 ofsbuf[found] = buf->i;
1702
1703 if (numbuf[found] <= 0 || numbuf[found] >= xref_len)
1704 fz_warn(ctx, "object stream object out of range, skipping");
1705 else
1706 found++;
1707 }
1708
1709 for (i = 0; i < found; i++)
1710 {
1711 pdf_xref_entry *entry;
1712
1713 fz_seek(ctx, stm, first + ofsbuf[i], SEEK_SET);
1714
1715 obj = pdf_parse_stm_obj(ctx, doc, stm, buf);
1716
1717 entry = pdf_get_xref_entry(ctx, doc, numbuf[i]);
1718
1719 pdf_set_obj_parent(ctx, obj, numbuf[i]);
1720
1721 if (entry->type == 'o' && entry->ofs == num)
1722 {
1723 /* If we already have an entry for this object,
1724 * we'd like to drop it and use the new one -
1725 * but this means that anyone currently holding
1726 * a pointer to the old one will be left with a
1727 * stale pointer. Instead, we drop the new one
1728 * and trust that the old one is correct. */
1729 if (entry->obj)
1730 {
1731 if (pdf_objcmp(ctx, entry->obj, obj))
1732 fz_warn(ctx, "Encountered new definition for object %d - keeping the original one", numbuf[i]);
1733 pdf_drop_obj(ctx, obj);
1734 }
1735 else
1736 {
1737 entry->obj = obj;
1738 fz_drop_buffer(ctx, entry->stm_buf);
1739 entry->stm_buf = NULL;
1740 }
1741 if (numbuf[i] == target)
1742 ret_entry = entry;
1743 }
1744 else
1745 {
1746 pdf_drop_obj(ctx, obj);
1747 }
1748 }
1749 }
1750 fz_always(ctx)
1751 {
1752 fz_drop_stream(ctx, stm);
1753 fz_free(ctx, ofsbuf);
1754 fz_free(ctx, numbuf);
1755 pdf_unmark_obj(ctx, objstm);
1756 pdf_drop_obj(ctx, objstm);
1757 }
1758 fz_catch(ctx)
1759 {
1760 fz_rethrow(ctx);
1761 }
1762 return ret_entry;
1763 }
1764
1765 /*
1766 * object loading
1767 */
1768 static int
pdf_obj_read(fz_context * ctx,pdf_document * doc,int64_t * offset,int * nump,pdf_obj ** page)1769 pdf_obj_read(fz_context *ctx, pdf_document *doc, int64_t *offset, int *nump, pdf_obj **page)
1770 {
1771 pdf_lexbuf *buf = &doc->lexbuf.base;
1772 int num, gen, tok;
1773 int64_t numofs, genofs, stmofs, tmpofs, newtmpofs;
1774 int xref_len;
1775 pdf_xref_entry *entry;
1776
1777 numofs = *offset;
1778 fz_seek(ctx, doc->file, numofs, SEEK_SET);
1779
1780 /* We expect to read 'num' here */
1781 tok = pdf_lex(ctx, doc->file, buf);
1782 genofs = fz_tell(ctx, doc->file);
1783 if (tok != PDF_TOK_INT)
1784 {
1785 /* Failed! */
1786 DEBUGMESS((ctx, "skipping unexpected data (tok=%d) at %d", tok, *offset));
1787 *offset = genofs;
1788 return tok == PDF_TOK_EOF;
1789 }
1790 *nump = num = buf->i;
1791
1792 /* We expect to read 'gen' here */
1793 tok = pdf_lex(ctx, doc->file, buf);
1794 tmpofs = fz_tell(ctx, doc->file);
1795 if (tok != PDF_TOK_INT)
1796 {
1797 /* Failed! */
1798 DEBUGMESS((ctx, "skipping unexpected data after \"%d\" (tok=%d) at %d", num, tok, *offset));
1799 *offset = tmpofs;
1800 return tok == PDF_TOK_EOF;
1801 }
1802 gen = buf->i;
1803
1804 /* We expect to read 'obj' here */
1805 do
1806 {
1807 tmpofs = fz_tell(ctx, doc->file);
1808 tok = pdf_lex(ctx, doc->file, buf);
1809 if (tok == PDF_TOK_OBJ)
1810 break;
1811 if (tok != PDF_TOK_INT)
1812 {
1813 DEBUGMESS((ctx, "skipping unexpected data (tok=%d) at %d", tok, tmpofs));
1814 *offset = fz_tell(ctx, doc->file);
1815 return tok == PDF_TOK_EOF;
1816 }
1817 DEBUGMESS((ctx, "skipping unexpected int %d at %d", num, numofs));
1818 *nump = num = gen;
1819 numofs = genofs;
1820 gen = buf->i;
1821 genofs = tmpofs;
1822 }
1823 while (1);
1824
1825 /* Now we read the actual object */
1826 xref_len = pdf_xref_len(ctx, doc);
1827
1828 /* When we are reading a progressive file, we typically see:
1829 * File Header
1830 * obj m (Linearization params)
1831 * xref #1 (refers to objects m-n)
1832 * obj m+1
1833 * ...
1834 * obj n
1835 * obj 1
1836 * ...
1837 * obj n-1
1838 * xref #2
1839 *
1840 * The linearisation params are read elsewhere, hence
1841 * whenever we read an object it should just go into the
1842 * previous xref.
1843 */
1844 tok = pdf_repair_obj(ctx, doc, buf, &stmofs, NULL, NULL, NULL, page, &newtmpofs, NULL);
1845
1846 do /* So we can break out of it */
1847 {
1848 if (num <= 0 || num >= xref_len)
1849 {
1850 fz_warn(ctx, "Not a valid object number (%d %d obj)", num, gen);
1851 break;
1852 }
1853 if (gen != 0)
1854 {
1855 fz_warn(ctx, "Unexpected non zero generation number in linearized file");
1856 }
1857 entry = pdf_get_populating_xref_entry(ctx, doc, num);
1858 if (entry->type != 0)
1859 {
1860 DEBUGMESS((ctx, "Duplicate object found (%d %d obj)", num, gen));
1861 break;
1862 }
1863 if (page && *page)
1864 {
1865 DEBUGMESS((ctx, "Successfully read object %d @ %d - and found page %d!", num, numofs, doc->linear_page_num));
1866 if (!entry->obj)
1867 entry->obj = pdf_keep_obj(ctx, *page);
1868
1869 if (doc->linear_page_refs[doc->linear_page_num] == NULL)
1870 doc->linear_page_refs[doc->linear_page_num] = pdf_new_indirect(ctx, doc, num, gen);
1871 }
1872 else
1873 {
1874 DEBUGMESS((ctx, "Successfully read object %d @ %d", num, numofs));
1875 }
1876 entry->type = 'n';
1877 entry->gen = gen; // XXX: was 0
1878 entry->num = num;
1879 entry->ofs = numofs;
1880 entry->stm_ofs = stmofs;
1881 }
1882 while (0);
1883 if (page && *page)
1884 doc->linear_page_num++;
1885
1886 if (tok == PDF_TOK_ENDOBJ)
1887 {
1888 *offset = fz_tell(ctx, doc->file);
1889 }
1890 else
1891 {
1892 *offset = newtmpofs;
1893 }
1894 return 0;
1895 }
1896
1897 static void
pdf_load_hinted_page(fz_context * ctx,pdf_document * doc,int pagenum)1898 pdf_load_hinted_page(fz_context *ctx, pdf_document *doc, int pagenum)
1899 {
1900 pdf_obj *page = NULL;
1901
1902 if (!doc->hints_loaded || !doc->linear_page_refs)
1903 return;
1904
1905 if (doc->linear_page_refs[pagenum])
1906 return;
1907
1908 fz_var(page);
1909
1910 fz_try(ctx)
1911 {
1912 int num = doc->hint_page[pagenum].number;
1913 page = pdf_load_object(ctx, doc, num);
1914 if (pdf_name_eq(ctx, PDF_NAME(Page), pdf_dict_get(ctx, page, PDF_NAME(Type))))
1915 {
1916 /* We have found the page object! */
1917 DEBUGMESS((ctx, "LoadHintedPage pagenum=%d num=%d", pagenum, num));
1918 doc->linear_page_refs[pagenum] = pdf_new_indirect(ctx, doc, num, 0);
1919 }
1920 }
1921 fz_always(ctx)
1922 pdf_drop_obj(ctx, page);
1923 fz_catch(ctx)
1924 {
1925 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1926 /* Silently swallow the error and proceed as normal */
1927 }
1928 }
1929
1930 static int
read_hinted_object(fz_context * ctx,pdf_document * doc,int num)1931 read_hinted_object(fz_context *ctx, pdf_document *doc, int num)
1932 {
1933 /* Try to find the object using our hint table. Find the closest
1934 * object <= the one we want that has a hint and read forward from
1935 * there. */
1936 int expected = num;
1937 int curr_pos;
1938 int64_t start, offset;
1939
1940 while (doc->hint_obj_offsets[expected] == 0 && expected > 0)
1941 expected--;
1942 if (expected != num)
1943 DEBUGMESS((ctx, "object %d is unhinted, will search forward from %d", expected, num));
1944 if (expected == 0) /* No hints found, just bail */
1945 return 0;
1946
1947 curr_pos = fz_tell(ctx, doc->file);
1948 offset = doc->hint_obj_offsets[expected];
1949
1950 fz_var(expected);
1951
1952 fz_try(ctx)
1953 {
1954 int found;
1955
1956 /* Try to read forward from there */
1957 do
1958 {
1959 start = offset;
1960 DEBUGMESS((ctx, "Searching for object %d @ %d", expected, offset));
1961 pdf_obj_read(ctx, doc, &offset, &found, 0);
1962 DEBUGMESS((ctx, "Found object %d - next will be @ %d", found, offset));
1963 if (found <= expected)
1964 {
1965 /* We found the right one (or one earlier than
1966 * we expected). Update the hints. */
1967 doc->hint_obj_offsets[expected] = offset;
1968 doc->hint_obj_offsets[found] = start;
1969 doc->hint_obj_offsets[found+1] = offset;
1970 /* Retry with the next one */
1971 expected = found+1;
1972 }
1973 else
1974 {
1975 /* We found one later than we expected. */
1976 doc->hint_obj_offsets[expected] = 0;
1977 doc->hint_obj_offsets[found] = start;
1978 doc->hint_obj_offsets[found+1] = offset;
1979 while (doc->hint_obj_offsets[expected] == 0 && expected > 0)
1980 expected--;
1981 if (expected == 0) /* No hints found, we give up */
1982 break;
1983 }
1984 }
1985 while (found != num);
1986 }
1987 fz_always(ctx)
1988 {
1989 fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
1990 }
1991 fz_catch(ctx)
1992 {
1993 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1994 /* FIXME: Currently we ignore the hint. Perhaps we should
1995 * drop back to non-hinted operation here. */
1996 doc->hint_obj_offsets[expected] = 0;
1997 fz_rethrow(ctx);
1998 }
1999 return expected != 0;
2000 }
2001
2002 pdf_obj *
pdf_load_unencrypted_object(fz_context * ctx,pdf_document * doc,int num)2003 pdf_load_unencrypted_object(fz_context *ctx, pdf_document *doc, int num)
2004 {
2005 pdf_xref_entry *x;
2006
2007 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2008 fz_throw(ctx, FZ_ERROR_GENERIC, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2009
2010 x = pdf_get_xref_entry(ctx, doc, num);
2011 if (x->type == 'n')
2012 {
2013 fz_seek(ctx, doc->file, x->ofs, SEEK_SET);
2014 return pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base, NULL, NULL, NULL, NULL);
2015 }
2016 return NULL;
2017 }
2018
/*
 * Return the xref entry for object 'num', loading the object into the
 * entry if it is not cached yet.
 *
 * How the object is obtained depends on the entry type:
 *   'f'  free: the entry's object becomes PDF_NULL.
 *   'n'  in use: parsed from the file at the entry's offset, then
 *        decrypted if the document has a crypt.
 *   'o'  compressed: loaded from its object stream.
 *   else the hint table is tried; failing that an error is thrown
 *        (FZ_ERROR_TRYLATER if the file has not been fully read yet).
 *
 * If parsing an in-use object asks for a repair, or the object found at
 * the offset has a different number, the xref is repaired and the lookup
 * restarts from the top.
 *
 * Throws if 'num' is outside the xref or the object cannot be found.
 */
pdf_xref_entry *
pdf_cache_object(fz_context *ctx, pdf_document *doc, int num)
{
	pdf_xref_entry *x;
	int rnum, rgen, try_repair;

	fz_var(try_repair);

	if (num <= 0 || num >= pdf_xref_len(ctx, doc))
		fz_throw(ctx, FZ_ERROR_GENERIC, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));

	/* Re-entered after a repair or a successful hinted read, since either
	 * may have changed the xref entry for this object. */
object_updated:
	try_repair = 0;
	rnum = num;

	x = pdf_get_xref_entry(ctx, doc, num);

	/* Already cached. */
	if (x->obj != NULL)
		return x;

	if (x->type == 'f')
	{
		x->obj = PDF_NULL;
	}
	else if (x->type == 'n')
	{
		fz_seek(ctx, doc->file, x->ofs, SEEK_SET);

		fz_try(ctx)
		{
			x->obj = pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base,
					&rnum, &rgen, &x->stm_ofs, &try_repair);
		}
		fz_catch(ctx)
		{
			/* Only swallow the error when the parser asked for a
			 * repair; FZ_ERROR_TRYLATER is always passed on. */
			if (!try_repair || fz_caught(ctx) == FZ_ERROR_TRYLATER)
				fz_rethrow(ctx);
		}

		if (!try_repair && rnum != num)
		{
			/* The object at this offset is not the one the xref
			 * promised: mark the entry free and repair, unless a
			 * repair has already been attempted. */
			pdf_drop_obj(ctx, x->obj);
			x->type = 'f';
			x->ofs = -1;
			x->gen = 0;
			x->num = 0;
			x->stm_ofs = 0;
			x->obj = NULL;
			try_repair = (doc->repair_attempted == 0);
		}

		if (try_repair)
		{
			fz_try(ctx)
			{
				pdf_repair_xref(ctx, doc);
				pdf_prime_xref_index(ctx, doc);
				pdf_repair_obj_stms(ctx, doc);
			}
			fz_catch(ctx)
			{
				fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
				if (rnum == num)
					fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse object (%d 0 R)", num);
				else
					fz_throw(ctx, FZ_ERROR_GENERIC, "found object (%d 0 R) instead of (%d 0 R)", rnum, num);
			}
			goto object_updated;
		}

		/* NOTE(review): when the object number mismatched and a repair
		 * had already been attempted, x->obj is NULL at this point. */
		if (doc->crypt)
			pdf_crypt_obj(ctx, doc->crypt, x->obj, x->num, x->gen);
	}
	else if (x->type == 'o')
	{
		if (!x->obj)
		{
			/* For compressed objects x->ofs holds the number of the
			 * containing object stream. */
			x = pdf_load_obj_stm(ctx, doc, x->ofs, &doc->lexbuf.base, num);
			if (x == NULL)
				fz_throw(ctx, FZ_ERROR_GENERIC, "cannot load object stream containing object (%d 0 R)", num);
			if (!x->obj)
				fz_throw(ctx, FZ_ERROR_GENERIC, "object (%d 0 R) was not found in its object stream", num);
		}
	}
	else if (doc->hint_obj_offsets && read_hinted_object(ctx, doc, num))
	{
		goto object_updated;
	}
	else if (doc->file_length && doc->linear_pos < doc->file_length)
	{
		/* Not all of the file has been read yet. */
		fz_throw(ctx, FZ_ERROR_TRYLATER, "cannot find object in xref (%d 0 R) - not loaded yet?", num);
	}
	else
	{
		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find object in xref (%d 0 R)", num);
	}

	pdf_set_obj_parent(ctx, x->obj, num);
	return x;
}
2119
2120 pdf_obj *
pdf_load_object(fz_context * ctx,pdf_document * doc,int num)2121 pdf_load_object(fz_context *ctx, pdf_document *doc, int num)
2122 {
2123 pdf_xref_entry *entry = pdf_cache_object(ctx, doc, num);
2124 return pdf_keep_obj(ctx, entry->obj);
2125 }
2126
2127 pdf_obj *
pdf_resolve_indirect(fz_context * ctx,pdf_obj * ref)2128 pdf_resolve_indirect(fz_context *ctx, pdf_obj *ref)
2129 {
2130 if (pdf_is_indirect(ctx, ref))
2131 {
2132 pdf_document *doc = pdf_get_indirect_document(ctx, ref);
2133 int num = pdf_to_num(ctx, ref);
2134 pdf_xref_entry *entry;
2135
2136 if (!doc)
2137 return NULL;
2138 if (num <= 0)
2139 {
2140 fz_warn(ctx, "invalid indirect reference (%d 0 R)", num);
2141 return NULL;
2142 }
2143
2144 fz_try(ctx)
2145 entry = pdf_cache_object(ctx, doc, num);
2146 fz_catch(ctx)
2147 {
2148 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2149 fz_warn(ctx, "cannot load object (%d 0 R) into cache", num);
2150 return NULL;
2151 }
2152
2153 ref = entry->obj;
2154 }
2155 return ref;
2156 }
2157
2158 pdf_obj *
pdf_resolve_indirect_chain(fz_context * ctx,pdf_obj * ref)2159 pdf_resolve_indirect_chain(fz_context *ctx, pdf_obj *ref)
2160 {
2161 int sanity = 10;
2162
2163 while (pdf_is_indirect(ctx, ref))
2164 {
2165 if (--sanity == 0)
2166 {
2167 fz_warn(ctx, "too many indirections (possible indirection cycle involving %d 0 R)", pdf_to_num(ctx, ref));
2168 return NULL;
2169 }
2170
2171 ref = pdf_resolve_indirect(ctx, ref);
2172 }
2173
2174 return ref;
2175 }
2176
2177 int
pdf_count_objects(fz_context * ctx,pdf_document * doc)2178 pdf_count_objects(fz_context *ctx, pdf_document *doc)
2179 {
2180 return pdf_xref_len(ctx, doc);
2181 }
2182
2183 int
pdf_create_object(fz_context * ctx,pdf_document * doc)2184 pdf_create_object(fz_context *ctx, pdf_document *doc)
2185 {
2186 /* TODO: reuse free object slots by properly linking free object chains in the ofs field */
2187 pdf_xref_entry *entry;
2188 int num = pdf_xref_len(ctx, doc);
2189
2190 if (num > PDF_MAX_OBJECT_NUMBER)
2191 fz_throw(ctx, FZ_ERROR_GENERIC, "too many objects stored in pdf");
2192
2193 entry = pdf_get_incremental_xref_entry(ctx, doc, num);
2194 entry->type = 'f';
2195 entry->ofs = -1;
2196 entry->gen = 0;
2197 entry->num = num;
2198 entry->stm_ofs = 0;
2199 entry->stm_buf = NULL;
2200 entry->obj = NULL;
2201 return num;
2202 }
2203
2204 void
pdf_delete_object(fz_context * ctx,pdf_document * doc,int num)2205 pdf_delete_object(fz_context *ctx, pdf_document *doc, int num)
2206 {
2207 pdf_xref_entry *x;
2208
2209 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2210 {
2211 fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2212 return;
2213 }
2214
2215 x = pdf_get_incremental_xref_entry(ctx, doc, num);
2216
2217 fz_drop_buffer(ctx, x->stm_buf);
2218 pdf_drop_obj(ctx, x->obj);
2219
2220 x->type = 'f';
2221 x->ofs = 0;
2222 x->gen += 1;
2223 x->num = 0;
2224 x->stm_ofs = 0;
2225 x->stm_buf = NULL;
2226 x->obj = NULL;
2227 }
2228
2229 void
pdf_update_object(fz_context * ctx,pdf_document * doc,int num,pdf_obj * newobj)2230 pdf_update_object(fz_context *ctx, pdf_document *doc, int num, pdf_obj *newobj)
2231 {
2232 pdf_xref_entry *x;
2233
2234 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2235 {
2236 fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2237 return;
2238 }
2239
2240 if (!newobj)
2241 {
2242 pdf_delete_object(ctx, doc, num);
2243 return;
2244 }
2245
2246 x = pdf_get_incremental_xref_entry(ctx, doc, num);
2247
2248 pdf_drop_obj(ctx, x->obj);
2249
2250 x->type = 'n';
2251 x->ofs = 0;
2252 x->obj = pdf_keep_obj(ctx, newobj);
2253
2254 pdf_set_obj_parent(ctx, newobj, num);
2255 }
2256
2257 void
pdf_update_stream(fz_context * ctx,pdf_document * doc,pdf_obj * obj,fz_buffer * newbuf,int compressed)2258 pdf_update_stream(fz_context *ctx, pdf_document *doc, pdf_obj *obj, fz_buffer *newbuf, int compressed)
2259 {
2260 int num;
2261 pdf_xref_entry *x;
2262
2263 if (pdf_is_indirect(ctx, obj))
2264 num = pdf_to_num(ctx, obj);
2265 else
2266 num = pdf_obj_parent_num(ctx, obj);
2267 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2268 {
2269 fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2270 return;
2271 }
2272
2273 x = pdf_get_xref_entry(ctx, doc, num);
2274
2275 fz_drop_buffer(ctx, x->stm_buf);
2276 x->stm_buf = fz_keep_buffer(ctx, newbuf);
2277
2278 pdf_dict_put_int(ctx, obj, PDF_NAME(Length), (int)fz_buffer_storage(ctx, newbuf, NULL));
2279 if (!compressed)
2280 {
2281 pdf_dict_del(ctx, obj, PDF_NAME(Filter));
2282 pdf_dict_del(ctx, obj, PDF_NAME(DecodeParms));
2283 }
2284 }
2285
2286 int
pdf_lookup_metadata(fz_context * ctx,pdf_document * doc,const char * key,char * buf,int size)2287 pdf_lookup_metadata(fz_context *ctx, pdf_document *doc, const char *key, char *buf, int size)
2288 {
2289 if (!strcmp(key, FZ_META_FORMAT))
2290 {
2291 int version = pdf_version(ctx, doc);
2292 return 1 + (int)fz_snprintf(buf, size, "PDF %d.%d", version/10, version % 10);
2293 }
2294
2295 if (!strcmp(key, FZ_META_ENCRYPTION))
2296 {
2297 if (doc->crypt)
2298 return 1 + (int)fz_snprintf(buf, size, "Standard V%d R%d %d-bit %s",
2299 pdf_crypt_version(ctx, doc->crypt),
2300 pdf_crypt_revision(ctx, doc->crypt),
2301 pdf_crypt_length(ctx, doc->crypt),
2302 pdf_crypt_method(ctx, doc->crypt));
2303 else
2304 return 1 + (int)fz_strlcpy(buf, "None", size);
2305 }
2306
2307 if (strstr(key, "info:") == key)
2308 {
2309 pdf_obj *info;
2310 const char *s;
2311 int n;
2312
2313 info = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info));
2314 if (!info)
2315 return -1;
2316
2317 info = pdf_dict_gets(ctx, info, key + 5);
2318 if (!info)
2319 return -1;
2320
2321 s = pdf_to_text_string(ctx, info);
2322 n = 1 + (int)fz_strlcpy(buf, s, size);
2323 return n;
2324 }
2325
2326 return -1;
2327 }
2328
2329
2330 static fz_location
pdf_resolve_link_imp(fz_context * ctx,fz_document * doc_,const char * uri,float * xp,float * yp)2331 pdf_resolve_link_imp(fz_context *ctx, fz_document *doc_, const char *uri, float *xp, float *yp)
2332 {
2333 pdf_document *doc = (pdf_document*)doc_;
2334 return fz_make_location(0, pdf_resolve_link(ctx, doc, uri, xp, yp));
2335 }
2336
2337 /*
2338 Initializers for the fz_document interface.
2339
2340 The functions are split across two files to allow calls to a
2341 version of the constructor that does not link in the interpreter.
2342 The interpreter references the built-in font and cmap resources
2343 which are quite big. Not linking those into the mutool binary
2344 saves roughly 6MB of space.
2345 */
2346
2347 static pdf_document *
pdf_new_document(fz_context * ctx,fz_stream * file)2348 pdf_new_document(fz_context *ctx, fz_stream *file)
2349 {
2350 pdf_document *doc = fz_new_derived_document(ctx, pdf_document);
2351
2352 doc->super.drop_document = (fz_document_drop_fn*)pdf_drop_document_imp;
2353 doc->super.get_output_intent = (fz_document_output_intent_fn*)pdf_document_output_intent;
2354 doc->super.needs_password = (fz_document_needs_password_fn*)pdf_needs_password;
2355 doc->super.authenticate_password = (fz_document_authenticate_password_fn*)pdf_authenticate_password;
2356 doc->super.has_permission = (fz_document_has_permission_fn*)pdf_has_permission;
2357 doc->super.load_outline = (fz_document_load_outline_fn*)pdf_load_outline;
2358 doc->super.resolve_link = pdf_resolve_link_imp;
2359 doc->super.count_pages = pdf_count_pages_imp;
2360 doc->super.load_page = pdf_load_page_imp;
2361 doc->super.lookup_metadata = (fz_document_lookup_metadata_fn*)pdf_lookup_metadata;
2362
2363 pdf_lexbuf_init(ctx, &doc->lexbuf.base, PDF_LEXBUF_LARGE);
2364 doc->file = fz_keep_stream(ctx, file);
2365
2366 return doc;
2367 }
2368
2369 pdf_document *
pdf_open_document_with_stream(fz_context * ctx,fz_stream * file)2370 pdf_open_document_with_stream(fz_context *ctx, fz_stream *file)
2371 {
2372 pdf_document *doc = pdf_new_document(ctx, file);
2373 fz_try(ctx)
2374 {
2375 pdf_init_document(ctx, doc);
2376 }
2377 fz_catch(ctx)
2378 {
2379 /* fz_drop_document may clobber our error code/message so we have to stash them temporarily. */
2380 char message[256];
2381 int caught = fz_caught(ctx);
2382 fz_strlcpy(message, fz_caught_message(ctx), sizeof message);
2383 fz_drop_document(ctx, &doc->super);
2384 fz_throw(ctx, caught, "%s", message);
2385 }
2386 return doc;
2387 }
2388
2389 pdf_document *
pdf_open_document(fz_context * ctx,const char * filename)2390 pdf_open_document(fz_context *ctx, const char *filename)
2391 {
2392 fz_stream *file = NULL;
2393 pdf_document *doc = NULL;
2394
2395 fz_var(file);
2396 fz_var(doc);
2397
2398 fz_try(ctx)
2399 {
2400 file = fz_open_file(ctx, filename);
2401 doc = pdf_new_document(ctx, file);
2402 pdf_init_document(ctx, doc);
2403 }
2404 fz_always(ctx)
2405 {
2406 fz_drop_stream(ctx, file);
2407 }
2408 fz_catch(ctx)
2409 {
2410 fz_drop_document(ctx, &doc->super);
2411 fz_rethrow(ctx);
2412 }
2413 return doc;
2414 }
2415
2416 static void
pdf_load_hints(fz_context * ctx,pdf_document * doc,int objnum)2417 pdf_load_hints(fz_context *ctx, pdf_document *doc, int objnum)
2418 {
2419 fz_stream *stream = NULL;
2420 pdf_obj *dict;
2421
2422 fz_var(stream);
2423 fz_var(dict);
2424
2425 fz_try(ctx)
2426 {
2427 int i, j, least_num_page_objs, page_obj_num_bits;
2428 int least_page_len, page_len_num_bits, shared_hint_offset;
2429 /* int least_page_offset, page_offset_num_bits; */
2430 /* int least_content_stream_len, content_stream_len_num_bits; */
2431 int num_shared_obj_num_bits, shared_obj_num_bits;
2432 /* int numerator_bits, denominator_bits; */
2433 int shared;
2434 int shared_obj_num, shared_obj_offset, shared_obj_count_page1;
2435 int shared_obj_count_total;
2436 int least_shared_group_len, shared_group_len_num_bits;
2437 int max_object_num = pdf_xref_len(ctx, doc);
2438
2439 stream = pdf_open_stream_number(ctx, doc, objnum);
2440 dict = pdf_get_xref_entry(ctx, doc, objnum)->obj;
2441 if (dict == NULL || !pdf_is_dict(ctx, dict))
2442 fz_throw(ctx, FZ_ERROR_GENERIC, "malformed hint object");
2443
2444 shared_hint_offset = pdf_dict_get_int(ctx, dict, PDF_NAME(S));
2445
2446 /* Malloc the structures (use realloc to cope with the fact we
2447 * may try this several times before enough data is loaded) */
2448 doc->hint_page = fz_realloc_array(ctx, doc->hint_page, doc->linear_page_count+1, pdf_hint_page);
2449 memset(doc->hint_page, 0, sizeof(*doc->hint_page) * (doc->linear_page_count+1));
2450 doc->hint_obj_offsets = fz_realloc_array(ctx, doc->hint_obj_offsets, max_object_num, int64_t);
2451 memset(doc->hint_obj_offsets, 0, sizeof(*doc->hint_obj_offsets) * max_object_num);
2452 doc->hint_obj_offsets_max = max_object_num;
2453
2454 /* Read the page object hints table: Header first */
2455 least_num_page_objs = fz_read_bits(ctx, stream, 32);
2456 /* The following is sometimes a lie, but we read this version,
2457 * as other table values are built from it. In
2458 * pdf_reference17.pdf, this points to 2 objects before the
2459 * first pages page object. */
2460 doc->hint_page[0].offset = fz_read_bits(ctx, stream, 32);
2461 if (doc->hint_page[0].offset > doc->hint_object_offset)
2462 doc->hint_page[0].offset += doc->hint_object_length;
2463 page_obj_num_bits = fz_read_bits(ctx, stream, 16);
2464 least_page_len = fz_read_bits(ctx, stream, 32);
2465 page_len_num_bits = fz_read_bits(ctx, stream, 16);
2466 /* least_page_offset = */ (void) fz_read_bits(ctx, stream, 32);
2467 /* page_offset_num_bits = */ (void) fz_read_bits(ctx, stream, 16);
2468 /* least_content_stream_len = */ (void) fz_read_bits(ctx, stream, 32);
2469 /* content_stream_len_num_bits = */ (void) fz_read_bits(ctx, stream, 16);
2470 num_shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
2471 shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
2472 /* numerator_bits = */ (void) fz_read_bits(ctx, stream, 16);
2473 /* denominator_bits = */ (void) fz_read_bits(ctx, stream, 16);
2474
2475 /* Item 1: Page object numbers */
2476 doc->hint_page[0].number = doc->linear_page1_obj_num;
2477 /* We don't care about the number of objects in the first page */
2478 (void)fz_read_bits(ctx, stream, page_obj_num_bits);
2479 j = 1;
2480 for (i = 1; i < doc->linear_page_count; i++)
2481 {
2482 int delta_page_objs = fz_read_bits(ctx, stream, page_obj_num_bits);
2483
2484 doc->hint_page[i].number = j;
2485 j += least_num_page_objs + delta_page_objs;
2486 }
2487 doc->hint_page[i].number = j; /* Not a real page object */
2488 fz_sync_bits(ctx, stream);
2489 /* Item 2: Page lengths */
2490 j = doc->hint_page[0].offset;
2491 for (i = 0; i < doc->linear_page_count; i++)
2492 {
2493 int delta_page_len = fz_read_bits(ctx, stream, page_len_num_bits);
2494 int old = j;
2495
2496 doc->hint_page[i].offset = j;
2497 j += least_page_len + delta_page_len;
2498 if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
2499 j += doc->hint_object_length;
2500 }
2501 doc->hint_page[i].offset = j;
2502 fz_sync_bits(ctx, stream);
2503 /* Item 3: Shared references */
2504 shared = 0;
2505 for (i = 0; i < doc->linear_page_count; i++)
2506 {
2507 int num_shared_objs = fz_read_bits(ctx, stream, num_shared_obj_num_bits);
2508 doc->hint_page[i].index = shared;
2509 shared += num_shared_objs;
2510 }
2511 doc->hint_page[i].index = shared;
2512 doc->hint_shared_ref = fz_realloc_array(ctx, doc->hint_shared_ref, shared, int);
2513 memset(doc->hint_shared_ref, 0, sizeof(*doc->hint_shared_ref) * shared);
2514 fz_sync_bits(ctx, stream);
2515 /* Item 4: Shared references */
2516 for (i = 0; i < shared; i++)
2517 {
2518 int ref = fz_read_bits(ctx, stream, shared_obj_num_bits);
2519 doc->hint_shared_ref[i] = ref;
2520 }
2521 /* Skip items 5,6,7 as we don't use them */
2522
2523 fz_seek(ctx, stream, shared_hint_offset, SEEK_SET);
2524
2525 /* Read the shared object hints table: Header first */
2526 shared_obj_num = fz_read_bits(ctx, stream, 32);
2527 shared_obj_offset = fz_read_bits(ctx, stream, 32);
2528 if (shared_obj_offset > doc->hint_object_offset)
2529 shared_obj_offset += doc->hint_object_length;
2530 shared_obj_count_page1 = fz_read_bits(ctx, stream, 32);
2531 shared_obj_count_total = fz_read_bits(ctx, stream, 32);
2532 shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
2533 least_shared_group_len = fz_read_bits(ctx, stream, 32);
2534 shared_group_len_num_bits = fz_read_bits(ctx, stream, 16);
2535
2536 /* Sanity check the references in Item 4 above to ensure we
2537 * don't access out of range with malicious files. */
2538 for (i = 0; i < shared; i++)
2539 {
2540 if (doc->hint_shared_ref[i] >= shared_obj_count_total)
2541 {
2542 fz_throw(ctx, FZ_ERROR_GENERIC, "malformed hint stream (shared refs)");
2543 }
2544 }
2545
2546 doc->hint_shared = fz_realloc_array(ctx, doc->hint_shared, shared_obj_count_total+1, pdf_hint_shared);
2547 memset(doc->hint_shared, 0, sizeof(*doc->hint_shared) * (shared_obj_count_total+1));
2548
2549 /* Item 1: Shared references */
2550 j = doc->hint_page[0].offset;
2551 for (i = 0; i < shared_obj_count_page1; i++)
2552 {
2553 int off = fz_read_bits(ctx, stream, shared_group_len_num_bits);
2554 int old = j;
2555 doc->hint_shared[i].offset = j;
2556 j += off + least_shared_group_len;
2557 if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
2558 j += doc->hint_object_length;
2559 }
2560 /* FIXME: We would have problems recreating the length of the
2561 * last page 1 shared reference group. But we'll never need
2562 * to, so ignore it. */
2563 j = shared_obj_offset;
2564 for (; i < shared_obj_count_total; i++)
2565 {
2566 int off = fz_read_bits(ctx, stream, shared_group_len_num_bits);
2567 int old = j;
2568 doc->hint_shared[i].offset = j;
2569 j += off + least_shared_group_len;
2570 if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
2571 j += doc->hint_object_length;
2572 }
2573 doc->hint_shared[i].offset = j;
2574 fz_sync_bits(ctx, stream);
2575 /* Item 2: Signature flags: read these just so we can skip */
2576 for (i = 0; i < shared_obj_count_total; i++)
2577 {
2578 doc->hint_shared[i].number = fz_read_bits(ctx, stream, 1);
2579 }
2580 fz_sync_bits(ctx, stream);
2581 /* Item 3: Signatures: just skip */
2582 for (i = 0; i < shared_obj_count_total; i++)
2583 {
2584 if (doc->hint_shared[i].number)
2585 {
2586 (void) fz_read_bits(ctx, stream, 128);
2587 }
2588 }
2589 fz_sync_bits(ctx, stream);
2590 /* Item 4: Shared object object numbers */
2591 j = doc->linear_page1_obj_num; /* FIXME: This is a lie! */
2592 for (i = 0; i < shared_obj_count_page1; i++)
2593 {
2594 doc->hint_shared[i].number = j;
2595 j += fz_read_bits(ctx, stream, shared_obj_num_bits) + 1;
2596 }
2597 j = shared_obj_num;
2598 for (; i < shared_obj_count_total; i++)
2599 {
2600 doc->hint_shared[i].number = j;
2601 j += fz_read_bits(ctx, stream, shared_obj_num_bits) + 1;
2602 }
2603 doc->hint_shared[i].number = j;
2604
2605 /* Now, actually use the data we have gathered. */
2606 for (i = 0 /*shared_obj_count_page1*/; i < shared_obj_count_total; i++)
2607 {
2608 doc->hint_obj_offsets[doc->hint_shared[i].number] = doc->hint_shared[i].offset;
2609 }
2610 for (i = 0; i < doc->linear_page_count; i++)
2611 {
2612 doc->hint_obj_offsets[doc->hint_page[i].number] = doc->hint_page[i].offset;
2613 }
2614 }
2615 fz_always(ctx)
2616 {
2617 fz_drop_stream(ctx, stream);
2618 }
2619 fz_catch(ctx)
2620 {
2621 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2622 /* Don't try to load hints again */
2623 doc->hints_loaded = 1;
2624 /* We won't use the linearized object anymore. */
2625 doc->file_reading_linearly = 0;
2626 /* Any other error becomes a TRYLATER */
2627 fz_throw(ctx, FZ_ERROR_TRYLATER, "malformed hints object");
2628 }
2629 doc->hints_loaded = 1;
2630 }
2631
2632 static void
pdf_load_hint_object(fz_context * ctx,pdf_document * doc)2633 pdf_load_hint_object(fz_context *ctx, pdf_document *doc)
2634 {
2635 pdf_lexbuf *buf = &doc->lexbuf.base;
2636 int64_t curr_pos;
2637
2638 curr_pos = fz_tell(ctx, doc->file);
2639 fz_seek(ctx, doc->file, doc->hint_object_offset, SEEK_SET);
2640 fz_try(ctx)
2641 {
2642 while (1)
2643 {
2644 pdf_obj *page = NULL;
2645 int64_t tmpofs;
2646 int num, tok;
2647
2648 tok = pdf_lex(ctx, doc->file, buf);
2649 if (tok != PDF_TOK_INT)
2650 break;
2651 num = buf->i;
2652 tok = pdf_lex(ctx, doc->file, buf);
2653 if (tok != PDF_TOK_INT)
2654 break;
2655 /* Ignore gen = buf->i */
2656 tok = pdf_lex(ctx, doc->file, buf);
2657 if (tok != PDF_TOK_OBJ)
2658 break;
2659 (void)pdf_repair_obj(ctx, doc, buf, &tmpofs, NULL, NULL, NULL, &page, &tmpofs, NULL);
2660 pdf_load_hints(ctx, doc, num);
2661 }
2662 }
2663 fz_always(ctx)
2664 {
2665 fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
2666 }
2667 fz_catch(ctx)
2668 {
2669 fz_rethrow(ctx);
2670 }
2671 }
2672
pdf_progressive_advance(fz_context * ctx,pdf_document * doc,int pagenum)2673 pdf_obj *pdf_progressive_advance(fz_context *ctx, pdf_document *doc, int pagenum)
2674 {
2675 pdf_lexbuf *buf = &doc->lexbuf.base;
2676 int curr_pos;
2677 pdf_obj *page = NULL;
2678
2679 pdf_load_hinted_page(ctx, doc, pagenum);
2680
2681 if (pagenum < 0 || pagenum >= doc->linear_page_count)
2682 fz_throw(ctx, FZ_ERROR_GENERIC, "page load out of range (%d of %d)", pagenum, doc->linear_page_count);
2683
2684 if (doc->linear_pos == doc->file_length)
2685 return doc->linear_page_refs[pagenum];
2686
2687 /* Only load hints once, and then only after we have got page 0 */
2688 if (pagenum > 0 && !doc->hints_loaded && doc->hint_object_offset > 0 && doc->linear_pos >= doc->hint_object_offset)
2689 {
2690 /* Found hint object */
2691 pdf_load_hint_object(ctx, doc);
2692 }
2693
2694 DEBUGMESS((ctx, "continuing to try to advance from %d", doc->linear_pos));
2695 curr_pos = fz_tell(ctx, doc->file);
2696
2697 fz_var(page);
2698
2699 fz_try(ctx)
2700 {
2701 int eof;
2702 do
2703 {
2704 int num;
2705 eof = pdf_obj_read(ctx, doc, &doc->linear_pos, &num, &page);
2706 pdf_drop_obj(ctx, page);
2707 page = NULL;
2708 }
2709 while (!eof);
2710
2711 {
2712 pdf_obj *catalog;
2713 pdf_obj *pages;
2714 doc->linear_pos = doc->file_length;
2715 pdf_load_xref(ctx, doc, buf);
2716 catalog = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root));
2717 pages = pdf_dict_get(ctx, catalog, PDF_NAME(Pages));
2718
2719 if (!pdf_is_dict(ctx, pages))
2720 fz_throw(ctx, FZ_ERROR_GENERIC, "missing page tree");
2721 break;
2722 }
2723 }
2724 fz_always(ctx)
2725 {
2726 fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
2727 }
2728 fz_catch(ctx)
2729 {
2730 pdf_drop_obj(ctx, page);
2731 if (fz_caught(ctx) == FZ_ERROR_TRYLATER)
2732 {
2733 if (doc->linear_page_refs[pagenum] == NULL)
2734 {
2735 /* Still not got a page */
2736 fz_rethrow(ctx);
2737 }
2738 }
2739 else
2740 fz_rethrow(ctx);
2741 }
2742
2743 return doc->linear_page_refs[pagenum];
2744 }
2745
pdf_document_from_fz_document(fz_context * ctx,fz_document * ptr)2746 pdf_document *pdf_document_from_fz_document(fz_context *ctx, fz_document *ptr)
2747 {
2748 return (pdf_document *)((ptr && ptr->count_pages == pdf_count_pages_imp) ? ptr : NULL);
2749 }
2750
pdf_page_from_fz_page(fz_context * ctx,fz_page * ptr)2751 pdf_page *pdf_page_from_fz_page(fz_context *ctx, fz_page *ptr)
2752 {
2753 return (pdf_page *)((ptr && ptr->bound_page == (fz_page_bound_page_fn*)pdf_bound_page) ? ptr : NULL);
2754 }
2755
pdf_specifics(fz_context * ctx,fz_document * doc)2756 pdf_document *pdf_specifics(fz_context *ctx, fz_document *doc)
2757 {
2758 return pdf_document_from_fz_document(ctx, doc);
2759 }
2760
2761 pdf_obj *
pdf_add_object(fz_context * ctx,pdf_document * doc,pdf_obj * obj)2762 pdf_add_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
2763 {
2764 pdf_document *orig_doc;
2765 int num;
2766
2767 orig_doc = pdf_get_bound_document(ctx, obj);
2768 if (orig_doc && orig_doc != doc)
2769 fz_throw(ctx, FZ_ERROR_GENERIC, "tried to add an object belonging to a different document");
2770 if (pdf_is_indirect(ctx, obj))
2771 return pdf_keep_obj(ctx, obj);
2772 num = pdf_create_object(ctx, doc);
2773 pdf_update_object(ctx, doc, num, obj);
2774 return pdf_new_indirect(ctx, doc, num, 0);
2775 }
2776
2777 pdf_obj *
pdf_add_object_drop(fz_context * ctx,pdf_document * doc,pdf_obj * obj)2778 pdf_add_object_drop(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
2779 {
2780 pdf_obj *ind = NULL;
2781 fz_try(ctx)
2782 ind = pdf_add_object(ctx, doc, obj);
2783 fz_always(ctx)
2784 pdf_drop_obj(ctx, obj);
2785 fz_catch(ctx)
2786 fz_rethrow(ctx);
2787 return ind;
2788 }
2789
2790 pdf_obj *
pdf_add_new_dict(fz_context * ctx,pdf_document * doc,int initial)2791 pdf_add_new_dict(fz_context *ctx, pdf_document *doc, int initial)
2792 {
2793 return pdf_add_object_drop(ctx, doc, pdf_new_dict(ctx, doc, initial));
2794 }
2795
2796 pdf_obj *
pdf_add_new_array(fz_context * ctx,pdf_document * doc,int initial)2797 pdf_add_new_array(fz_context *ctx, pdf_document *doc, int initial)
2798 {
2799 return pdf_add_object_drop(ctx, doc, pdf_new_array(ctx, doc, initial));
2800 }
2801
2802 pdf_obj *
pdf_add_stream(fz_context * ctx,pdf_document * doc,fz_buffer * buf,pdf_obj * obj,int compressed)2803 pdf_add_stream(fz_context *ctx, pdf_document *doc, fz_buffer *buf, pdf_obj *obj, int compressed)
2804 {
2805 pdf_obj *ind;
2806 if (!obj)
2807 ind = pdf_add_new_dict(ctx, doc, 4);
2808 else
2809 ind = pdf_add_object(ctx, doc, obj);
2810 fz_try(ctx)
2811 pdf_update_stream(ctx, doc, ind, buf, compressed);
2812 fz_catch(ctx)
2813 {
2814 pdf_drop_obj(ctx, ind);
2815 fz_rethrow(ctx);
2816 }
2817 return ind;
2818 }
2819
pdf_create_document(fz_context * ctx)2820 pdf_document *pdf_create_document(fz_context *ctx)
2821 {
2822 pdf_document *doc;
2823 pdf_obj *root;
2824 pdf_obj *pages;
2825 pdf_obj *trailer = NULL;
2826
2827 fz_var(trailer);
2828
2829 doc = pdf_new_document(ctx, NULL);
2830 fz_try(ctx)
2831 {
2832 doc->version = 17;
2833 doc->file_size = 0;
2834 doc->startxref = 0;
2835 doc->num_xref_sections = 0;
2836 doc->num_incremental_sections = 0;
2837 doc->xref_base = 0;
2838 doc->disallow_new_increments = 0;
2839 pdf_get_populating_xref_entry(ctx, doc, 0);
2840
2841 trailer = pdf_new_dict(ctx, doc, 2);
2842 pdf_dict_put_int(ctx, trailer, PDF_NAME(Size), 3);
2843 pdf_dict_put_drop(ctx, trailer, PDF_NAME(Root), root = pdf_add_new_dict(ctx, doc, 2));
2844 pdf_dict_put(ctx, root, PDF_NAME(Type), PDF_NAME(Catalog));
2845 pdf_dict_put_drop(ctx, root, PDF_NAME(Pages), pages = pdf_add_new_dict(ctx, doc, 3));
2846 pdf_dict_put(ctx, pages, PDF_NAME(Type), PDF_NAME(Pages));
2847 pdf_dict_put_int(ctx, pages, PDF_NAME(Count), 0);
2848 pdf_dict_put_array(ctx, pages, PDF_NAME(Kids), 1);
2849
2850 /* Set the trailer of the final xref section. */
2851 doc->xref_sections[0].trailer = trailer;
2852 }
2853 fz_catch(ctx)
2854 {
2855 pdf_drop_obj(ctx, trailer);
2856 fz_drop_document(ctx, &doc->super);
2857 fz_rethrow(ctx);
2858 }
2859 return doc;
2860 }
2861
2862 static const char *pdf_extensions[] =
2863 {
2864 "pdf",
2865 "pclm",
2866 "ai",
2867 NULL
2868 };
2869
2870 static const char *pdf_mimetypes[] =
2871 {
2872 "application/pdf",
2873 "application/PCLm",
2874 NULL
2875 };
2876
2877 fz_document_handler pdf_document_handler =
2878 {
2879 NULL,
2880 (fz_document_open_fn*)pdf_open_document,
2881 (fz_document_open_with_stream_fn*)pdf_open_document_with_stream,
2882 pdf_extensions,
2883 pdf_mimetypes,
2884 NULL,
2885 NULL
2886 };
2887
/*
	Set the 'marked' flag on every xref entry that currently has an
	object loaded. pdf_clear_xref_to_mark leaves marked entries alone.
*/
void pdf_mark_xref(fz_context *ctx, pdf_document *doc)
{
	int s;

	for (s = 0; s < doc->num_xref_sections; s++)
	{
		pdf_xref_subsec *sub = doc->xref_sections[s].subsec;

		while (sub != NULL)
		{
			int k;

			for (k = 0; k < sub->len; k++)
			{
				if (sub->table[k].obj)
					sub->table[k].marked = 1;
			}
			sub = sub->next;
		}
	}
}
2910
/*
	Drop every cached object in the xref that nobody else holds a
	reference to (reference count of exactly 1).

	Entries with an updated stream buffer are never dropped.
*/
void pdf_clear_xref(fz_context *ctx, pdf_document *doc)
{
	int s;

	for (s = 0; s < doc->num_xref_sections; s++)
	{
		pdf_xref_subsec *sub;

		for (sub = doc->xref_sections[s].subsec; sub != NULL; sub = sub->next)
		{
			int k;

			for (k = 0; k < sub->len; k++)
			{
				pdf_xref_entry *entry = &sub->table[k];

				if (entry->obj == NULL)
					continue;
				/* We cannot drop objects if the stream
				 * buffer has been updated */
				if (entry->stm_buf != NULL)
					continue;
				if (pdf_obj_refs(ctx, entry->obj) != 1)
					continue;

				pdf_drop_obj(ctx, entry->obj);
				entry->obj = NULL;
			}
		}
	}
}
2939
/*
	As pdf_clear_xref, but entries whose 'marked' flag is set are
	left alone: only unmarked cached objects with a reference count
	of exactly 1 and no updated stream buffer are dropped.
*/
void pdf_clear_xref_to_mark(fz_context *ctx, pdf_document *doc)
{
	int s;

	for (s = 0; s < doc->num_xref_sections; s++)
	{
		pdf_xref_subsec *sub;

		for (sub = doc->xref_sections[s].subsec; sub != NULL; sub = sub->next)
		{
			int k;

			for (k = 0; k < sub->len; k++)
			{
				pdf_xref_entry *entry = &sub->table[k];

				if (entry->obj == NULL)
					continue;
				/* We cannot drop objects if the stream buffer has
				 * been updated */
				if (entry->stm_buf != NULL)
					continue;
				if (entry->marked || pdf_obj_refs(ctx, entry->obj) != 1)
					continue;

				pdf_drop_obj(ctx, entry->obj);
				entry->obj = NULL;
			}
		}
	}
}
2969
2970 int
pdf_count_versions(fz_context * ctx,pdf_document * doc)2971 pdf_count_versions(fz_context *ctx, pdf_document *doc)
2972 {
2973 return doc->num_xref_sections-doc->num_incremental_sections-doc->has_linearization_object;
2974 }
2975
2976 int
pdf_count_unsaved_versions(fz_context * ctx,pdf_document * doc)2977 pdf_count_unsaved_versions(fz_context *ctx, pdf_document *doc)
2978 {
2979 return doc->num_incremental_sections;
2980 }
2981
2982 int
pdf_doc_was_linearized(fz_context * ctx,pdf_document * doc)2983 pdf_doc_was_linearized(fz_context *ctx, pdf_document *doc)
2984 {
2985 return doc->has_linearization_object;
2986 }
2987
/*
	Return 1 if object number 'i' has an entry with a non-zero type
	in any xref section from the starting section onwards, and 0
	otherwise.

	The starting section is taken from doc->xref_index (or 0 if i is
	beyond it), but never earlier than doc->xref_base. Throws for a
	negative object number.
*/
static int pdf_obj_exists(fz_context *ctx, pdf_document *doc, int i)
{
	int section;

	if (i < 0)
		fz_throw(ctx, FZ_ERROR_GENERIC, "Negative object number requested");

	section = (i <= doc->max_xref_len) ? doc->xref_index[i] : 0;

	/* We may be accessing an earlier version of the document using xref_base
	 * and the index may point into a later xref section */
	if (section < doc->xref_base)
		section = doc->xref_base;

	/* Search the remaining xref sections for a defined entry. */
	while (section < doc->num_xref_sections)
	{
		pdf_xref *xref = &doc->xref_sections[section++];
		pdf_xref_subsec *sub;

		if (i >= xref->num_objects)
			continue;

		for (sub = xref->subsec; sub != NULL; sub = sub->next)
		{
			int in_range = (i >= sub->start && i < sub->start + sub->len);

			if (in_range && sub->table[i - sub->start].type)
				return 1;
		}
	}

	return 0;
}
3026
/* Bit flags stored per object in pdf_changes.obj_changes. */
enum {
	FIELD_CHANGED = 1 << 0,
	FIELD_CHANGE_VALID = 1 << 1,
	FIELD_CHANGE_INVALID = 1 << 2
};
3032
/*
	Per-object change flags (FIELD_* bits), indexed by object number.

	obj_changes is declared with one element; presumably the struct
	is over-allocated to hold num_obj entries - confirm at the
	allocation site.
*/
typedef struct
{
	int num_obj;
	int obj_changes[1];
} pdf_changes;
3038
3039 static int
check_unchanged_between(fz_context * ctx,pdf_document * doc,pdf_changes * changes,pdf_obj * nobj,pdf_obj * oobj)3040 check_unchanged_between(fz_context *ctx, pdf_document *doc, pdf_changes *changes, pdf_obj *nobj, pdf_obj *oobj)
3041 {
3042 int marked = 0;
3043 int changed = 0;
3044
3045 /* Trivially identical => trivially unchanged. */
3046 if (nobj == oobj)
3047 return 0;
3048
3049 /* Strictly speaking we shouldn't need to call fz_var,
3050 * but I suspect static analysis tools are not smart
3051 * enough to figure that out. */
3052 fz_var(marked);
3053
3054 if (pdf_is_indirect(ctx, nobj))
3055 {
3056 int o_xref_base = doc->xref_base;
3057
3058 /* Both must be indirect if one is. */
3059 if (!pdf_is_indirect(ctx, oobj))
3060 {
3061 changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
3062 return 1;
3063 }
3064
3065 /* Handle recursing back into ourselves. */
3066 if (pdf_obj_marked(ctx, nobj))
3067 {
3068 if (pdf_obj_marked(ctx, oobj))
3069 return 0;
3070 changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
3071 return 1;
3072 }
3073 else if (pdf_obj_marked(ctx, oobj))
3074 {
3075 changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
3076 return 1;
3077 }
3078
3079 nobj = pdf_resolve_indirect_chain(ctx, nobj);
3080 doc->xref_base = o_xref_base+1;
3081 fz_try(ctx)
3082 {
3083 oobj = pdf_resolve_indirect_chain(ctx, oobj);
3084 if (oobj != nobj)
3085 {
3086 /* Different objects, so lock them */
3087 if (!pdf_obj_marked(ctx, nobj) && !pdf_obj_marked(ctx, oobj))
3088 {
3089 pdf_mark_obj(ctx, nobj);
3090 pdf_mark_obj(ctx, oobj);
3091 marked = 1;
3092 }
3093 }
3094 }
3095 fz_always(ctx)
3096 doc->xref_base = o_xref_base;
3097 fz_catch(ctx)
3098 fz_rethrow(ctx);
3099
3100 if (nobj == oobj)
3101 return 0; /* Trivially identical */
3102 }
3103
3104 fz_var(changed);
3105
3106 fz_try(ctx)
3107 {
3108 if (pdf_is_dict(ctx, nobj))
3109 {
3110 int i, n = pdf_dict_len(ctx, nobj);
3111
3112 if (!pdf_is_dict(ctx, oobj) || n != pdf_dict_len(ctx, oobj))
3113 {
3114 change_found:
3115 changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
3116 changed = 1;
3117 break;
3118 }
3119
3120 for (i = 0; i < n; i++)
3121 {
3122 pdf_obj *key = pdf_dict_get_key(ctx, nobj, i);
3123 pdf_obj *nval = pdf_dict_get(ctx, nobj, key);
3124 pdf_obj *oval = pdf_dict_get(ctx, oobj, key);
3125
3126 changed |= check_unchanged_between(ctx, doc, changes, nval, oval);
3127 }
3128 }
3129 else if (pdf_is_array(ctx, nobj))
3130 {
3131 int i, n = pdf_array_len(ctx, nobj);
3132
3133 if (!pdf_is_array(ctx, oobj) || n != pdf_array_len(ctx, oobj))
3134 goto change_found;
3135
3136 for (i = 0; i < n; i++)
3137 {
3138 pdf_obj *nval = pdf_array_get(ctx, nobj, i);
3139 pdf_obj *oval = pdf_array_get(ctx, oobj, i);
3140
3141 changed |= check_unchanged_between(ctx, doc, changes, nval, oval);
3142 }
3143 }
3144 else if (pdf_objcmp(ctx, nobj, oobj))
3145 goto change_found;
3146 }
3147 fz_always(ctx)
3148 {
3149 if (marked)
3150 {
3151 pdf_unmark_obj(ctx, nobj);
3152 pdf_unmark_obj(ctx, oobj);
3153 }
3154 }
3155 fz_catch(ctx)
3156 fz_rethrow(ctx);
3157
3158 return changed;
3159 }
3160
/* A growable array of heap allocated strings. */
typedef struct
{
	int max;	/* Number of slots allocated in list */
	int len;	/* Number of slots in use */
	char **list;	/* The strings; each one is owned by the list */
} char_list;
3167
3168 /* This structure is used to hold the definition of which fields
3169 * are locked. */
3170 struct pdf_locked_fields
3171 {
3172 int p;
3173 int all;
3174 char_list includes;
3175 char_list excludes;
3176 };
3177
3178 static void
free_char_list(fz_context * ctx,char_list * c)3179 free_char_list(fz_context *ctx, char_list *c)
3180 {
3181 int i;
3182
3183 if (c == NULL)
3184 return;
3185
3186 for (i = c->len-1; i >= 0; i--)
3187 fz_free(ctx, c->list[i]);
3188 fz_free(ctx, c->list);
3189 c->len = 0;
3190 c->max = 0;
3191 }
3192
3193 void
pdf_drop_locked_fields(fz_context * ctx,pdf_locked_fields * fl)3194 pdf_drop_locked_fields(fz_context *ctx, pdf_locked_fields *fl)
3195 {
3196 if (fl == NULL)
3197 return;
3198
3199 free_char_list(ctx, &fl->includes);
3200 free_char_list(ctx, &fl->excludes);
3201 fz_free(ctx, fl);
3202 }
3203
3204 static void
char_list_append(fz_context * ctx,char_list * list,const char * s)3205 char_list_append(fz_context *ctx, char_list *list, const char *s)
3206 {
3207 if (list->len == list->max)
3208 {
3209 int n = list->max * 2;
3210 if (n == 0) n = 4;
3211
3212 list->list = fz_realloc_array(ctx, list->list, n, char *);
3213 list->max = n;
3214 }
3215 list->list[list->len] = fz_strdup(ctx, s);
3216 list->len++;
3217 }
3218
3219 int
pdf_is_field_locked(fz_context * ctx,pdf_locked_fields * locked,const char * name)3220 pdf_is_field_locked(fz_context *ctx, pdf_locked_fields *locked, const char *name)
3221 {
3222 int i;
3223
3224 if (locked->p == 1)
3225 {
3226 /* Permissions were set, and say that field changes are not to be allowed. */
3227 return 1; /* Locked */
3228 }
3229
3230 if(locked->all)
3231 {
3232 /* The only way we might not be unlocked is if
3233 * we are listed in the excludes. */
3234 for (i = 0; i < locked->excludes.len; i++)
3235 if (!strcmp(locked->excludes.list[i], name))
3236 return 0;
3237 return 1;
3238 }
3239
3240 /* The only way we can be locked is for us to be in the includes. */
3241 for (i = 0; i < locked->includes.len; i++)
3242 if (strcmp(locked->includes.list[i], name) == 0)
3243 return 1;
3244
3245 /* Anything else is unlocked */
3246 return 0;
3247 }
3248
/* Unfortunately, in C, there is no legal way to define a function
 * type that returns itself. We therefore have to use a struct
 * wrapper. */
typedef struct filter_wrap
{
	/* The filter to apply next; NULL means there is nothing further to follow. */
	struct filter_wrap (*func)(fz_context *ctx, pdf_obj *dict, pdf_obj *key);
} filter_wrap;

/* Pointer to a filter function. Given a dictionary and one of its keys, a
 * filter returns (wrapped) the filter to apply to that key's value, or a
 * wrapper with a NULL func if that value is not to be followed. */
typedef struct filter_wrap (*filter_fn)(fz_context *ctx, pdf_obj *dict, pdf_obj *key);

/* Wrap f in a filter_wrap and return it from the enclosing filter function. */
#define RETURN_FILTER(f) { filter_wrap rf; rf.func = (f); return rf; }
3260
3261 static filter_wrap filter_simple(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3262 {
3263 RETURN_FILTER(NULL);
3264 }
3265
filter_transformparams(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3266 static filter_wrap filter_transformparams(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3267 {
3268 if (pdf_name_eq(ctx, key, PDF_NAME(Type)) ||
3269 pdf_name_eq(ctx, key, PDF_NAME(P)) ||
3270 pdf_name_eq(ctx, key, PDF_NAME(V)) ||
3271 pdf_name_eq(ctx, key, PDF_NAME(Document)) ||
3272 pdf_name_eq(ctx, key, PDF_NAME(Msg)) ||
3273 pdf_name_eq(ctx, key, PDF_NAME(V)) ||
3274 pdf_name_eq(ctx, key, PDF_NAME(Annots)) ||
3275 pdf_name_eq(ctx, key, PDF_NAME(Form)) ||
3276 pdf_name_eq(ctx, key, PDF_NAME(FormEx)) ||
3277 pdf_name_eq(ctx, key, PDF_NAME(EF)) ||
3278 pdf_name_eq(ctx, key, PDF_NAME(P)) ||
3279 pdf_name_eq(ctx, key, PDF_NAME(Action)) ||
3280 pdf_name_eq(ctx, key, PDF_NAME(Fields)))
3281 RETURN_FILTER(&filter_simple);
3282 RETURN_FILTER(NULL);
3283 }
3284
filter_reference(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3285 static filter_wrap filter_reference(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3286 {
3287 if (pdf_name_eq(ctx, key, PDF_NAME(Type)) ||
3288 pdf_name_eq(ctx, key, PDF_NAME(TransformMethod)) ||
3289 pdf_name_eq(ctx, key, PDF_NAME(DigestMethod)) ||
3290 pdf_name_eq(ctx, key, PDF_NAME(DigestValue)) ||
3291 pdf_name_eq(ctx, key, PDF_NAME(DigestLocation)))
3292 RETURN_FILTER(&filter_simple);
3293 if (pdf_name_eq(ctx, key, PDF_NAME(TransformParams)))
3294 RETURN_FILTER(&filter_transformparams);
3295 RETURN_FILTER(NULL);
3296 }
3297
filter_prop_build_sub(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3298 static filter_wrap filter_prop_build_sub(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3299 {
3300 if (pdf_name_eq(ctx, key, PDF_NAME(Name)) ||
3301 pdf_name_eq(ctx, key, PDF_NAME(Date)) ||
3302 pdf_name_eq(ctx, key, PDF_NAME(R)) ||
3303 pdf_name_eq(ctx, key, PDF_NAME(PreRelease)) ||
3304 pdf_name_eq(ctx, key, PDF_NAME(OS)) ||
3305 pdf_name_eq(ctx, key, PDF_NAME(NonEFontNoWarn)) ||
3306 pdf_name_eq(ctx, key, PDF_NAME(TrustedMode)) ||
3307 pdf_name_eq(ctx, key, PDF_NAME(V)) ||
3308 pdf_name_eq(ctx, key, PDF_NAME(REx)) ||
3309 pdf_name_eq(ctx, key, PDF_NAME(Preview)))
3310 RETURN_FILTER(&filter_simple);
3311 RETURN_FILTER(NULL);
3312 }
3313
filter_prop_build(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3314 static filter_wrap filter_prop_build(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3315 {
3316 if (pdf_name_eq(ctx, key, PDF_NAME(Filter)) ||
3317 pdf_name_eq(ctx, key, PDF_NAME(PubSec)) ||
3318 pdf_name_eq(ctx, key, PDF_NAME(App)) ||
3319 pdf_name_eq(ctx, key, PDF_NAME(SigQ)))
3320 RETURN_FILTER(&filter_prop_build_sub);
3321 RETURN_FILTER(NULL);
3322 }
3323
filter_v(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3324 static filter_wrap filter_v(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3325 {
3326 /* Text can point to a stream object */
3327 if (pdf_name_eq(ctx, key, PDF_NAME(Length)) && pdf_is_stream(ctx, dict))
3328 RETURN_FILTER(&filter_simple);
3329 /* Sigs point to a dict. */
3330 if (pdf_name_eq(ctx, key, PDF_NAME(Type)) ||
3331 pdf_name_eq(ctx, key, PDF_NAME(Filter)) ||
3332 pdf_name_eq(ctx, key, PDF_NAME(SubFilter)) ||
3333 pdf_name_eq(ctx, key, PDF_NAME(Contents)) ||
3334 pdf_name_eq(ctx, key, PDF_NAME(Cert)) ||
3335 pdf_name_eq(ctx, key, PDF_NAME(ByteRange)) ||
3336 pdf_name_eq(ctx, key, PDF_NAME(Changes)) ||
3337 pdf_name_eq(ctx, key, PDF_NAME(Name)) ||
3338 pdf_name_eq(ctx, key, PDF_NAME(M)) ||
3339 pdf_name_eq(ctx, key, PDF_NAME(Location)) ||
3340 pdf_name_eq(ctx, key, PDF_NAME(Reason)) ||
3341 pdf_name_eq(ctx, key, PDF_NAME(ContactInfo)) ||
3342 pdf_name_eq(ctx, key, PDF_NAME(R)) ||
3343 pdf_name_eq(ctx, key, PDF_NAME(V)) ||
3344 pdf_name_eq(ctx, key, PDF_NAME(Prop_AuthTime)) ||
3345 pdf_name_eq(ctx, key, PDF_NAME(Prop_AuthType)))
3346 RETURN_FILTER(&filter_simple);
3347 if (pdf_name_eq(ctx, key, PDF_NAME(Reference)))
3348 RETURN_FILTER(filter_reference);
3349 if (pdf_name_eq(ctx, key, PDF_NAME(Prop_Build)))
3350 RETURN_FILTER(filter_prop_build);
3351 RETURN_FILTER(NULL);
3352 }
3353
3354 static filter_wrap filter_appearance(fz_context *ctx, pdf_obj *dict, pdf_obj *key);
3355
filter_xobject_list(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3356 static filter_wrap filter_xobject_list(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3357 {
3358 /* FIXME: Infinite recursion possible here? */
3359 RETURN_FILTER(&filter_appearance);
3360 }
3361
filter_font(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3362 static filter_wrap filter_font(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3363 {
3364 /* In the example I've seen the /Name field was dropped, so we'll allow
3365 * local changes, but none that follow an indirection. */
3366 RETURN_FILTER(NULL);
3367 }
3368
3369 /* FIXME: One idea here is to make filter_font_list and filter_xobject_list
3370 * only accept NEW objects as changes. Will think about this. */
filter_font_list(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3371 static filter_wrap filter_font_list(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3372 {
3373 RETURN_FILTER(&filter_font);
3374 }
3375
filter_resources(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3376 static filter_wrap filter_resources(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3377 {
3378 if (pdf_name_eq(ctx, key, PDF_NAME(XObject)))
3379 RETURN_FILTER(&filter_xobject_list);
3380 if (pdf_name_eq(ctx, key, PDF_NAME(Font)))
3381 RETURN_FILTER(&filter_font_list);
3382 RETURN_FILTER(NULL);
3383 }
3384
filter_appearance(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3385 static filter_wrap filter_appearance(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3386 {
3387 if (pdf_name_eq(ctx, key, PDF_NAME(Resources)))
3388 RETURN_FILTER(&filter_resources);
3389 RETURN_FILTER(NULL);
3390 }
3391
filter_ap(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3392 static filter_wrap filter_ap(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3393 {
3394 /* Just the /N entry for now. May need to add more later. */
3395 if (pdf_name_eq(ctx, key, PDF_NAME(N)) && pdf_is_stream(ctx, pdf_dict_get(ctx, dict, key)))
3396 RETURN_FILTER(&filter_appearance);
3397 RETURN_FILTER(NULL);
3398 }
3399
filter_xfa(fz_context * ctx,pdf_obj * dict,pdf_obj * key)3400 static filter_wrap filter_xfa(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
3401 {
3402 /* Text can point to a stream object */
3403 if (pdf_is_stream(ctx, dict))
3404 RETURN_FILTER(&filter_simple);
3405 RETURN_FILTER(NULL);
3406 }
3407
3408 static void
filter_changes_accepted(fz_context * ctx,pdf_changes * changes,pdf_obj * obj,filter_fn filter)3409 filter_changes_accepted(fz_context *ctx, pdf_changes *changes, pdf_obj *obj, filter_fn filter)
3410 {
3411 int obj_num;
3412
3413 if (obj == NULL || pdf_obj_marked(ctx, obj))
3414 return;
3415
3416 obj_num = pdf_to_num(ctx, obj);
3417
3418 fz_try(ctx)
3419 {
3420 if (obj_num != 0)
3421 {
3422 pdf_mark_obj(ctx, obj);
3423 changes->obj_changes[obj_num] |= FIELD_CHANGE_VALID;
3424 }
3425 if (filter == NULL)
3426 break;
3427 if (pdf_is_dict(ctx, obj))
3428 {
3429 int i, n = pdf_dict_len(ctx, obj);
3430
3431 for (i = 0; i < n; i++)
3432 {
3433 pdf_obj *key = pdf_dict_get_key(ctx, obj, i);
3434 pdf_obj *val = pdf_dict_get_val(ctx, obj, i);
3435 filter_fn f = (filter(ctx, obj, key)).func;
3436 if (f != NULL)
3437 filter_changes_accepted(ctx, changes, val, f);
3438 }
3439 }
3440 else if (pdf_is_array(ctx, obj))
3441 {
3442 int i, n = pdf_array_len(ctx, obj);
3443
3444 for (i = 0; i < n; i++)
3445 {
3446 pdf_obj *val = pdf_array_get(ctx, obj, i);
3447 filter_changes_accepted(ctx, changes, val, filter);
3448 }
3449 }
3450 }
3451 fz_always(ctx)
3452 if (obj_num != 0)
3453 pdf_unmark_obj(ctx, obj);
3454 fz_catch(ctx)
3455 fz_rethrow(ctx);
3456 }
3457
3458 static void
check_field(fz_context * ctx,pdf_document * doc,pdf_changes * changes,pdf_obj * obj,pdf_locked_fields * locked,const char * name_prefix,pdf_obj * new_v,pdf_obj * old_v)3459 check_field(fz_context *ctx, pdf_document *doc, pdf_changes *changes, pdf_obj *obj, pdf_locked_fields *locked, const char *name_prefix, pdf_obj *new_v, pdf_obj *old_v)
3460 {
3461 pdf_obj *old_obj, *new_obj, *n_v, *o_v;
3462 int o_xref_base;
3463 int obj_num;
3464 char *field_name = NULL;
3465
3466 /* All fields MUST be indirections, either in the Fields array
3467 * or AcroForms, or in the Kids array of other Fields. */
3468 if (!pdf_is_indirect(ctx, obj))
3469 return;
3470
3471 obj_num = pdf_to_num(ctx, obj);
3472 o_xref_base = doc->xref_base;
3473 new_obj = pdf_resolve_indirect_chain(ctx, obj);
3474
3475 /* Similarly, all fields must be dicts */
3476 if (!pdf_is_dict(ctx, new_obj))
3477 return;
3478
3479 if (pdf_obj_marked(ctx, obj))
3480 return;
3481
3482 fz_var(field_name);
3483
3484 fz_try(ctx)
3485 {
3486 int i, len;
3487 const char *name;
3488 size_t n;
3489 pdf_obj *t;
3490 int is_locked;
3491
3492 pdf_mark_obj(ctx, obj);
3493
3494 /* Do this within the try, so we can catch any problems */
3495 doc->xref_base = o_xref_base+1;
3496 old_obj = pdf_resolve_indirect_chain(ctx, obj);
3497
3498 t = pdf_dict_get(ctx, old_obj, PDF_NAME(T));
3499 if (t != NULL)
3500 {
3501 name = pdf_to_text_string(ctx, pdf_dict_get(ctx, old_obj, PDF_NAME(T)));
3502 n = strlen(name)+1;
3503 if (*name_prefix)
3504 n += 1 + strlen(name_prefix);
3505 field_name = fz_malloc(ctx, n);
3506 if (*name_prefix)
3507 {
3508 strcpy(field_name, name_prefix);
3509 strcat(field_name, ".");
3510 }
3511 else
3512 *field_name = 0;
3513 strcat(field_name, name);
3514 name_prefix = field_name;
3515 }
3516
3517 doc->xref_base = o_xref_base;
3518
3519 if (!pdf_is_dict(ctx, old_obj))
3520 break;
3521
3522 /* Check V explicitly, allowing for it being inherited. */
3523 n_v = pdf_dict_get(ctx, new_obj, PDF_NAME(V));
3524 if (n_v == NULL)
3525 n_v = new_v;
3526 o_v = pdf_dict_get(ctx, old_obj, PDF_NAME(V));
3527 if (o_v == NULL)
3528 o_v = old_v;
3529
3530 is_locked = pdf_is_field_locked(ctx, locked, name_prefix);
3531 if (pdf_name_eq(ctx, pdf_dict_get(ctx, new_obj, PDF_NAME(Type)), PDF_NAME(Annot)) &&
3532 pdf_name_eq(ctx, pdf_dict_get(ctx, new_obj, PDF_NAME(Subtype)), PDF_NAME(Widget)))
3533 {
3534 if (is_locked)
3535 {
3536 /* If locked, V must not change! */
3537 if (check_unchanged_between(ctx, doc, changes, n_v, o_v))
3538 changes->obj_changes[obj_num] |= FIELD_CHANGE_INVALID;
3539 }
3540 else
3541 {
3542 /* If not locked, V can change to be filled in! */
3543 filter_changes_accepted(ctx, changes, n_v, &filter_v);
3544 changes->obj_changes[obj_num] |= FIELD_CHANGE_VALID;
3545 }
3546 }
3547
3548 /* Check all the fields in the new object are
3549 * either the same as the old object, or are
3550 * expected changes. */
3551 len = pdf_dict_len(ctx, new_obj);
3552 for (i = 0; i < len; i++)
3553 {
3554 pdf_obj *key = pdf_dict_get_key(ctx, new_obj, i);
3555 pdf_obj *nval = pdf_dict_get(ctx, new_obj, key);
3556 pdf_obj *oval = pdf_dict_get(ctx, old_obj, key);
3557
3558 /* Kids arrays shouldn't change. */
3559 if (pdf_name_eq(ctx, key, PDF_NAME(Kids)))
3560 {
3561 int j, m;
3562
3563 /* Kids must be an array. If it's not, count it as a difference. */
3564 if (!pdf_is_array(ctx, nval) || !pdf_is_array(ctx, oval))
3565 {
3566 change_found:
3567 changes->obj_changes[obj_num] |= FIELD_CHANGE_INVALID;
3568 break;
3569 }
3570 m = pdf_array_len(ctx, nval);
3571 /* Any change in length counts as a difference */
3572 if (m != pdf_array_len(ctx, oval))
3573 goto change_found;
3574 for (j = 0; j < m; j++)
3575 {
3576 pdf_obj *nkid = pdf_array_get(ctx, nval, j);
3577 pdf_obj *okid = pdf_array_get(ctx, oval, j);
3578 /* Kids arrays are supposed to all be indirect. If they aren't,
3579 * count it as a difference. */
3580 if (!pdf_is_indirect(ctx, nkid) || !pdf_is_indirect(ctx, okid))
3581 goto change_found;
3582 /* For now at least, we'll count any change in number as a difference. */
3583 if (pdf_to_num(ctx, nkid) != pdf_to_num(ctx, okid))
3584 goto change_found;
3585 check_field(ctx, doc, changes, nkid, locked, name_prefix, n_v, o_v);
3586 }
3587 }
3588 else if (pdf_name_eq(ctx, key, PDF_NAME(V)))
3589 {
3590 /* V is checked above */
3591 }
3592 else if (pdf_name_eq(ctx, key, PDF_NAME(AP)))
3593 {
3594 /* If we're locked, then nothing can change. If not,
3595 * we can change to be filled in. */
3596 if (is_locked)
3597 check_unchanged_between(ctx, doc, changes, nval, oval);
3598 else
3599 filter_changes_accepted(ctx, changes, nval, &filter_ap);
3600 }
3601 /* All other fields can't change */
3602 else
3603 check_unchanged_between(ctx, doc, changes, nval, oval);
3604 }
3605
3606 /* Now check all the fields in the old object to
3607 * make sure none were dropped. */
3608 len = pdf_dict_len(ctx, old_obj);
3609 for (i = 0; i < len; i++)
3610 {
3611 pdf_obj *key = pdf_dict_get_key(ctx, old_obj, i);
3612 pdf_obj *nval, *oval;
3613
3614 /* V is checked above */
3615 if (pdf_name_eq(ctx, key, PDF_NAME(V)))
3616 continue;
3617
3618 nval = pdf_dict_get(ctx, new_obj, key);
3619 oval = pdf_dict_get(ctx, old_obj, key);
3620
3621 if (nval == NULL && oval != NULL)
3622 changes->obj_changes[pdf_to_num(ctx, nval)] |= FIELD_CHANGE_INVALID;
3623 }
3624 changes->obj_changes[obj_num] |= FIELD_CHANGE_VALID;
3625
3626 }
3627 fz_always(ctx)
3628 {
3629 pdf_unmark_obj(ctx, obj);
3630 fz_free(ctx, field_name);
3631 doc->xref_base = o_xref_base;
3632 }
3633 fz_catch(ctx)
3634 fz_rethrow(ctx);
3635 }
3636
3637 static int
pdf_obj_changed_in_version(fz_context * ctx,pdf_document * doc,int num,int version)3638 pdf_obj_changed_in_version(fz_context *ctx, pdf_document *doc, int num, int version)
3639 {
3640 if (num < 0 || num > doc->max_xref_len)
3641 fz_throw(ctx, FZ_ERROR_GENERIC, "Invalid object number requested");
3642
3643 return version == doc->xref_index[num];
3644 }
3645
3646 static void
merge_lock_specification(fz_context * ctx,pdf_locked_fields * fields,pdf_obj * lock)3647 merge_lock_specification(fz_context *ctx, pdf_locked_fields *fields, pdf_obj *lock)
3648 {
3649 pdf_obj *action;
3650 int i, r, w;
3651
3652 if (lock == NULL)
3653 return;
3654
3655 action = pdf_dict_get(ctx, lock, PDF_NAME(Action));
3656
3657 if (pdf_name_eq(ctx, action, PDF_NAME(All)))
3658 {
3659 /* All fields locked means we don't need any stored
3660 * includes/excludes. */
3661 fields->all = 1;
3662 free_char_list(ctx, &fields->includes);
3663 free_char_list(ctx, &fields->excludes);
3664 }
3665 else
3666 {
3667 pdf_obj *f = pdf_dict_get(ctx, lock, PDF_NAME(Fields));
3668 int len = pdf_array_len(ctx, f);
3669
3670 if (pdf_name_eq(ctx, action, PDF_NAME(Include)))
3671 {
3672 if (fields->all)
3673 {
3674 /* Current state = "All except <excludes> are locked".
3675 * We need to remove <Fields> from <excludes>. */
3676 for (i = 0; i < len; i++)
3677 {
3678 const char *s = pdf_to_text_string(ctx, pdf_array_get(ctx, f, i));
3679 int r, w;
3680
3681 for (r = w = 0; r < fields->excludes.len; r++)
3682 {
3683 if (strcmp(s, fields->excludes.list[r]))
3684 fields->excludes.list[w++] = fields->excludes.list[r];
3685 }
3686 fields->excludes.len = w;
3687 }
3688 }
3689 else
3690 {
3691 /* Current state = <includes> are locked.
3692 * We need to add <Fields> to <include> (avoiding repetition). */
3693 for (i = 0; i < len; i++)
3694 {
3695 const char *s = pdf_to_text_string(ctx, pdf_array_get(ctx, f, i));
3696
3697 for (r = 0; r < fields->includes.len; r++)
3698 {
3699 if (!strcmp(s, fields->includes.list[r]))
3700 break;
3701 }
3702 if (r == fields->includes.len)
3703 char_list_append(ctx, &fields->includes, s);
3704 }
3705 }
3706 }
3707 else if (pdf_name_eq(ctx, action, PDF_NAME(Exclude)))
3708 {
3709 if (fields->all)
3710 {
3711 /* Current state = "All except <excludes> are locked.
3712 * We need to remove anything from <excludes> that isn't in <Fields>. */
3713 for (r = w = 0; r < fields->excludes.len; r++)
3714 {
3715 for (i = 0; i < len; i++)
3716 {
3717 const char *s = pdf_to_text_string(ctx, pdf_array_get(ctx, f, i));
3718 if (!strcmp(s, fields->excludes.list[r]))
3719 break;
3720 }
3721 if (i != len) /* we found a match */
3722 fields->excludes.list[w++] = fields->excludes.list[r];
3723 }
3724 fields->excludes.len = w;
3725 }
3726 else
3727 {
3728 /* Current state = <includes> are locked.
3729 * Set all. <excludes> becomes <Fields> less <includes>. Remove <includes>. */
3730 fields->all = 1;
3731 for (i = 0; i < len; i++)
3732 {
3733 const char *s = pdf_to_text_string(ctx, pdf_array_get(ctx, f, i));
3734 for (r = 0; r < fields->includes.len; r++)
3735 {
3736 if (!strcmp(s, fields->includes.list[r]))
3737 break;
3738 }
3739 if (r == fields->includes.len)
3740 char_list_append(ctx, &fields->excludes, s);
3741 }
3742 free_char_list(ctx, &fields->includes);
3743 }
3744 }
3745 }
3746 }
3747
3748 static void
find_locked_fields_value(fz_context * ctx,pdf_locked_fields * fields,pdf_obj * v)3749 find_locked_fields_value(fz_context *ctx, pdf_locked_fields *fields, pdf_obj *v)
3750 {
3751 pdf_obj *ref = pdf_dict_get(ctx, v, PDF_NAME(Reference));
3752 int i, n;
3753
3754 if (!ref)
3755 return;
3756
3757 n = pdf_array_len(ctx, ref);
3758 for (i = 0; i < n; i++)
3759 {
3760 pdf_obj *sr = pdf_array_get(ctx, ref, i);
3761 pdf_obj *tm, *tp, *type;
3762
3763 /* Type is optional, but if it exists, it'd better be SigRef. */
3764 type = pdf_dict_get(ctx, sr, PDF_NAME(Type));
3765 if (type != NULL && !pdf_name_eq(ctx, type, PDF_NAME(SigRef)))
3766 continue;
3767 tm = pdf_dict_get(ctx, sr, PDF_NAME(TransformMethod));
3768 tp = pdf_dict_get(ctx, sr, PDF_NAME(TransformParams));
3769 if (pdf_name_eq(ctx, tm, PDF_NAME(DocMDP)))
3770 {
3771 int p = pdf_to_int(ctx, pdf_dict_get(ctx, tp, PDF_NAME(P)));
3772
3773 if (p == 0)
3774 p = 2;
3775 if (fields->p == 0)
3776 fields->p = p;
3777 else
3778 fields->p = fz_mini(fields->p, p);
3779 }
3780 else if (pdf_name_eq(ctx, tm, PDF_NAME(FieldMDP)))
3781 merge_lock_specification(ctx, fields, tp);
3782 }
3783 }
3784
3785 static void
find_locked_fields_aux(fz_context * ctx,pdf_obj * field,pdf_locked_fields * fields,pdf_obj * inherit_v,pdf_obj * inherit_ft)3786 find_locked_fields_aux(fz_context *ctx, pdf_obj *field, pdf_locked_fields *fields, pdf_obj *inherit_v, pdf_obj *inherit_ft)
3787 {
3788 int i, n;
3789
3790 if (!pdf_name_eq(ctx, pdf_dict_get(ctx, field, PDF_NAME(Type)), PDF_NAME(Annot)))
3791 return;
3792
3793 if (pdf_obj_marked(ctx, field))
3794 return;
3795
3796 fz_try(ctx)
3797 {
3798 pdf_obj *kids, *v, *ft;
3799
3800 pdf_mark_obj(ctx, field);
3801
3802 v = pdf_dict_get(ctx, field, PDF_NAME(V));
3803 if (v == NULL)
3804 v = inherit_v;
3805 ft = pdf_dict_get(ctx, field, PDF_NAME(FT));
3806 if (ft == NULL)
3807 ft = inherit_ft;
3808
3809 /* We are looking for Widget annotations of type Sig that are
3810 * signed (i.e. have a 'V' field). */
3811 if (pdf_name_eq(ctx, pdf_dict_get(ctx, field, PDF_NAME(Subtype)), PDF_NAME(Widget)) &&
3812 pdf_name_eq(ctx, ft, PDF_NAME(Sig)) &&
3813 pdf_name_eq(ctx, pdf_dict_get(ctx, v, PDF_NAME(Type)), PDF_NAME(Sig)))
3814 {
3815 /* Signed Sig Widgets (i.e. ones with a 'V' field) need
3816 * to have their lock field respected. */
3817 merge_lock_specification(ctx, fields, pdf_dict_get(ctx, field, PDF_NAME(Lock)));
3818
3819 /* Look for DocMDP and FieldMDP entries to see what
3820 * flavours of alterations are allowed. */
3821 find_locked_fields_value(ctx, fields, v);
3822 }
3823
3824 /* Recurse as required */
3825 kids = pdf_dict_get(ctx, field, PDF_NAME(Kids));
3826 if (kids)
3827 {
3828 n = pdf_array_len(ctx, kids);
3829 for (i = 0; i < n; i++)
3830 find_locked_fields_aux(ctx, pdf_array_get(ctx, kids, i), fields, v, ft);
3831 }
3832 }
3833 fz_always(ctx)
3834 pdf_unmark_obj(ctx, field);
3835 fz_catch(ctx)
3836 fz_rethrow(ctx);
3837 }
3838
3839 pdf_locked_fields *
pdf_find_locked_fields(fz_context * ctx,pdf_document * doc,int version)3840 pdf_find_locked_fields(fz_context *ctx, pdf_document *doc, int version)
3841 {
3842 pdf_locked_fields *fields = fz_malloc_struct(ctx, pdf_locked_fields);
3843 int o_xref_base = doc->xref_base;
3844 doc->xref_base = version;
3845
3846 fz_var(fields);
3847
3848 fz_try(ctx)
3849 {
3850 pdf_obj *fobj = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm/Fields");
3851 int i, len = pdf_array_len(ctx, fobj);
3852
3853 if (len == 0)
3854 break;
3855
3856 for (i = 0; i < len; i++)
3857 find_locked_fields_aux(ctx, pdf_array_get(ctx, fobj, i), fields, NULL, NULL);
3858
3859 /* Add in any DocMDP referenced directly from the Perms dict. */
3860 find_locked_fields_value(ctx, fields, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/Perms/DocMDP"));
3861 }
3862 fz_always(ctx)
3863 doc->xref_base = o_xref_base;
3864 fz_catch(ctx)
3865 {
3866 pdf_drop_locked_fields(ctx, fields);
3867 fz_rethrow(ctx);
3868 }
3869
3870 return fields;
3871 }
3872
3873 pdf_locked_fields *
pdf_find_locked_fields_for_sig(fz_context * ctx,pdf_document * doc,pdf_obj * sig)3874 pdf_find_locked_fields_for_sig(fz_context *ctx, pdf_document *doc, pdf_obj *sig)
3875 {
3876 pdf_locked_fields *fields = fz_malloc_struct(ctx, pdf_locked_fields);
3877
3878 fz_var(fields);
3879
3880 fz_try(ctx)
3881 {
3882 pdf_obj *ref;
3883 int i, len;
3884
3885 /* Ensure it really is a sig */
3886 if (!pdf_name_eq(ctx, pdf_dict_get(ctx, sig, PDF_NAME(Subtype)), PDF_NAME(Widget)) ||
3887 !pdf_name_eq(ctx, pdf_dict_get_inheritable(ctx, sig, PDF_NAME(FT)), PDF_NAME(Sig)))
3888 break;
3889
3890 /* Check the locking details given in the V (i.e. what the signature value
3891 * claims to lock). */
3892 ref = pdf_dict_getp(ctx, sig, "V/Reference");
3893 len = pdf_array_len(ctx, ref);
3894 for (i = 0; i < len; i++)
3895 {
3896 pdf_obj *tp = pdf_dict_get(ctx, pdf_array_get(ctx, ref, i), PDF_NAME(TransformParams));
3897 merge_lock_specification(ctx, fields, tp);
3898 }
3899
3900 /* Also, check the locking details given in the Signature definition. This may
3901 * not strictly be necessary as it's supposed to be "what the form author told
3902 * the signature that it should lock". A well-formed signature should lock
3903 * at least that much (possibly with extra fields locked from the XFA). If the
3904 * signature doesn't lock as much as it was told to, we should be suspicious
3905 * of the signing application. It is not clear that this test is actually
3906 * necessary, or in keeping with what Acrobat does. */
3907 merge_lock_specification(ctx, fields, pdf_dict_get(ctx, sig, PDF_NAME(Lock)));
3908 }
3909 fz_catch(ctx)
3910 {
3911 pdf_drop_locked_fields(ctx, fields);
3912 fz_rethrow(ctx);
3913 }
3914
3915 return fields;
3916 }
3917
3918 static int
validate_locked_fields(fz_context * ctx,pdf_document * doc,int version,pdf_locked_fields * locked)3919 validate_locked_fields(fz_context *ctx, pdf_document *doc, int version, pdf_locked_fields *locked)
3920 {
3921 int o_xref_base = doc->xref_base;
3922 pdf_changes *changes;
3923 int num_objs;
3924 int i, n;
3925 int all_indirects = 1;
3926
3927 num_objs = doc->max_xref_len;
3928 changes = Memento_label(fz_calloc(ctx, 1, sizeof(*changes) + sizeof(int)*(num_objs-1)), "pdf_changes");
3929 changes->num_obj = num_objs;
3930
3931 fz_try(ctx)
3932 {
3933 pdf_obj *acroform, *new_acroform, *old_acroform;
3934 int len, acroform_num;
3935
3936 doc->xref_base = version;
3937
3938 /* Detect every object that has changed */
3939 for (i = 1; i < num_objs; i++)
3940 {
3941 if (pdf_obj_changed_in_version(ctx, doc, i, version))
3942 changes->obj_changes[i] = FIELD_CHANGED;
3943 }
3944
3945 /* FIXME: Compare PageTrees and NumberTrees (just to allow for them being regenerated
3946 * and having produced stuff that represents the same stuff). */
3947
3948 /* The metadata of a document may be regenerated. Allow for that. */
3949 filter_changes_accepted(ctx, changes, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/Metadata"), &filter_simple);
3950
3951 /* The ModDate of document info may be regenerated. Allow for that. */
3952 /* FIXME: We accept all changes in document info, when maybe we ought to just
3953 * accept ModDate? */
3954 filter_changes_accepted(ctx, changes, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Info"), &filter_simple);
3955
3956 /* The Encryption dict may be rewritten for the new Xref. */
3957 filter_changes_accepted(ctx, changes, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Encrypt"), &filter_simple);
3958
3959 /* We have to accept certain changes in the top level AcroForms dict,
3960 * so get the 2 versions... */
3961 acroform = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm");
3962 acroform_num = pdf_to_num(ctx, acroform);
3963 new_acroform = pdf_resolve_indirect_chain(ctx, acroform);
3964 doc->xref_base = version+1;
3965 old_acroform = pdf_resolve_indirect_chain(ctx, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm"));
3966 doc->xref_base = version;
3967 n = pdf_dict_len(ctx, new_acroform);
3968 for (i = 0; i < n; i++)
3969 {
3970 pdf_obj *key = pdf_dict_get_key(ctx, new_acroform, i);
3971 pdf_obj *nval = pdf_dict_get(ctx, new_acroform, key);
3972 pdf_obj *oval = pdf_dict_get(ctx, old_acroform, key);
3973
3974 if (pdf_name_eq(ctx, key, PDF_NAME(Fields)))
3975 {
3976 int j;
3977
3978 len = pdf_array_len(ctx, nval);
3979 for (j = 0; j < len; j++)
3980 {
3981 pdf_obj *field = pdf_array_get(ctx, nval, j);
3982 if (!pdf_is_indirect(ctx, field))
3983 all_indirects = 0;
3984 check_field(ctx, doc, changes, field, locked, "", NULL, NULL);
3985 }
3986 }
3987 else if (pdf_name_eq(ctx, key, PDF_NAME(SigFlags)))
3988 {
3989 /* Accept this */
3990 changes->obj_changes[acroform_num] |= FIELD_CHANGE_VALID;
3991 }
3992 else if (pdf_name_eq(ctx, key, PDF_NAME(DR)))
3993 {
3994 /* Accept any changes from within the Document Resources */
3995 filter_changes_accepted(ctx, changes, nval, &filter_resources);
3996 }
3997 else if (pdf_name_eq(ctx, key, PDF_NAME(XFA)))
3998 {
3999 /* Allow any changes within the XFA streams. */
4000 filter_changes_accepted(ctx, changes, nval, &filter_xfa);
4001 }
4002 else if (pdf_objcmp(ctx, nval, oval))
4003 {
4004 changes->obj_changes[acroform_num] |= FIELD_CHANGE_INVALID;
4005 }
4006 }
4007
4008 /* Allow for any object streams/XRefs to be changed. */
4009 doc->xref_base = version+1;
4010 for (i = 1; i < num_objs; i++)
4011 {
4012 pdf_obj *oobj, *otype;
4013 if (changes->obj_changes[i] != FIELD_CHANGED)
4014 continue;
4015 if (!pdf_obj_exists(ctx, doc, i))
4016 {
4017 /* Not present this version - must be newly created, can't be a change. */
4018 changes->obj_changes[i] |= FIELD_CHANGE_VALID;
4019 continue;
4020 }
4021 oobj = pdf_load_object(ctx, doc, i);
4022 otype = pdf_dict_get(ctx, oobj, PDF_NAME(Type));
4023 if (pdf_name_eq(ctx, otype, PDF_NAME(ObjStm)) ||
4024 pdf_name_eq(ctx, otype, PDF_NAME(XRef)))
4025 {
4026 changes->obj_changes[i] |= FIELD_CHANGE_VALID;
4027 }
4028 pdf_drop_obj(ctx, oobj);
4029 }
4030 }
4031 fz_always(ctx)
4032 doc->xref_base = o_xref_base;
4033 fz_catch(ctx)
4034 fz_rethrow(ctx);
4035
4036 for (i = 1; i < num_objs; i++)
4037 {
4038 if (changes->obj_changes[i] == FIELD_CHANGED)
4039 /* Change with no reason */
4040 break;
4041 if (changes->obj_changes[i] & FIELD_CHANGE_INVALID)
4042 /* Illegal Change */
4043 break;
4044 }
4045
4046 fz_free(ctx, changes);
4047
4048 return (i == num_objs) && all_indirects;
4049 }
4050
4051 int
pdf_validate_changes(fz_context * ctx,pdf_document * doc,int version)4052 pdf_validate_changes(fz_context *ctx, pdf_document *doc, int version)
4053 {
4054 int unsaved_versions = pdf_count_unsaved_versions(ctx, doc);
4055 int n = pdf_count_versions(ctx, doc);
4056 pdf_locked_fields *locked = NULL;
4057 int result;
4058
4059 if (version < 0 || version >= n)
4060 fz_throw(ctx, FZ_ERROR_GENERIC, "There aren't that many changes to find in this document!");
4061
4062 /* We are wanting to compare version+1 with version to make sure
4063 * that the only changes made in going to version are conformant
4064 * with what was allowed in version+1. The production of version
4065 * might have involved signing a signature field and locking down
4066 * more fields - this means that taking the list of locked things
4067 * from version rather than version+1 will give us bad results! */
4068 locked = pdf_find_locked_fields(ctx, doc, unsaved_versions+version+1);
4069
4070 if (!locked->all && locked->includes.len == 0 && locked->p == 0)
4071 {
4072 /* If nothing is locked at all, then all changes are permissible. */
4073 result = 1;
4074 }
4075 else
4076 result = validate_locked_fields(ctx, doc, unsaved_versions+version, locked);
4077
4078 pdf_drop_locked_fields(ctx, locked);
4079
4080 return result;
4081 }
4082
4083 int
pdf_validate_change_history(fz_context * ctx,pdf_document * doc)4084 pdf_validate_change_history(fz_context *ctx, pdf_document *doc)
4085 {
4086 int num_versions = pdf_count_versions(ctx, doc);
4087 int v;
4088
4089 if (num_versions < 2)
4090 return 0; /* Unless there are at least 2 versions, there have been no updates. */
4091
4092 for(v = num_versions - 2; v >= 0; v--)
4093 {
4094 if (!pdf_validate_changes(ctx, doc, v))
4095 return v+1;
4096 }
4097 return 0;
4098 }
4099
4100 /* Return the version that obj appears in, or -1 for not found. */
4101 static int
pdf_find_incremental_update_num_for_obj(fz_context * ctx,pdf_document * doc,pdf_obj * obj)4102 pdf_find_incremental_update_num_for_obj(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
4103 {
4104 pdf_xref *xref = NULL;
4105 pdf_xref_subsec *sub;
4106 int i, j;
4107
4108 if (obj == NULL)
4109 return -1;
4110
4111 /* obj needs to be indirect for us to get a num out of it. */
4112 i = pdf_to_num(ctx, obj);
4113 if (i <= 0)
4114 return -1;
4115
4116 /* obj can't be indirect below, so resolve it here. */
4117 obj = pdf_resolve_indirect_chain(ctx, obj);
4118
4119 /* Find the first xref section where the entry is defined. */
4120 for (j = 0; j < doc->num_xref_sections; j++)
4121 {
4122 xref = &doc->xref_sections[j];
4123
4124 if (i < xref->num_objects)
4125 {
4126 for (sub = xref->subsec; sub != NULL; sub = sub->next)
4127 {
4128 pdf_xref_entry *entry;
4129
4130 if (i < sub->start || i >= sub->start + sub->len)
4131 continue;
4132
4133 entry = &sub->table[i - sub->start];
4134 if (entry->obj == obj)
4135 return j;
4136 }
4137 }
4138 }
4139 return -1;
4140 }
4141
/* Return the xref section index that obj is found in, capped at the
 * total number of saved plus unsaved versions, or -1 if it is not
 * found. */
int pdf_find_version_for_obj(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
{
	int found = pdf_find_incremental_update_num_for_obj(ctx, doc, obj);
	int limit;

	if (found == -1)
		return -1;

	limit = pdf_count_versions(ctx, doc) + pdf_count_unsaved_versions(ctx, doc);
	return (found > limit) ? limit : found;
}
4156
/* Validate the updates made after the given signature widget was
 * signed, against the lock definition taken from that signature.
 * Versions are checked from the one just after the signature towards
 * the newest saved one, stopping at the first that fails
 * validate_locked_fields. The return value is that loop position plus
 * one, less the number of unsaved versions (so 0 when every version
 * checked passed and the loop ran to completion). doc->xref_base is
 * restored on exit. */
int pdf_validate_signature(fz_context *ctx, pdf_widget *widget)
{
	pdf_document *doc = widget->page->doc;
	int unsaved = pdf_count_unsaved_versions(ctx, doc);
	int total = pdf_count_versions(ctx, doc) + unsaved;
	int signed_in = pdf_find_version_for_obj(ctx, doc, widget->obj);
	pdf_locked_fields *locked = NULL;
	int saved_base;
	int v;

	if (signed_in > total-1)
		signed_in = total-1;

	/* Get the locked definition from the object when it was signed. */
	saved_base = doc->xref_base;
	doc->xref_base = signed_in;

	fz_var(locked); /* Not really needed, but it stops warnings */

	fz_try(ctx)
	{
		locked = pdf_find_locked_fields_for_sig(ctx, doc, widget->obj);

		v = signed_in-1;
		while (v >= unsaved)
		{
			doc->xref_base = v;
			if (!validate_locked_fields(ctx, doc, v, locked))
				break;
			v--;
		}
	}
	fz_always(ctx)
	{
		doc->xref_base = saved_base;
		pdf_drop_locked_fields(ctx, locked);
	}
	fz_catch(ctx)
		fz_rethrow(ctx);

	return v+1-unsaved;
}
4196
/*
 * Determine whether any examined version of the document looks like a
 * pure XFA form.
 *
 * doc->xref_base is stepped down from (saved versions + unsaved versions)
 * to the number of unsaved versions. For each value, Root/AcroForm is
 * looked up via the trailer; if its Fields array has length 0 and it has
 * an XFA entry, the document is deemed to have been pure XFA at one time
 * and the search stops.
 *
 * Returns 1 if such a version is found, 0 otherwise. doc->xref_base is
 * restored on both the normal and the exception path; exceptions are
 * rethrown.
 */
int pdf_was_pure_xfa(fz_context *ctx, pdf_document *doc)
{
	int unsaved = pdf_count_unsaved_versions(ctx, doc);
	int saved = pdf_count_versions(ctx, doc);
	int saved_xref_base = doc->xref_base;
	int pure_xfa = 0;
	int v;

	fz_var(pure_xfa);

	fz_try(ctx)
	{
		v = saved + unsaved;
		while (!pure_xfa && v >= unsaved)
		{
			pdf_obj *acroform;
			pdf_obj *fields;

			doc->xref_base = v;
			acroform = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm");
			fields = pdf_dict_get(ctx, acroform, PDF_NAME(Fields));

			/* Empty Fields together with an XFA entry marks a pure XFA form. */
			if (pdf_array_len(ctx, fields) == 0 &&
				pdf_dict_get(ctx, acroform, PDF_NAME(XFA)) != NULL)
			{
				pure_xfa = 1;
			}

			v--;
		}
	}
	fz_always(ctx)
	{
		doc->xref_base = saved_xref_base;
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}

	return pure_xfa;
}
4229