1 /* Copyright (C) 2002-2005 Ghostgum Software Pty Ltd. All rights reserved.
2
3 This software is provided AS-IS with no warranty, either express or
4 implied.
5
6 This software is distributed under licence and may not be copied,
7 modified or distributed except as expressly authorised under the terms
8 of the licence contained in the file LICENCE in this distribution.
9 */
10
11 /* $Id: cpdfscan.c,v 1.7 2005/06/10 09:39:24 ghostgum Exp $ */
12 /* PDF scanner */
13
14 /* This is a rudimentary PDF scanner, intended to get
15 * the page count, and for each page the Rotate, MediaBox
16 * and CropBox.
17 */
18
19 #ifdef DEMO_PDFSCAN
20 # include <windows.h>
21 # include <stdio.h>
22 # include <stdarg.h>
23 # include <string.h>
24 # include <ctype.h>
25 # ifdef _MSC_VER
26 # define vsnprintf _vsnprintf
27 # endif
28 # define csfopen fopen
29 # define cslen strlen
30 #else
31 # include "common.h"
32 # include <ctype.h>
33 #endif
34
35 #include "cpdfscan.h"
36
37
38 /* Limitations.
39 *
40 * We currently load the entire xref table. To minimise memory
 * we could instead keep a list of xref blocks, and do random
42 * access within those.
43 *
44 * Memory management is very simple. We just keep a linked
45 * list of allocated blocks for composite objects.
46 * We empty the stack, and free all PDF objects and composite
47 * objects before returning to the caller.
48 * We don't bother doing garbage collection.
49 */
50
51
/* Bookkeeping node for one heap block owned by the scanner.
 * Each composite object (name, string, array or dict) is copied into
 * its own malloc'd block; the nodes form a singly linked list so that
 * all blocks can be released in one pass.
 */
typedef struct PDFMEM_s PDFMEM;
struct PDFMEM_s {
    void *ptr;          /* the allocated data block */
    int len;            /* size of the data block in bytes */
    PDFMEM *next;       /* following node, or NULL at the tail */
};
61
/* Types understood by the token scanner and carried by references.
 * The numeric values run from 0 to 13 in declaration order and are
 * used as indices into rtype_string[], so the order must not change.
 */
typedef enum rtype_e {
    invalidtype = 0,
    marktype,
    nulltype,
    booltype,           /* uses boolval */
    integertype,        /* uses intval */
    realtype,           /* uses realval */
    nametype,           /* uses nameval */
    stringtype,         /* uses strval */
    arraytype,          /* uses arrayval */
    dicttype,           /* uses dictval */
    optype,             /* uses opval */
    streamtype,         /* uses streamval */
    objtype,            /* uses objval */
    commenttype
} rtype;
79
/* Printable name of each rtype value, indexed by the enum value. */
const char *rtype_string[] = {
    "invalidtype",
    "marktype",
    "nulltype",
    "booltype",
    "integertype",
    "realtype",
    "nametype",
    "stringtype",
    "arraytype",
    "dicttype",
    "optype",
    "streamtype",
    "objtype",
    "commenttype"
};
85
86 /* A reference contains a simple object, or a pointer to
87 * a composite object.
88 */
89 typedef struct ref_s ref;
90 struct ref_s {
91 rtype type;
92 int rsize;
93 union value_u {
94 /* simple */
95 void *voidval;
96 BOOL boolval;
97 int intval;
98 float realval;
99 /* composite */
100 char *nameval;
101 char *strval;
102 ref *arrayval;
103 ref *dictval;
104 char *opval;
105 /* simple */
106 unsigned long streamval;
107 int objval;
108 } value;
109 };
110
111 /* Cross reference table entry */
112 typedef struct PDFXREF_s {
113 unsigned long offset;
114 int generation;
115 BOOL used;
116 } PDFXREF;
117
118 struct PDFSCAN_s {
119 void *handle;
120 int (*print_fn)(void *handle, const char *ptr, int len);
121 TCHAR filename[1024];
122 FILE *file;
123 char *buf;
124 int buflen; /* length of allocated buf */
125 int len; /* #bytes currently in buf */
126 int offset; /* file offset to start of buf */
127 int begin; /* offset in buf to start of token */
128 int end; /* offset in buf to end of token */
129 rtype token_type; /* token type */
130 BOOL instream; /* In a stream, looking for endstream */
131 unsigned long xref_offset; /* offset to xref table */
132 PDFXREF *xref;
133 int xref_len;
134
135 /* Object numbers obtained during pdf_scan_open() */
136 int root; /* root object reference */
137 int info; /* document info dicionary reference */
138 int pages; /* Pages dictionary reference */
139 int page_count; /* number of pages */
140
141 /* Cached page media */
142 int pagenum;
143 int rotate;
144 PDFBBOX mediabox;
145 PDFBBOX cropbox;
146
147 /* memory allocation */
148 PDFMEM *memory_head;
149 PDFMEM *memory_tail;
150
151 /* operand stack */
152 ref *ostack;
153 int ostack_idx; /* index to top of ostack */
154 int ostack_len; /* Initially 512 */
155 int ostack_maxlen; /* maximum depth of ostack */
156
157 /* objects in memory */
158 /* This contains pairs of integer & reference */
159 ref *objs;
160 int objs_count; /* count of loaded objects */
161 int objs_len; /* length of objs */
162 int objs_maxlen; /* maximum number entries in objs */
163 };
164
/* Origin selector for pdf_scan_seek() */
typedef enum PDFSEEK_e {
    PDFSEEK_CUR,        /* relative to the end of the current token */
    PDFSEEK_END,        /* the end of the file */
    PDFSEEK_SET         /* absolute file offset */
} PDFSEEK;
170
171
172 /* Prototypes */
173 static int pdf_scan_next_token(PDFSCAN *ps);
174 static int pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev);
175 static int pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset);
176
177 static void clear_stack(PDFSCAN *ps);
178 static void clear_objs(PDFSCAN *ps);
179 static void pdf_scan_freeall(PDFSCAN *ps);
180 static void pdf_scan_cleanup(PDFSCAN *ps);
181 static int pdf_scan_open_file(PDFSCAN *ps);
182
183
184 /*****************************************************************/
185 /* text message output */
186
187 static int
pdf_scan_write(PDFSCAN * ps,const char * str,int len)188 pdf_scan_write(PDFSCAN *ps, const char *str, int len)
189 {
190 if (ps != NULL)
191 fwrite(str, 1, len, stdout);
192 else
193 (*ps->print_fn)(ps->handle, str, len);
194 return len;
195 }
196
197 static int
pdf_scan_msgf(PDFSCAN * ps,const char * fmt,...)198 pdf_scan_msgf(PDFSCAN *ps, const char *fmt, ...)
199 {
200 va_list args;
201 int count;
202 char buf[2048];
203 va_start(args,fmt);
204 count = vsnprintf(buf, sizeof(buf), fmt, args);
205 pdf_scan_write(ps, buf, count);
206 va_end(args);
207 return count;
208 }
209
210 /*****************************************************************/
211 /* memory allocation */
212
213 static void
pdf_scan_cleanup(PDFSCAN * ps)214 pdf_scan_cleanup(PDFSCAN *ps)
215 {
216 if (ps->file)
217 fclose(ps->file);
218 ps->file = NULL;
219 clear_stack(ps);
220 clear_objs(ps);
221 pdf_scan_freeall(ps);
222 }
223
/* Allocate len bytes, fill them with a copy of the data at ptr and
 * record the block at the tail of the scanner's allocation list.
 * Returns the new block, or NULL if either allocation fails.
 * The block is released by pdf_scan_freeall().
 */
static void *pdf_scan_alloc(PDFSCAN *ps, const void *ptr, int len)
{
    PDFMEM *node;
    void *copy;

    node = (PDFMEM *)malloc(sizeof(PDFMEM));
    if (node == NULL)
        return NULL;

    copy = malloc(len);
    if (copy == NULL) {
        free(node);
        return NULL;
    }
    memcpy(copy, ptr, len);

    node->ptr = copy;
    node->len = len;
    node->next = NULL;

    /* append to the list, which may still be empty */
    if (ps->memory_tail == NULL)
        ps->memory_head = node;
    else
        ps->memory_tail->next = node;
    ps->memory_tail = node;

    return copy;
}
250
251 /* free all name/string/array/dict memory */
252 static void
pdf_scan_freeall(PDFSCAN * ps)253 pdf_scan_freeall(PDFSCAN *ps)
254 {
255 PDFMEM *memnext;
256 PDFMEM *mem = ps->memory_head;
257 while (mem) {
258 memnext = mem->next;
259 free(mem->ptr);
260 free(mem);
261 mem = memnext;
262 }
263 ps->memory_head = ps->memory_tail = NULL;
264 }
265
266 /*****************************************************************/
267 /* Token checks */
268
is_optoken(PDFSCAN * ps,const char * str)269 static BOOL is_optoken(PDFSCAN *ps, const char *str)
270 {
271 return (ps->token_type == optype) &&
272 (ps->end-ps->begin == (int)strlen(str)) &&
273 (memcmp(ps->buf+ps->begin, str, ps->end-ps->begin) == 0);
274 }
275
276 static int
type_check(PDFSCAN * ps,rtype type)277 type_check(PDFSCAN *ps, rtype type)
278 {
279 if (ps->token_type == type)
280 return 0;
281
282 pdf_scan_msgf(ps, "Error at offset %ld. Expecting %s and found %s\n",
283 ps->offset + ps->begin,
284 rtype_string[(int)type],
285 rtype_string[(int)ps->token_type]);
286 pdf_scan_msgf(ps, "Token is \042");
287 pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
288 pdf_scan_msgf(ps, "\042\n");
289 return -1;
290 }
291
292 static int
op_check(PDFSCAN * ps,const char * str)293 op_check(PDFSCAN *ps, const char *str)
294 {
295 int code = type_check(ps, optype);
296 if (code)
297 return code;
298
299 if (!is_optoken(ps, str)) {
300 pdf_scan_msgf(ps,
301 "Error at offset %ld. Expecting \042%s\042 and found \042",
302 ps->offset + ps->begin, str);
303 pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
304 pdf_scan_msgf(ps, "\042\n");
305 code = -1;
306 }
307 return code;
308 }
309
310 /*****************************************************************/
311 /* stack */
312
313 const ref invalidref = {invalidtype, 0, {NULL}};
314 const ref markref = {marktype, 0, {NULL}};
315
316 /* Push item, return depth of stack */
317 /* >0 is success, <=0 is failure */
push_stack(PDFSCAN * ps,ref r)318 static int push_stack(PDFSCAN *ps, ref r)
319 {
320 int idx;
321 if (ps->ostack_idx + 1 >= ps->ostack_len) {
322 /* increase stack size */
323 ref *newstack;
324 int newlen = ps->ostack_len + 256;
325 if (newlen > ps->ostack_maxlen) {
326 pdf_scan_msgf(ps, "push_stack: stack overflow\n");
327 return 0;
328 }
329 newstack = (ref *)malloc(newlen * sizeof(ref));
330 if (newstack == NULL) {
331 pdf_scan_msgf(ps, "push_stack: Out of memory\n");
332 return 0;
333 }
334 memcpy(newstack, ps->ostack, ps->ostack_len * sizeof(ref));
335 free(ps->ostack);
336 ps->ostack = newstack;
337 ps->ostack_len = newlen;
338 }
339 idx = ++(ps->ostack_idx);
340 ps->ostack[idx] = r;
341 return idx;
342 }
343
pop_stack(PDFSCAN * ps)344 static ref pop_stack(PDFSCAN *ps)
345 {
346 if (ps->ostack_idx <= 0) {
347 pdf_scan_msgf(ps, "pop_stack: stack underflow\n");
348 return invalidref;
349 }
350 return ps->ostack[ps->ostack_idx--];
351 }
352
/* Empty the operand stack.  Only the index is reset; the stack
 * storage is kept.
 */
static void clear_stack(PDFSCAN *ps)
{
    ps->ostack_idx = 0;
}
357
index_stack(PDFSCAN * ps,int n)358 static ref index_stack(PDFSCAN *ps, int n)
359 {
360 if (n < 0) {
361 pdf_scan_msgf(ps, "index_stack: index must not be negative\n");
362 return invalidref;
363 }
364 if (ps->ostack_idx <= n) {
365 pdf_scan_msgf(ps, "index_stack: stack isn't that deep\n");
366 return invalidref;
367 }
368 return ps->ostack[ps->ostack_idx-n];
369 }
370
top_stack(PDFSCAN * ps)371 static ref top_stack(PDFSCAN *ps)
372 {
373 if (ps->ostack_idx <= 0) {
374 pdf_scan_msgf(ps, "top_stack: stack is empty\n");
375 return invalidref;
376 }
377 return ps->ostack[ps->ostack_idx];
378 }
379
380 /*****************************************************************/
381 /* references */
382
383
make_int(int value)384 static ref make_int(int value)
385 {
386 ref r;
387 r.type = integertype;
388 r.rsize = 0;
389 r.value.intval = value;
390 return r;
391 }
392
make_string(PDFSCAN * ps,const char * str,int len)393 static ref make_string(PDFSCAN *ps, const char *str, int len)
394 {
395 ref r;
396 r.type = stringtype;
397 r.rsize = len;
398 r.value.strval = pdf_scan_alloc(ps, str, len);
399 if (r.value.strval == NULL)
400 return invalidref;
401 return r;
402 }
403
make_name(PDFSCAN * ps,const char * str,int len)404 static ref make_name(PDFSCAN *ps, const char *str, int len)
405 {
406 ref r;
407 r.type = nametype;
408 r.rsize = len;
409 r.value.nameval = pdf_scan_alloc(ps, str, len);
410 if (r.value.nameval == NULL)
411 return invalidref;
412 return r;
413 }
414
nameref_equals(ref * r,const char * name)415 static BOOL nameref_equals(ref *r, const char *name)
416 {
417 int len = (int)strlen(name);
418 if (r->type != nametype)
419 return FALSE;
420 if (r->rsize != len)
421 return FALSE;
422 return (memcmp(r->value.nameval, name, len) == 0);
423 }
424
425 /* Get a reference from a dictionary */
426 /* Return the result, but don't push it */
dict_get(PDFSCAN * ps,const char * name)427 static ref dict_get(PDFSCAN *ps, const char *name)
428 {
429 int namelen = (int)strlen(name);
430 ref dict = top_stack(ps);
431 ref *r;
432 int dictlen;
433 int i;
434 if (dict.type == invalidtype)
435 return invalidref;
436 dictlen = dict.rsize * 2;
437 for (i = 0; i<dictlen; i+=2) {
438 r = &dict.value.dictval[i];
439 if ((r->rsize == namelen) && (r->type == nametype) &&
440 (memcmp(r->value.nameval, name, namelen) ==0))
441 return dict.value.dictval[i+1];
442 }
443 return invalidref;
444 }
445
446 /* convert the items on the stack to an array on the stack */
array_to_mark(PDFSCAN * ps)447 static ref array_to_mark(PDFSCAN *ps)
448 {
449 ref r;
450 ref *array;
451 int n = ps->ostack_idx;
452 int len;
453 while ((n>0) && (ps->ostack[n].type != marktype))
454 n--;
455 if (n == 0) {
456 pdf_scan_msgf(ps, "array_to_mark: no mark on stack\n");
457 return invalidref;
458 }
459 len = ps->ostack_idx - n;
460 r.type = arraytype;
461 r.rsize = len;
462 r.value.arrayval = NULL;
463 if (len) {
464 array = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
465 if (array)
466 r.value.arrayval = array;
467 else
468 return invalidref;
469 }
470 ps->ostack_idx -= len + 1;
471 push_stack(ps, r);
472 return r;
473 }
474
475 /* convert the items on the stack to a dictionary on the stack */
dict_to_mark(PDFSCAN * ps)476 static ref dict_to_mark(PDFSCAN *ps)
477 {
478 ref r;
479 ref *dict;
480 int n = ps->ostack_idx;
481 int len;
482 while ((n>0) && (ps->ostack[n].type != marktype))
483 n--;
484 if (n == 0) {
485 pdf_scan_msgf(ps, "dict_to_mark: no mark on stack\n");
486 return invalidref;
487 }
488 len = ps->ostack_idx - n;
489 if (len & 1) {
490 pdf_scan_msgf(ps, "dict_to_mark: must have name/value pairs\n");
491 return invalidref;
492 }
493 r.type = dicttype;
494 r.rsize = len/2;
495 r.value.arrayval = NULL;
496 if (len) {
497 dict = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
498 if (dict)
499 r.value.arrayval = dict;
500 else
501 return invalidref;
502 }
503 ps->ostack_idx -= len + 1;
504 push_stack(ps, r);
505 return r;
506 }
507
508 /*****************************************************************/
509
510 /* Push reference from a token */
push_token(PDFSCAN * ps)511 static ref push_token(PDFSCAN *ps)
512 {
513 ref r;
514 int len = ps->end - ps->begin;
515 const char *p = ps->buf + ps->begin;
516 r.type = ps->token_type;
517 r.rsize = 0;
518 r.value.voidval = NULL;
519 switch(r.type) {
520 case invalidtype:
521 break;
522 case marktype:
523 break;
524 case nulltype:
525 break;
526 case booltype:
527 if ((len == 4) && (memcmp(p, "true", 4)==0))
528 r.value.boolval = TRUE;
529 else if ((len == 5) && (memcmp(p, "true", 5)==0))
530 r.value.boolval = FALSE;
531 else
532 r = invalidref;
533 break;
534 case integertype:
535 { char buf[64];
536 if (len > (int)sizeof(buf)-1)
537 r = invalidref;
538 else {
539 memcpy(buf, p, len);
540 buf[len] = '\0';
541 r.value.intval = atoi(buf);
542 }
543 }
544 break;
545 case realtype:
546 { char buf[64];
547 if (len > (int)sizeof(buf)-1)
548 r = invalidref;
549 else {
550 memcpy(buf, p, len);
551 buf[len] = '\0';
552 r.value.realval = (float)atof(buf);
553 }
554 }
555 break;
556 case nametype:
557 r = make_name(ps, p+1, len-1);
558 break;
559 case stringtype:
560 r = make_string(ps, p, len);
561 break;
562 case streamtype:
563 case commenttype:
564 case objtype:
565 case optype:
566 case arraytype:
567 case dicttype:
568 /* Can't push these from a token */
569 /* These are made by operators like stream, R, ], >> */
570 return invalidref;
571 default:
572 r.type = invalidtype;
573 break;
574 }
575 push_stack(ps, r);
576 return r;
577 }
578
579 /* Process known operators */
process_op(PDFSCAN * ps)580 static int process_op(PDFSCAN *ps)
581 {
582 ref r;
583 if (ps->token_type != optype)
584 return 1; /* not an op */
585 if (is_optoken(ps, "R")) {
586 /* convert "n 0 R" to an indirect reference */
587 ref r1 = index_stack(ps, 1);
588 r = top_stack(ps);
589 if ((r.type == integertype) && (r1.type == integertype)) {
590 r.type = objtype;
591 r.rsize = r.value.intval;
592 r.value.intval = r1.value.intval;
593 pop_stack(ps);
594 pop_stack(ps);
595 push_stack(ps, r);
596 }
597 }
598 else if (is_optoken(ps, "]")) {
599 array_to_mark(ps);
600 }
601 else if (is_optoken(ps, ">>")) {
602 dict_to_mark(ps);
603 }
604 else if (is_optoken(ps, "null")) {
605 r.type = nulltype;
606 r.rsize = 0;
607 r.value.voidval = NULL;
608 push_stack(ps, r);
609 }
610 else if (is_optoken(ps, "obj")) {
611 pdf_scan_msgf(ps, "ignoring obj token\n");
612 /* ignore */
613 }
614 else if (is_optoken(ps, "endobj")) {
615 pdf_scan_msgf(ps, "ignoring endobj token\n");
616 /* ignore */
617 }
618 else if (is_optoken(ps, "stream")) {
619 /* stream object contains offset to start of stream */
620 r.type = streamtype;
621 r.rsize = 0;
622 r.value.streamval = ps->offset + ps->end;
623 push_stack(ps, r);
624 /* Now skip over stream */
625 pdf_scan_next_token(ps);
626 }
627 else {
628 pdf_scan_msgf(ps, "process_op: unrecognised operator \042");
629 pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
630 pdf_scan_msgf(ps, "\042\n");
631 return -1;
632 }
633 return 0;
634 }
635
636 /*****************************************************************/
637 /* Debugging and error messages */
638
639 #ifdef NOTUSED
640
641 /* Print a reference, returning number of characters written */
642 static int
print_ref(PDFSCAN * ps,ref * r)643 print_ref(PDFSCAN *ps, ref *r)
644 {
645 int n = 0;
646 switch(r->type) {
647 case invalidtype:
648 n = pdf_scan_msgf(ps, "--invalid--");
649 break;
650 case marktype:
651 n = pdf_scan_msgf(ps, "--mark--");
652 break;
653 case nulltype:
654 n = pdf_scan_msgf(ps, "--null--");
655 break;
656 case booltype:
657 n = pdf_scan_msgf(ps, "%s", r->value.boolval ? "true" : "false");
658 break;
659 case integertype:
660 n = pdf_scan_msgf(ps, "%d", r->value.intval);
661 break;
662 case realtype:
663 n = pdf_scan_msgf(ps, "%g", r->value.realval);
664 break;
665 case nametype:
666 n = pdf_scan_write(ps, "/", 1);
667 pdf_scan_write(ps, r->value.nameval, r->rsize);
668 break;
669 case stringtype:
670 n = pdf_scan_write(ps, "(", 1);
671 n += pdf_scan_write(ps, r->value.strval, r->rsize);
672 n += pdf_scan_write(ps, ")", 1);
673 break;
674 case streamtype:
675 n = pdf_scan_msgf(ps, "--stream:%d--", r->value.streamval);
676 break;
677 case commenttype:
678 n = pdf_scan_msgf(ps, "--comment--");
679 break;
680 case objtype:
681 n = pdf_scan_msgf(ps, "--obj:%d--", r->value.objval);
682 break;
683 case optype:
684 n = pdf_scan_msgf(ps, "--op:");
685 n += pdf_scan_write(ps, r->value.opval, r->rsize);
686 n += pdf_scan_write(ps, "--", 2);
687 break;
688 case arraytype:
689 n = pdf_scan_msgf(ps, "--array:%d--", r->rsize);
690 break;
691 case dicttype:
692 n = pdf_scan_msgf(ps, "--dict:%d--", r->rsize);
693 break;
694 default:
695 n = pdf_scan_msgf(ps, "--unknown--");
696 break;
697 }
698 return n;
699 }
700
701 /* print a reference, expanding array and dict */
702 static int
print_ref_expand(PDFSCAN * ps,ref * r)703 print_ref_expand(PDFSCAN *ps, ref *r)
704 {
705 int i;
706 int n = 0;;
707 if (r->type == arraytype) {
708 n += pdf_scan_msgf(ps, "[ ");
709 for (i=0; i<r->rsize; i++) {
710 n += print_ref(ps, &r->value.arrayval[i]);
711 n += pdf_scan_msgf(ps, " ");
712 }
713 n += pdf_scan_msgf(ps, "]");
714 }
715 else if (r->type == dicttype) {
716 n += pdf_scan_msgf(ps, "<< ");
717 for (i=0; i<r->rsize; i++) {
718 n += print_ref(ps, &r->value.dictval[i+i]);
719 n += pdf_scan_msgf(ps, " ");
720 n += print_ref(ps, &r->value.dictval[i+i+1]);
721 n += pdf_scan_msgf(ps, " ");
722 }
723 n += pdf_scan_msgf(ps, ">>");
724 }
725 else
726 n += print_ref(ps, r);
727 return n;
728 }
729
730 static void
print_stack(PDFSCAN * ps)731 print_stack(PDFSCAN *ps)
732 {
733 int i, n=ps->ostack_idx;
734 int col = 0;
735 pdf_scan_msgf(ps, "Stack: ");
736 for (i=1; i<=n; i++) {
737 col += print_ref(ps, &ps->ostack[i]);
738 if (col > 70) {
739 pdf_scan_msgf(ps, "\n");
740 col = 0;
741 }
742 else
743 col += pdf_scan_msgf(ps, " ");
744 }
745 pdf_scan_msgf(ps, "\n");
746 }
747
748 static void
print_stack_expand(PDFSCAN * ps)749 print_stack_expand(PDFSCAN *ps)
750 {
751 int i, n=ps->ostack_idx;
752 pdf_scan_msgf(ps, "Stack:\n");
753 for (i=1; i<=n; i++) {
754 pdf_scan_msgf(ps, "%2d: ", i);
755 print_ref_expand(ps, &ps->ostack[i]);
756 pdf_scan_msgf(ps, "\n");
757 }
758 }
759
/* Report how many blocks are on the allocation list and how many
 * bytes they occupy, counting the list nodes themselves.
 */
static void pdf_scan_print_allocated(PDFSCAN *ps)
{
    int blocks = 0;
    int bytes = 0;
    PDFMEM *node;

    for (node = ps->memory_head; node != NULL; node = node->next) {
        bytes += sizeof(PDFMEM);
        bytes += node->len;
        blocks++;
    }
    pdf_scan_msgf(ps, "Allocated memory %d bytes in %d objects\n",
        bytes, blocks);
}
774
775 #endif
776
777 /*****************************************************************/
778 /* object reading and cache */
779
/* Append an object to the cache as a pair of entries: the object
 * number as an integer reference followed by objref.
 * The cache grows in steps of 256 entries up to ps->objs_maxlen.
 * Returns the number of cache entries in use (>0) on success, or 0
 * if the cache would exceed its maximum length or memory is
 * exhausted.
 */
static int obj_add(PDFSCAN *ps, int objnum, ref objref)
{
    if (ps->objs_count + 2 >= ps->objs_len) {
        /* not enough room for another pair, move to a larger table */
        int grown = ps->objs_len + 256;
        ref *bigger;

        if (grown > ps->objs_maxlen) {
            pdf_scan_msgf(ps, "obj_add: too many objects to cache\n");
            return 0;
        }
        bigger = (ref *)malloc(grown * sizeof(ref));
        if (bigger == NULL) {
            pdf_scan_msgf(ps, "obj_add: Out of memory\n");
            return 0;
        }
        memcpy(bigger, ps->objs, ps->objs_len * sizeof(ref));
        free(ps->objs);
        ps->objs_len = grown;
        ps->objs = bigger;
    }

    ps->objs[ps->objs_count] = make_int(objnum);
    ps->objs_count++;
    ps->objs[ps->objs_count] = objref;
    ps->objs_count++;
    return ps->objs_count;
}
804
obj_find(PDFSCAN * ps,int objnum)805 static ref obj_find(PDFSCAN *ps, int objnum)
806 {
807 int i;
808 for (i=0; i<ps->objs_count; i+=2) {
809 if (objnum == ps->objs[i].value.intval)
810 return ps->objs[i+1];
811 }
812 return invalidref;
813 }
814
/* Empty the object cache.  Only the count is reset; the cache
 * storage is kept.
 */
static void clear_objs(PDFSCAN *ps)
{
    ps->objs_count = 0;
}
819
820 /*****************************************************************/
821 /* token parsing */
822
/* Return 1 if ch is one of the white space characters NUL, tab,
 * line feed, form feed, carriage return or space, otherwise 0.
 */
static int is_white(char ch)
{
    switch (ch) {
        case '\0':
        case '\t':
        case '\n':
        case '\f':
        case '\r':
        case ' ':
            return 1;
        default:
            return 0;
    }
}
828
/* Return 1 if ch is one of the delimiter characters
 * ( ) < > [ ] { } / %, otherwise 0.
 */
static int is_delimiter(char ch)
{
    switch (ch) {
        case '(':
        case ')':
        case '<':
        case '>':
        case '[':
        case ']':
        case '{':
        case '}':
        case '/':
        case '%':
            return 1;
        default:
            return 0;
    }
}
837
838
839 /* Scan next token from buffer, returning token type and offset to begin
840 * and end of token.
841 * Return 0 if OK, 1 if no token or not enough data, -1 on error
842 */
pdf_scan_token(const char * buf,int buflen,rtype * ttype,int * tbegin,int * tend)843 static int pdf_scan_token(const char *buf, int buflen,
844 rtype *ttype, int *tbegin, int *tend)
845 {
846 int code = -1;
847 int i = 0;
848 rtype type;
849 int begin, end;
850 *ttype = type = invalidtype;
851 *tbegin = begin = 0;
852 *tend = end = 0;
853 while ((i < buflen) && is_white(buf[i]))
854 i++;
855 if (i == buflen)
856 return 1;
857
858 begin = i;
859 if (buf[i] == '%') {
860 while (i < buflen) {
861 if ((buf[i] == '\n') || (buf[i] == '\r')) {
862 type = commenttype;
863 end = i;
864 code = 0;
865 break;
866 }
867 i++;
868 }
869 if (i >= buflen)
870 code = 1;
871
872 }
873 else if (buf[i] == '(') {
874 /* string */
875 int pcount = 0;
876 type = stringtype;
877 i++;
878 while (i < buflen) {
879 if (buf[i] == '\\')
880 i++;
881 else if (buf[i] == '(')
882 pcount++;
883 else if (buf[i] == ')') {
884 if (pcount <= 0) {
885 end = i+1;
886 code = 0;
887 break;
888 }
889 else
890 pcount--;
891 }
892 i++;
893 }
894 if (i >= buflen)
895 code = 1;
896 }
897 else if (buf[i] == '<') {
898 i++;
899 if (i >= buflen) {
900 code = 1;
901 }
902 else if (buf[i] == '<') {
903 /* marktype */
904 end = i+1;
905 type = marktype;
906 code = 0;
907 }
908 else {
909 /* hexadecimal string */
910 type = stringtype;
911 while (i < buflen) {
912 if (buf[i] == '>') {
913 end = i+1;
914 code = 0;
915 break;
916 }
917 i++;
918 }
919 if (i >= buflen)
920 code = 1;
921 }
922 }
923 else if (buf[i] == '[') {
924 code = 0;
925 end = i+1;
926 type = marktype;
927 }
928 else if (buf[i] == '/') {
929 /* name */
930 type = nametype;
931 i++;
932 while (i < buflen) {
933 if (is_white(buf[i]) || is_delimiter(buf[i])) {
934 end = i;
935 code = 0;
936 break;
937 }
938 i++;
939 }
940 if (i >= buflen)
941 code = 1;
942 }
943 else if (is_delimiter(buf[i])) {
944 /* skip over delimiter */
945 if (buf[i] == '>') {
946 i++;
947 if (i < buflen) {
948 if (buf[i] == '>') {
949 type = optype;
950 end = i+1;
951 code = 0;
952 }
953 else
954 code = -1;
955 }
956 }
957 else {
958 type = optype;
959 end = i+1;
960 code = 0;
961 }
962 if (i >= buflen)
963 code = 1;
964 }
965 else {
966 /* First assume that it is an op */
967 type = optype;
968 while (i < buflen) {
969 if (is_white(buf[i]) || is_delimiter(buf[i])) {
970 end = i;
971 code = 0;
972 break;
973 }
974 i++;
975 }
976 if (i >= buflen)
977 code = 1;
978
979 /* try to convert it into a bool */
980 if ((code == 0) && (type == optype)) {
981 if ((end - begin == 4) &&
982 (memcmp(buf+begin, "true", 4) == 0)) {
983 type = booltype;
984 }
985 else if ((end - begin == 5) &&
986 (memcmp(buf+begin, "false", 5) == 0)) {
987 type = booltype;
988 }
989 }
990
991 /* try to convert it into an integer */
992 if ((code == 0) && (type == optype)) {
993 int j;
994 char ch;
995 BOOL isreal = FALSE;
996 BOOL isnum = TRUE;
997 for (j=begin; j<end; j++) {
998 ch = buf[j];
999 if (ch == '.')
1000 isreal = TRUE;
1001 if (!((ch == '-') || (ch == '+') || (ch == '.') ||
1002 isdigit((int)ch)))
1003 isnum = FALSE;
1004 }
1005 if (isnum) {
1006 if (isreal)
1007 type = realtype;
1008 else
1009 type = integertype;
1010 }
1011 }
1012 }
1013
1014 *ttype = type;
1015 *tbegin = begin;
1016 *tend = end;
1017 return code;
1018 }
1019
1020 /*****************************************************************/
1021
/* Release everything owned directly by the scanner structure: the
 * open file, the token buffer, the xref table, the operand stack and
 * the object cache.  The whole structure is then cleared to zero.
 */
static void pdf_scan_finish(PDFSCAN *ps)
{
    if (ps->file != NULL)
        fclose(ps->file);

    /* free(NULL) does nothing, so no tests are needed here */
    free(ps->buf);
    free(ps->xref);
    free(ps->ostack);
    free(ps->objs);

    /* resets every pointer, length and index in one go */
    memset(ps, 0, sizeof(PDFSCAN));
}
1053
/* Open ps->filename for binary reading and store the stream in
 * ps->file.
 * Returns 0 on success, -1 if the file cannot be opened.
 */
static int pdf_scan_open_file(PDFSCAN *ps)
{
    ps->file = csfopen(ps->filename, TEXT("rb"));
    return (ps->file == NULL) ? -1 : 0;
}
1061
/* Prepare a scanner for the file called name: remember the name,
 * open the file and allocate the token buffer, the operand stack and
 * the object cache.
 * Returns 0 on success, -1 if the name is too long or the file
 * cannot be opened, or -2 if memory is exhausted.  On -2 the scanner
 * has been released again with pdf_scan_finish().
 */
static int pdf_scan_init(PDFSCAN *ps, const TCHAR *name)
{
    /* size of the name in bytes, including the terminator */
    int namebytes = (int)(cslen(name)+1) * sizeof(TCHAR);

    if (namebytes > (int)sizeof(ps->filename))
        return -1;
    memcpy(ps->filename, name, namebytes);
    if (pdf_scan_open_file(ps) != 0)
        return -1;

    /* token buffer */
    ps->buflen = 256;
    ps->buf = (char *)malloc(ps->buflen);
    if (ps->buf == NULL)
        goto no_memory;

    /* operand stack */
    ps->ostack_maxlen = 4096;
    ps->ostack_len = 256;
    ps->ostack_idx = 0;         /* empty */
    ps->ostack = (ref *)malloc(ps->ostack_len * sizeof(ref));
    if (ps->ostack == NULL)
        goto no_memory;
    /* make first item on stack invalid */
    ps->ostack[0].type = invalidtype;
    ps->ostack[0].rsize = 0;
    ps->ostack[0].value.voidval = NULL;

    /* object cache */
    ps->objs_maxlen = 1024;
    ps->objs_len = 256;
    ps->objs_count = 0;         /* empty */
    ps->objs = (ref *)malloc(ps->objs_len * sizeof(ref));
    if (ps->objs == NULL)
        goto no_memory;

    ps->pagenum = -1;           /* no cached media info yet */
    return 0;

no_memory:
    pdf_scan_finish(ps);
    return -2;
}
1103
/* Move the file position and discard the buffered data.
 * PDFSEEK_SET seeks to the absolute position offset.
 * PDFSEEK_CUR seeks to offset relative to the end of the current
 * token.
 * PDFSEEK_END seeks to the end of the file; offset is ignored.
 * ps->offset is updated to the new position.
 * Returns the result of fseek(), or -1 for an unknown whence.
 */
static int pdf_scan_seek(PDFSCAN *ps, long offset, PDFSEEK whence)
{
    int status = -1;

    if (whence == PDFSEEK_END) {
        status = fseek(ps->file, 0, SEEK_END);
        ps->begin = ps->end = ps->len = 0;
        ps->offset = ftell(ps->file);
    }
    else if ((whence == PDFSEEK_CUR) || (whence == PDFSEEK_SET)) {
        /* a relative seek is converted to an absolute one before
         * the buffer state it depends on is reset
         */
        if (whence == PDFSEEK_CUR)
            offset = ps->offset + ps->end + offset;
        ps->begin = ps->end = ps->len = 0;
        status = fseek(ps->file, offset, SEEK_SET);
        ps->offset = offset;
    }
    return status;
}
1123
1124 /* Read next token from PDF file */
1125 /* Return 0 if OK, or -1 if EOF, -2 if error */
1126 /* Set *token_type to token type */
pdf_scan_next_token(PDFSCAN * ps)1127 static int pdf_scan_next_token(PDFSCAN *ps)
1128 {
1129 int code = 0;
1130 int count;
1131 rtype type=invalidtype;
1132 int begin=0, end=0;
1133
1134 do {
1135 if ((code == 1) && ps->end) {
1136 /* move characters to front of buffer */
1137 if (ps->len - ps->end)
1138 memmove(ps->buf, ps->buf+ps->end, ps->len - ps->end);
1139 ps->offset += ps->end;
1140 ps->len = ps->len - ps->end;
1141 ps->begin = 0;
1142 ps->end = 0;
1143 }
1144
1145 if ((code == 1) && (ps->len >= ps->buflen)) {
1146 /* increase buffer size */
1147 char *newbuf;
1148 int newbuflen = 2 * ps->buflen;
1149 newbuf = (char *)malloc(newbuflen);
1150 if (newbuf) {
1151 memcpy(newbuf, ps->buf, ps->buflen);
1152 free(ps->buf);
1153 ps->buf = newbuf;
1154 ps->buflen = newbuflen;
1155 }
1156 else {
1157 pdf_scan_msgf(ps, "Out of memory in pdf_scan_next_token\n");
1158 pdf_scan_msgf(ps, "Tried to realloc %d to %d\n",
1159 ps->buflen, newbuflen);
1160 code = -2;
1161 break;
1162 }
1163 }
1164
1165 if ((code == 1) || (ps->len == 0)) {
1166 count = (int)fread(ps->buf+ps->len, 1, ps->buflen-ps->len,
1167 ps->file);
1168 if (count == 0) {
1169 pdf_scan_msgf(ps, "EOF in pdf_scan_next_token\n");
1170 code = -1;
1171 break;
1172 }
1173 ps->len += count;
1174 }
1175
1176 while (ps->instream) {
1177 /* We are in a stream. Keep reading until we find
1178 * the endstream. This isn't robust. It can be fooled
1179 * by "endstream" occuring within a stream.
1180 */
1181 while ((ps->end < ps->len) && (ps->buf[ps->end] != 'e'))
1182 ps->end++;
1183 /* look for endstream */
1184 if (ps->end + 9 >= ps->len) {
1185 code = 1; /* need more */
1186 break;
1187 }
1188 if (memcmp(ps->buf+ps->end, "endstream", 9) == 0)
1189 ps->instream = FALSE;
1190 else
1191 ps->end++;
1192 }
1193 if (!ps->instream)
1194 code = pdf_scan_token(ps->buf+ps->end, ps->len - ps->end,
1195 &type, &begin, &end);
1196 } while (code == 1);
1197
1198
1199 if (code == 0) {
1200 /* got a token */
1201 ps->begin = ps->end + begin;
1202 ps->end = ps->end + end;
1203 ps->token_type = type;
1204
1205 if ((type == optype) && (ps->end-ps->begin == 6) &&
1206 (memcmp(ps->buf+ps->begin, "stream", 6) == 0))
1207 ps->instream = TRUE;
1208 }
1209
1210 return code;
1211 }
1212
1213 /*****************************************************************/
/* Reading %%EOF, xref, trailer */
1215
/* Find the start of the last line in the first len bytes of str.
 * Any end-of-line characters at the very end are skipped first, then
 * the search runs backwards to the preceding end-of-line character.
 * Returns the index of the first character after that end-of-line,
 * or -1 if no earlier end-of-line exists or len is not positive.
 */
static int
previous_line(const char *str, int len)
{
    int i;

    /* nothing to search; also avoids reading before the buffer */
    if (len <= 0)
        return -1;

    i = len - 1;
    /* first skip over EOL */
    while ((i > 0) && ((str[i] == '\r') || (str[i] == '\n')))
        i--;
    /* then back over the line itself */
    while ((i > 0) && (str[i] != '\r') && (str[i] != '\n'))
        i--;
    if ((str[i] != '\r') && (str[i] != '\n'))
        return -1;      /* didn't find a line */
    return i + 1;
}
1229
1230 static int
pdf_scan_find_xref(PDFSCAN * ps)1231 pdf_scan_find_xref(PDFSCAN *ps)
1232 {
1233 char buf[4096];
1234 int i, j;
1235 int code = -1;
1236 int count;
1237 pdf_scan_seek(ps, 0, PDFSEEK_END);
1238 count = min((int)sizeof(buf), ps->offset);
1239 pdf_scan_seek(ps, -count, PDFSEEK_CUR);
1240 count = (int)fread(buf, 1, sizeof(buf), ps->file);
1241 pdf_scan_seek(ps, 0, PDFSEEK_SET);
1242 if (count == 0)
1243 return -1;
1244 i = count - 5;
1245 while (i > 0) {
1246 /* Find %%EOF */
1247 if (memcmp(buf+i, "%%EOF", 5) == 0) {
1248 code = 0;
1249 break;
1250 }
1251 i--;
1252 }
1253 if (i == 0) {
1254 pdf_scan_msgf(ps, "Failed to find %%EOF\n");
1255 code = -1;
1256 }
1257 if (code == 0) {
1258 /* Look for xref table offset */
1259 j = previous_line(buf, i);
1260 if (j >= 0)
1261 ps->xref_offset = atol(buf+j);
1262 else
1263 code = -1;
1264 i = j;
1265 if (ps->xref_offset == 0)
1266 code = -1;
1267 if (code != 0)
1268 pdf_scan_msgf(ps, "Failed to find cross reference table\n");
1269 }
1270
1271 if (code == 0) {
1272 /* Look for "startxref" */
1273 j = previous_line(buf, i);
1274 if (j >= 0) {
1275 if (memcmp(buf+j, "startxref", 9) != 0)
1276 code = -1;
1277 }
1278 else {
1279 code = -1;
1280 }
1281 if (code != 0)
1282 pdf_scan_msgf(ps, "Failed to find startxref\n");
1283 }
1284 return code;
1285 }
1286
1287 /* Read a cross reference table */
1288 /* This is called for each cross reference table */
1289 static int
pdf_scan_read_xref(PDFSCAN * ps,unsigned long xref_offset)1290 pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset)
1291 {
1292 int code;
1293 int i;
1294 int first = 0;
1295 int count = 0;
1296 unsigned long prev = 0;
1297 unsigned long offset = 0;
1298 int generation = 0;
1299 BOOL used = FALSE;
1300 pdf_scan_seek(ps, xref_offset, PDFSEEK_SET);
1301 code = pdf_scan_next_token(ps);
1302 if (code == 0)
1303 code = op_check(ps, "xref");
1304 while (code == 0) {
1305 code = pdf_scan_next_token(ps);
1306 if ((code == 0) && is_optoken(ps, "trailer"))
1307 break; /* finished this xref table */
1308 if (code == 0) {
1309 first = atoi(ps->buf + ps->begin);
1310 code = pdf_scan_next_token(ps);
1311 }
1312 if (code == 0) {
1313 count = atoi(ps->buf + ps->begin);
1314 }
1315 if (code == 0) {
1316 /* make sure there is enough space in the table */
1317 if (first + count > ps->xref_len) {
1318 int len = (first + count) * sizeof(PDFXREF);
1319 PDFXREF *newxref = (PDFXREF *)malloc(len);
1320 if (newxref) {
1321 memset(newxref, 0, len);
1322 memcpy(newxref, ps->xref, ps->xref_len * sizeof(PDFXREF));
1323 free(ps->xref);
1324 ps->xref = newxref;
1325 ps->xref_len = first + count;
1326 }
1327 else {
1328 pdf_scan_msgf(ps, "pdf_scan_read_xref: out of memory\n");
1329 code = -2;
1330 break;
1331 }
1332 }
1333 }
1334 for (i=first; i<first+count; i++) {
1335 code = pdf_scan_next_token(ps);
1336 if (code == 0) {
1337 offset = atol(ps->buf+ps->begin);
1338 code = pdf_scan_next_token(ps);
1339 }
1340 if (code == 0) {
1341 generation = atoi(ps->buf+ps->begin);
1342 code = pdf_scan_next_token(ps);
1343 }
1344 if (code == 0) {
1345 if (is_optoken(ps, "n"))
1346 used = TRUE;
1347 else if (is_optoken(ps, "f"))
1348 used = FALSE;
1349 else
1350 code = -1;
1351 }
1352 /* We don't deal correctly with generation.
1353 * We assume that the first xref table that marks an
1354 * object as used is the definitive reference.
1355 */
1356 if (code == 0) {
1357 if (!(ps->xref[i].used)) {
1358 ps->xref[i].offset = offset;
1359 ps->xref[i].generation = generation;
1360 ps->xref[i].used = used;
1361 }
1362 }
1363 }
1364 }
1365
1366 if (code == 0) {
1367 code = pdf_scan_read_trailer(ps, &prev);
1368 if ((code == 0) && prev && prev != ps->xref_offset) {
1369 /* read older xref and trailer */
1370 code = pdf_scan_read_xref(ps, prev);
1371 }
1372 }
1373
1374 return code;
1375 }
1376
1377 /* Read a trailer */
1378 static int
pdf_scan_read_trailer(PDFSCAN * ps,unsigned long * prev)1379 pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev)
1380 {
1381 int code = 0;
1382 ref p;
1383 code = pdf_scan_next_token(ps);
1384 if ((code == 0) && (ps->token_type != marktype))
1385 code = -1;
1386 push_token(ps);
1387 while (code == 0) {
1388 code = pdf_scan_next_token(ps);
1389 if (code != 0)
1390 break;
1391 if (is_optoken(ps, "startxref")) {
1392 if (ps->root == 0) {
1393 p = dict_get(ps, "Root");
1394 if (p.type == objtype)
1395 ps->root = p.value.objval;
1396 else {
1397 pdf_scan_msgf(ps,
1398 "trailer /Root requires indirect reference\n");
1399 code = -1;
1400 }
1401 }
1402 p = dict_get(ps, "Prev");
1403 if (p.type == integertype)
1404 *prev = p.value.intval;
1405 else if (p.type != invalidtype) {
1406 code = -1;
1407 pdf_scan_msgf(ps, "trailer /Prev requires integer\n");
1408 }
1409 break;
1410 }
1411 if (process_op(ps) != 0)
1412 push_token(ps);
1413 }
1414 if (code != 0)
1415 pdf_scan_msgf(ps, "Error reading trailer\n");
1416 return code;
1417 }
1418
1419
/* Seek to object objnum and read its "<objnum> <generation> obj" header.
 *
 * objnum is looked up in ps->xref.  Object numbers come from the PDF
 * file, so anything that is not a valid index of a used entry is
 * rejected before the table is touched.
 *
 * Returns 0 when the header was read and names objnum, non-zero
 * otherwise.
 */
static int pdf_scan_read_object_start(PDFSCAN *ps, int objnum)
{
    int code = 0;
    int value = 0;
    if (objnum == 0) {
        pdf_scan_msgf(ps, "Object 0 is always unused\n");
        return -1;
    }
    if (objnum < 0) {
        /* a negative number would index before the start of ps->xref */
        pdf_scan_msgf(ps, "Object reference %d is invalid\n", objnum);
        return -1;
    }
    if (objnum >= ps->xref_len) {
        pdf_scan_msgf(ps, "Object reference %d doesn't exist. There are only %d objects\n", objnum, ps->xref_len);
        return -1;
    }
    if (!ps->xref[objnum].used) {
        pdf_scan_msgf(ps, "Object %d is unused\n", objnum);
        return -1;
    }
    pdf_scan_seek(ps, ps->xref[objnum].offset, PDFSEEK_SET);

    code = pdf_scan_next_token(ps);             /* object number */
    if (code == 0)
        code = type_check(ps, integertype);
    if (code == 0) {
        value = atoi(ps->buf+ps->begin);        /* object number */
        code = pdf_scan_next_token(ps);         /* generation */
    }
    if (code == 0)
        code = type_check(ps, integertype);
    if (code == 0)
        code = pdf_scan_next_token(ps);         /* obj */
    if (code == 0)
        code = op_check(ps, "obj");

    /* value is still 0 if the object number could not be read, so
     * this also reports that case.
     */
    if (value != objnum) {
        pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
        return -1;
    }
    return code;
}
1458
1459 /*****************************************************************/
1460
1461 /* Read an object, and leave it on the stack */
1462 static int
pdf_scan_read_object(PDFSCAN * ps,int objnum)1463 pdf_scan_read_object(PDFSCAN *ps, int objnum)
1464 {
1465 int code;
1466 ref objref = obj_find(ps, objnum);
1467
1468 if (objref.type != invalidtype) {
1469 /* found in cache */
1470 push_stack(ps, objref);
1471 return 0;
1472 }
1473
1474 code = pdf_scan_read_object_start(ps, objnum);
1475 if (code) {
1476 pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
1477 return -1;
1478 }
1479
1480 code = pdf_scan_next_token(ps);
1481 if ((code == 0) && (ps->token_type != marktype))
1482 code = -1;
1483 push_token(ps);
1484 while (code == 0) {
1485 code = pdf_scan_next_token(ps);
1486 if (code != 0)
1487 break;
1488 if (is_optoken(ps, "endobj")) {
1489 obj_add(ps, objnum, top_stack(ps));
1490 break;
1491 }
1492 if (process_op(ps) != 0)
1493 push_token(ps);
1494 }
1495 return code;
1496 }
1497
1498 /*****************************************************************/
1499
1500 /* find the object number for a page */
1501 /* Return <= 0 if failure, or object number */
1502 /* First page is 0 */
pdf_scan_find_page(PDFSCAN * ps,int pagenum)1503 static int pdf_scan_find_page(PDFSCAN *ps, int pagenum)
1504 {
1505 int code;
1506 ref kids;
1507 ref r;
1508 int pageobj = 0;
1509 int count_base = 0;
1510 int count;
1511 ref *pref;
1512 int i;
1513 int inext;
1514
1515 if (pagenum >= ps->page_count) {
1516 pdf_scan_msgf(ps, "Not that many pages\n");
1517 return -1;
1518 }
1519 code = pdf_scan_read_object(ps, ps->pages);
1520 if (code) {
1521 pdf_scan_msgf(ps, "Didn't find Pages object\n");
1522 return -1;
1523 }
1524 /* iterate through Kids, looking for the one that includes this page */
1525 kids = dict_get(ps, "Kids");
1526 if (kids.type != arraytype) {
1527 pdf_scan_msgf(ps, "/Pages object %d must contain /Kids array\n",
1528 ps->pages);
1529 return -1;
1530 }
1531 pop_stack(ps); /* First Pages */
1532 for (i = 0; (i < kids.rsize) && (code == 0); i=inext) {
1533 inext = i+1;
1534 pref = &kids.value.arrayval[i];
1535 if (pref->type == objtype)
1536 code = pdf_scan_read_object(ps, pref->value.objval);
1537 if (code == 0) {
1538 r = dict_get(ps, "Type");
1539 if (nameref_equals(&r, "Page")) {
1540 if (count_base + i == pagenum) {
1541 /* this is it */
1542 pageobj = pref->value.objval;
1543 pop_stack(ps); /* the wanted page */
1544 break;
1545 }
1546 }
1547 else if (nameref_equals(&r, "Pages")) {
1548 r = dict_get(ps, "Count");
1549 if (r.type == integertype) {
1550 count = r.value.intval;
1551 if (pagenum < count_base + count) {
1552 /* It's under this child */
1553 inext = 0;
1554 pop_stack(ps); /* The old /Pages */
1555 code = pdf_scan_read_object(ps, pref->value.objval);
1556 if (code == 0) {
1557 kids = dict_get(ps, "Kids");
1558 if (kids.type != arraytype) {
1559 pdf_scan_msgf(ps,
1560 "/Pages object %d must contain /Kids array\n",
1561 pref->value.objval);
1562 code = -1;
1563 }
1564 }
1565 }
1566 else {
1567 count_base += count;
1568 }
1569 }
1570 else {
1571 pdf_scan_msgf(ps, "/Pages /Count must be integer\n");
1572 code = -1;
1573 }
1574 }
1575 else {
1576 pdf_scan_msgf(ps,
1577 "pdf_scan_find_page: object %d isn't Pages or Page\n",
1578 pref->value.objval);
1579 code = -1;
1580 }
1581 pop_stack(ps);
1582 }
1583 }
1584
1585 if (pageobj <= 0) {
1586 pdf_scan_msgf(ps, "Failed to find page %d\n", pagenum+1);
1587 code = -1;
1588 }
1589
1590 if (code)
1591 return -1;
1592
1593 /* Don't clean up, since we will use the cached objects
1594 * when extracting the page media.
1595 */
1596
1597 return pageobj;
1598 }
1599
1600
1601 static int
pdf_scan_read_page_count(PDFSCAN * ps)1602 pdf_scan_read_page_count(PDFSCAN *ps)
1603 {
1604 int code;
1605 ref p;
1606 code = pdf_scan_read_object(ps, ps->pages);
1607 if (code) {
1608 pdf_scan_msgf(ps, "Didn't find Pages object\n");
1609 return -1;
1610 }
1611
1612 p = dict_get(ps, "Type");
1613 if (!nameref_equals(&p, "Pages")) {
1614 pdf_scan_msgf(ps, "Pages object didn't have /Type /Pages\n");
1615 return -1;
1616 }
1617 p = dict_get(ps, "Count");
1618 if (p.type != integertype) {
1619 pdf_scan_msgf(ps, "Pages object didn't integer /Count\n");
1620 return -1;
1621 }
1622 ps->page_count = p.value.intval;
1623
1624 return code;
1625 }
1626
/* Store the numeric value of r in *f.
 * Real values are copied and integer values are converted to float.
 * Returns 0 on success, or -1 (leaving *f untouched) if r is neither
 * a real nor an integer.
 */
static int convert_float(ref r, float *f)
{
    switch (r.type) {
        case realtype:
            *f = r.value.realval;
            return 0;
        case integertype:
            *f = (float)r.value.intval;
            return 0;
        default:
            break;
    }
    return -1;
}
1637
1638 static int
pdf_scan_read_bbox(PDFBBOX * box,ref array)1639 pdf_scan_read_bbox(PDFBBOX *box, ref array)
1640 {
1641 int code = 0;
1642 if (array.type != arraytype)
1643 code = -1;
1644 if (array.rsize != 4)
1645 code = -1;
1646 if (code == 0)
1647 code = convert_float(array.value.arrayval[0], &box->llx);
1648 if (code == 0)
1649 code = convert_float(array.value.arrayval[1], &box->lly);
1650 if (code == 0)
1651 code = convert_float(array.value.arrayval[2], &box->urx);
1652 if (code == 0)
1653 code = convert_float(array.value.arrayval[3], &box->ury);
1654 return code;
1655 }
1656
1657 /* Read catalog and leave on stack */
1658 static int
pdf_scan_read_catalog(PDFSCAN * ps)1659 pdf_scan_read_catalog(PDFSCAN *ps)
1660 {
1661 int code;
1662 ref p;
1663 /* Read root object, making sure it is /Type /Catalog,
1664 * and that /Pages is an indirect reference
1665 */
1666 code = pdf_scan_read_object(ps, ps->root);
1667 if (code) {
1668 pdf_scan_msgf(ps, "Didn't find Root object\n");
1669 return -1;
1670 }
1671
1672 p = dict_get(ps, "Type");
1673 if (!nameref_equals(&p, "Catalog")) {
1674 pdf_scan_msgf(ps, "Root object didn't have /Type /Catalog\n");
1675 return -1;
1676 }
1677 p = dict_get(ps, "Pages");
1678 if (p.type != objtype) {
1679 pdf_scan_msgf(ps, "Root object didn't indirect reference to /Pages\n");
1680 return -1;
1681 }
1682 ps->pages = p.value.intval;
1683 return 0;
1684 }
1685
1686 /*****************************************************************/
1687 /* public functions */
1688
1689
1690 void
pdf_scan_close(PDFSCAN * ps)1691 pdf_scan_close(PDFSCAN *ps)
1692 {
1693 pdf_scan_cleanup(ps);
1694 pdf_scan_finish(ps);
1695 free(ps);
1696 }
1697
1698
1699 PDFSCAN *
pdf_scan_open(const TCHAR * filename,void * handle,int (* fn)(void * handle,const char * ptr,int len))1700 pdf_scan_open(const TCHAR *filename, void *handle,
1701 int (*fn)(void *handle, const char *ptr, int len))
1702 {
1703 int code;
1704 int rotate;
1705 PDFBBOX mediabox, cropbox;
1706 PDFSCAN *ps = (PDFSCAN *)malloc(sizeof(PDFSCAN));
1707 if (ps == NULL)
1708 return NULL;
1709 memset(ps, 0, sizeof(PDFSCAN));
1710 ps->handle = handle;
1711 ps->print_fn = fn;
1712 code = pdf_scan_init(ps, filename);
1713 if (code == -1)
1714 pdf_scan_msgf(ps, "Couldn't open PDF file\n");
1715 else if (code != 0)
1716 pdf_scan_msgf(ps, "Error initialising PDF scanner\n");
1717
1718 if (code == 0)
1719 code = pdf_scan_find_xref(ps);
1720 if (code == 0)
1721 code = pdf_scan_read_xref(ps, ps->xref_offset);
1722 if (code == 0)
1723 code = pdf_scan_read_catalog(ps);
1724 if (code == 0)
1725 code = pdf_scan_read_page_count(ps);
1726 if (code == 0)
1727 code = pdf_scan_page_media(ps, 0, &rotate, &mediabox, &cropbox);
1728
1729 pdf_scan_cleanup(ps);
1730 if (code != 0) {
1731 pdf_scan_close(ps);
1732 ps = NULL;
1733 }
1734 return ps;
1735 }
1736
1737 int
pdf_scan_page_count(PDFSCAN * ps)1738 pdf_scan_page_count(PDFSCAN *ps)
1739 {
1740 if (ps == NULL)
1741 return 0;
1742 return ps->page_count;
1743 }
1744
1745 int
pdf_scan_page_media(PDFSCAN * ps,int pagenum,int * rotate,PDFBBOX * mediabox,PDFBBOX * cropbox)1746 pdf_scan_page_media(PDFSCAN *ps, int pagenum, int *rotate,
1747 PDFBBOX *mediabox, PDFBBOX *cropbox)
1748 {
1749 BOOL found_rotate = FALSE;
1750 BOOL found_mediabox = FALSE;
1751 BOOL found_cropbox = FALSE;
1752 BOOL has_parent = TRUE;
1753 ref p, objref;
1754 int objnum;
1755
1756 if (ps == NULL)
1757 return -1;
1758
1759 if (pagenum == ps->pagenum) {
1760 /* Used cached values */
1761 *rotate = ps->rotate;
1762 *mediabox = ps->mediabox;
1763 *cropbox = ps->cropbox;
1764 return 0;
1765 }
1766
1767 if (ps->file == NULL) {
1768 if (pdf_scan_open_file(ps) != 0)
1769 return -1;
1770 }
1771 objnum = pdf_scan_find_page(ps, pagenum);
1772 if (objnum <= 0) {
1773 pdf_scan_cleanup(ps);
1774 return -1;
1775 }
1776 if (pdf_scan_read_object(ps, objnum) < 0) {
1777 pdf_scan_cleanup(ps);
1778 return -1;
1779 }
1780
1781 while (has_parent) {
1782 if (!found_rotate) {
1783 p = dict_get(ps, "Rotate");
1784 if (p.type == integertype) {
1785 *rotate = p.value.intval;
1786 found_rotate = TRUE;
1787 }
1788 }
1789 if (!found_mediabox) {
1790 p = dict_get(ps, "MediaBox");
1791 if (pdf_scan_read_bbox(mediabox, p) == 0)
1792 found_mediabox = TRUE;
1793 }
1794 if (!found_cropbox) {
1795 p = dict_get(ps, "CropBox");
1796 if (pdf_scan_read_bbox(cropbox, p) == 0)
1797 found_cropbox = TRUE;
1798 }
1799 if (found_rotate && found_mediabox && found_cropbox)
1800 break;
1801
1802 p = dict_get(ps, "Parent");
1803 if (p.type == objtype) {
1804 objref = pop_stack(ps);
1805 if (pdf_scan_read_object(ps, p.value.objval) < 0) {
1806 push_stack(ps, objref);
1807 has_parent = FALSE;
1808 }
1809 }
1810 else
1811 has_parent = FALSE;
1812 }
1813 pop_stack(ps);
1814 if (!found_cropbox) {
1815 *cropbox = *mediabox;
1816 found_cropbox = TRUE;
1817 }
1818 if (!found_rotate) {
1819 *rotate = 0;
1820 found_rotate = TRUE;
1821 }
1822
1823 pdf_scan_cleanup(ps);
1824
1825 if (found_rotate && found_mediabox && found_cropbox) {
1826 /* cache these values */
1827 ps->pagenum = pagenum;
1828 ps->rotate = *rotate;
1829 ps->mediabox = *mediabox;
1830 ps->cropbox = *cropbox;
1831 return 0;
1832 }
1833
1834 return -1;
1835 }
1836
1837 /*****************************************************************/
1838
1839 #ifdef DEMO_PDFSCAN
1840
/* Print callback for the demo program: writes len bytes starting at
 * ptr to stdout.  handle is not used.
 * Always returns len; the result of fwrite is not checked.
 */
int test_print_fn(void *handle, const char *ptr, int len)
{
    (void)handle;
    fwrite(ptr, sizeof(char), len, stdout);
    return len;
}
1846
main(int argc,char * argv[])1847 int main(int argc, char *argv[])
1848 {
1849 PDFSCAN *ps;
1850 int i, count;
1851 int code;
1852 PDFBBOX mediabox, cropbox;
1853 int rotate;
1854
1855 if (argc < 2) {
1856 fprintf(stdout, "Usage: cpdfscan filename\n");
1857 return 1;
1858 }
1859
1860 ps = pdf_scan_open(argv[1], NULL, test_print_fn);
1861 if (ps) {
1862 count = pdf_scan_page_count(ps);
1863 pdf_scan_msgf(ps, "Page count is %d\n", count);
1864 for (i=0; i<count; i++) {
1865 code = pdf_scan_page_media(ps, i, &rotate, &mediabox, &cropbox);
1866 if (code == 0) {
1867 fprintf(stdout, "Page %d /Rotate %d ", i+1, rotate);
1868 fprintf(stdout, "/MediaBox [%g %g %g %g] /CropBox [%g %g %g %g]\n",
1869 mediabox.llx, mediabox.lly, mediabox.urx, mediabox.ury,
1870 cropbox.llx, cropbox.lly, cropbox.urx, cropbox.ury);
1871 }
1872 else
1873 fprintf(stdout, "Page %d media unknown\n", i+1);
1874 }
1875 pdf_scan_close(ps);
1876 }
1877 return 0;
1878 }
1879
1880 #endif
1881