1 /* Copyright (C) 2002-2005 Ghostgum Software Pty Ltd.  All rights reserved.
2 
3   This software is provided AS-IS with no warranty, either express or
4   implied.
5 
6   This software is distributed under licence and may not be copied,
7   modified or distributed except as expressly authorised under the terms
8   of the licence contained in the file LICENCE in this distribution.
9 */
10 
11 /* $Id: cpdfscan.c,v 1.7 2005/06/10 09:39:24 ghostgum Exp $ */
12 /* PDF scanner */
13 
14 /* This is a rudimentary PDF scanner, intended to get
15  * the page count, and for each page the Rotate, MediaBox
16  * and CropBox.
17  */
18 
19 #ifdef DEMO_PDFSCAN
20 # include <windows.h>
21 # include <stdio.h>
22 # include <stdarg.h>
23 # include <string.h>
24 # include <ctype.h>
25 # ifdef _MSC_VER
26 #  define vsnprintf _vsnprintf
27 # endif
28 # define csfopen fopen
29 # define cslen strlen
30 #else
31 # include "common.h"
32 # include <ctype.h>
33 #endif
34 
35 #include "cpdfscan.h"
36 
37 
/* Limitations.
 *
 * We currently load the entire xref table.  To minimise memory
 * we could instead keep a list of xref blocks, and do random
 * access within those.
 *
 * Memory management is very simple.  We just keep a linked
 * list of allocated blocks for composite objects.
 * We empty the stack, and free all PDF objects and composite
 * objects before returning to the caller.
 * We don't bother doing garbage collection.
 */
50 
51 
/* Heap blocks backing composite objects (name, string, array, dict)
 * are tracked on a singly linked list, one node per block, so that
 * they can all be released in one pass.
 */
typedef struct PDFMEM_s PDFMEM;
struct PDFMEM_s {
    void *ptr;      /* the allocated block */
    int len;        /* size of the block in bytes */
    PDFMEM *next;   /* next node in the list, NULL for the last one */
};
61 
/* Types known to the token scanner and to object references.
 * The numeric values index rtype_string[], so keep the two in step.
 */
typedef enum rtype_e {
    invalidtype = 0,
    marktype    = 1,
    nulltype    = 2,
    booltype    = 3,    /* value held in boolval */
    integertype = 4,    /* value held in intval */
    realtype    = 5,    /* value held in realval */
    nametype    = 6,    /* value held in nameval */
    stringtype  = 7,    /* value held in strval */
    arraytype   = 8,    /* value held in arrayval */
    dicttype    = 9,    /* value held in dictval */
    optype      = 10,   /* value held in opval */
    streamtype  = 11,   /* value held in streamval */
    objtype     = 12,   /* value held in objval */
    commenttype = 13
} rtype;
79 
/* Printable name of each rtype value, indexed by the enum value. */
const char *rtype_string[] = {
    "invalidtype",
    "marktype",
    "nulltype",
    "booltype",
    "integertype",
    "realtype",
    "nametype",
    "stringtype",
    "arraytype",
    "dicttype",
    "optype",
    "streamtype",
    "objtype",
    "commenttype"
};
85 
86 /* A reference contains a simple object, or a pointer to
87  * a composite object.
88  */
89 typedef struct ref_s ref;
90 struct ref_s {
91     rtype type;
92     int rsize;
93     union value_u {
94 	/* simple */
95         void *voidval;
96 	BOOL boolval;
97 	int intval;
98 	float realval;
99 	/* composite */
100 	char *nameval;
101 	char *strval;
102 	ref *arrayval;
103 	ref *dictval;
104 	char *opval;
105 	/* simple */
106 	unsigned long streamval;
107 	int objval;
108     } value;
109 };
110 
111 /* Cross reference table entry */
112 typedef struct PDFXREF_s {
113     unsigned long offset;
114     int generation;
115     BOOL used;
116 } PDFXREF;
117 
118 struct PDFSCAN_s {
119     void *handle;
120     int (*print_fn)(void *handle, const char *ptr, int len);
121     TCHAR filename[1024];
122     FILE *file;
123     char *buf;
124     int buflen;		/* length of allocated buf */
125     int len;		/* #bytes currently in buf */
126     int offset;		/* file offset to start of buf */
127     int begin;		/* offset in buf to start of token */
128     int end;		/* offset in buf to end of token */
129     rtype token_type;	/* token type */
130     BOOL instream;	/* In a stream, looking for endstream */
131     unsigned long xref_offset;	/* offset to xref table */
132     PDFXREF *xref;
133     int xref_len;
134 
135     /* Object numbers obtained during pdf_scan_open() */
136     int root;		/* root object reference */
137     int info;		/* document info dicionary reference */
138     int pages;		/* Pages dictionary reference */
139     int page_count;	/* number of pages */
140 
141     /* Cached page media */
142     int pagenum;
143     int rotate;
144     PDFBBOX mediabox;
145     PDFBBOX cropbox;
146 
147     /* memory allocation */
148     PDFMEM *memory_head;
149     PDFMEM *memory_tail;
150 
151     /* operand stack */
152     ref *ostack;
153     int ostack_idx;	/* index to top of ostack */
154     int ostack_len;	/* Initially 512 */
155     int ostack_maxlen;	/* maximum depth of ostack */
156 
157     /* objects in memory */
158     /* This contains pairs of integer & reference */
159     ref *objs;
160     int objs_count;	/* count of loaded objects */
161     int objs_len;	/* length of objs */
162     int objs_maxlen;	/* maximum number entries in objs */
163 };
164 
/* Origin selector for pdf_scan_seek(). */
typedef enum PDFSEEK_e {
    PDFSEEK_CUR,    /* relative to the end of the current token */
    PDFSEEK_END,    /* end of file */
    PDFSEEK_SET     /* absolute file offset */
} PDFSEEK;
170 
171 
172 /* Prototypes */
173 static int pdf_scan_next_token(PDFSCAN *ps);
174 static int pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev);
175 static int pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset);
176 
177 static void clear_stack(PDFSCAN *ps);
178 static void clear_objs(PDFSCAN *ps);
179 static void pdf_scan_freeall(PDFSCAN *ps);
180 static void pdf_scan_cleanup(PDFSCAN *ps);
181 static int pdf_scan_open_file(PDFSCAN *ps);
182 
183 
184 /*****************************************************************/
185 /* text message output */
186 
187 static int
pdf_scan_write(PDFSCAN * ps,const char * str,int len)188 pdf_scan_write(PDFSCAN *ps, const char *str, int len)
189 {
190     if (ps != NULL)
191         fwrite(str, 1, len, stdout);
192     else
193 	(*ps->print_fn)(ps->handle, str, len);
194     return len;
195 }
196 
197 static int
pdf_scan_msgf(PDFSCAN * ps,const char * fmt,...)198 pdf_scan_msgf(PDFSCAN *ps, const char *fmt, ...)
199 {
200 va_list args;
201 int count;
202 char buf[2048];
203     va_start(args,fmt);
204     count = vsnprintf(buf, sizeof(buf), fmt, args);
205     pdf_scan_write(ps, buf, count);
206     va_end(args);
207     return count;
208 }
209 
210 /*****************************************************************/
211 /* memory allocation */
212 
213 static void
pdf_scan_cleanup(PDFSCAN * ps)214 pdf_scan_cleanup(PDFSCAN *ps)
215 {
216     if (ps->file)
217 	fclose(ps->file);
218     ps->file = NULL;
219     clear_stack(ps);
220     clear_objs(ps);
221     pdf_scan_freeall(ps);
222 }
223 
/* Allocate a block of len bytes, copy len bytes from ptr into it and
 * append it to the scanner's list of allocated blocks so that
 * pdf_scan_freeall() can release it later.
 *
 * A zero length request succeeds and returns a valid (1 byte) block,
 * so that empty names and strings do not depend on what malloc(0)
 * happens to return.  A negative length is rejected.
 *
 * Returns the new block, or NULL if len is negative or memory is
 * exhausted.  The caller must not free the returned pointer.
 */
static void *pdf_scan_alloc(PDFSCAN *ps, const void *ptr, int len)
{
    void *data;
    PDFMEM *mem;

    if (len < 0)
        return NULL;

    mem = (PDFMEM *)malloc(sizeof(PDFMEM));
    if (mem == NULL)
        return NULL;

    data = malloc((len > 0) ? (size_t)len : 1);
    if (data == NULL) {
        free(mem);
        return NULL;
    }

    mem->ptr = data;
    mem->next = NULL;
    mem->len = len;
    if (len > 0)
        memcpy(data, ptr, (size_t)len);

    /* append to the tail of the list */
    if (ps->memory_tail) {
        ps->memory_tail->next = mem;
        ps->memory_tail = mem;
    }
    else
        ps->memory_head = ps->memory_tail = mem;
    return data;
}
250 
251 /* free all name/string/array/dict memory */
252 static void
pdf_scan_freeall(PDFSCAN * ps)253 pdf_scan_freeall(PDFSCAN *ps)
254 {
255     PDFMEM *memnext;
256     PDFMEM *mem = ps->memory_head;
257     while (mem) {
258 	memnext = mem->next;
259 	free(mem->ptr);
260 	free(mem);
261 	mem = memnext;
262     }
263     ps->memory_head = ps->memory_tail = NULL;
264 }
265 
266 /*****************************************************************/
267 /* Token checks */
268 
is_optoken(PDFSCAN * ps,const char * str)269 static BOOL is_optoken(PDFSCAN *ps, const char *str)
270 {
271     return (ps->token_type == optype) &&
272 	(ps->end-ps->begin == (int)strlen(str)) &&
273 	(memcmp(ps->buf+ps->begin, str, ps->end-ps->begin) == 0);
274 }
275 
276 static int
type_check(PDFSCAN * ps,rtype type)277 type_check(PDFSCAN *ps, rtype type)
278 {
279     if (ps->token_type == type)
280 	return 0;
281 
282     pdf_scan_msgf(ps, "Error at offset %ld.  Expecting %s and found %s\n",
283 	ps->offset + ps->begin,
284 	rtype_string[(int)type],
285 	rtype_string[(int)ps->token_type]);
286     pdf_scan_msgf(ps, "Token is \042");
287     pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
288     pdf_scan_msgf(ps, "\042\n");
289     return -1;
290 }
291 
292 static int
op_check(PDFSCAN * ps,const char * str)293 op_check(PDFSCAN *ps, const char *str)
294 {
295     int code = type_check(ps, optype);
296     if (code)
297 	return code;
298 
299     if (!is_optoken(ps, str)) {
300 	pdf_scan_msgf(ps,
301 	    "Error at offset %ld.  Expecting \042%s\042 and found \042",
302 	    ps->offset + ps->begin, str);
303 	pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
304 	pdf_scan_msgf(ps, "\042\n");
305 	code = -1;
306     }
307     return code;
308 }
309 
310 /*****************************************************************/
311 /* stack */
312 
313 const ref invalidref = {invalidtype, 0, {NULL}};
314 const ref markref = {marktype, 0, {NULL}};
315 
316 /* Push item, return depth of stack */
317 /* >0 is success, <=0 is failure */
push_stack(PDFSCAN * ps,ref r)318 static int push_stack(PDFSCAN *ps, ref r)
319 {
320     int idx;
321     if (ps->ostack_idx + 1 >= ps->ostack_len) {
322 	/* increase stack size */
323 	ref *newstack;
324 	int newlen = ps->ostack_len + 256;
325 	if (newlen > ps->ostack_maxlen) {
326 	    pdf_scan_msgf(ps, "push_stack: stack overflow\n");
327 	    return 0;
328 	}
329 	newstack = (ref *)malloc(newlen * sizeof(ref));
330 	if (newstack == NULL) {
331 	    pdf_scan_msgf(ps, "push_stack: Out of memory\n");
332 	    return 0;
333 	}
334 	memcpy(newstack, ps->ostack, ps->ostack_len * sizeof(ref));
335 	free(ps->ostack);
336 	ps->ostack = newstack;
337 	ps->ostack_len = newlen;
338     }
339     idx = ++(ps->ostack_idx);
340     ps->ostack[idx] = r;
341     return idx;
342 }
343 
pop_stack(PDFSCAN * ps)344 static ref pop_stack(PDFSCAN *ps)
345 {
346     if (ps->ostack_idx <= 0) {
347 	pdf_scan_msgf(ps, "pop_stack: stack underflow\n");
348 	return invalidref;
349     }
350     return ps->ostack[ps->ostack_idx--];
351 }
352 
/* Empty the operand stack.  Index 0 is the "empty" position, so the
 * stack storage itself is left allocated.
 */
static void clear_stack(PDFSCAN *ps)
{
    ps->ostack_idx = 0;
}
357 
index_stack(PDFSCAN * ps,int n)358 static ref index_stack(PDFSCAN *ps, int n)
359 {
360     if (n < 0) {
361 	pdf_scan_msgf(ps, "index_stack: index must not be negative\n");
362 	return invalidref;
363     }
364     if (ps->ostack_idx <= n) {
365 	pdf_scan_msgf(ps, "index_stack: stack isn't that deep\n");
366 	return invalidref;
367     }
368     return ps->ostack[ps->ostack_idx-n];
369 }
370 
top_stack(PDFSCAN * ps)371 static ref top_stack(PDFSCAN *ps)
372 {
373     if (ps->ostack_idx <= 0) {
374 	pdf_scan_msgf(ps, "top_stack: stack is empty\n");
375 	return invalidref;
376     }
377     return ps->ostack[ps->ostack_idx];
378 }
379 
380 /*****************************************************************/
381 /* references */
382 
383 
make_int(int value)384 static ref make_int(int value)
385 {
386     ref r;
387     r.type = integertype;
388     r.rsize = 0;
389     r.value.intval = value;
390     return r;
391 }
392 
make_string(PDFSCAN * ps,const char * str,int len)393 static ref make_string(PDFSCAN *ps, const char *str, int len)
394 {
395     ref r;
396     r.type = stringtype;
397     r.rsize = len;
398     r.value.strval = pdf_scan_alloc(ps, str, len);
399     if (r.value.strval == NULL)
400 	return invalidref;
401     return r;
402 }
403 
make_name(PDFSCAN * ps,const char * str,int len)404 static ref make_name(PDFSCAN *ps, const char *str, int len)
405 {
406     ref r;
407     r.type = nametype;
408     r.rsize = len;
409     r.value.nameval = pdf_scan_alloc(ps, str, len);
410     if (r.value.nameval == NULL)
411 	return invalidref;
412     return r;
413 }
414 
nameref_equals(ref * r,const char * name)415 static BOOL nameref_equals(ref *r, const char *name)
416 {
417     int len = (int)strlen(name);
418     if (r->type != nametype)
419 	return FALSE;
420     if (r->rsize != len)
421 	return FALSE;
422     return (memcmp(r->value.nameval, name, len) == 0);
423 }
424 
425 /* Get a reference from a dictionary */
426 /* Return the result, but don't push it */
dict_get(PDFSCAN * ps,const char * name)427 static ref dict_get(PDFSCAN *ps, const char *name)
428 {
429     int namelen = (int)strlen(name);
430     ref dict = top_stack(ps);
431     ref *r;
432     int dictlen;
433     int i;
434     if (dict.type == invalidtype)
435 	return invalidref;
436     dictlen = dict.rsize * 2;
437     for (i = 0; i<dictlen; i+=2) {
438 	r = &dict.value.dictval[i];
439 	if ((r->rsize == namelen) && (r->type == nametype) &&
440 	    (memcmp(r->value.nameval, name, namelen) ==0))
441 	    return dict.value.dictval[i+1];
442     }
443     return invalidref;
444 }
445 
446 /* convert the items on the stack to an array on the stack */
array_to_mark(PDFSCAN * ps)447 static ref array_to_mark(PDFSCAN *ps)
448 {
449     ref r;
450     ref *array;
451     int n = ps->ostack_idx;
452     int len;
453     while ((n>0) && (ps->ostack[n].type != marktype))
454 	n--;
455     if (n == 0) {
456 	pdf_scan_msgf(ps, "array_to_mark: no mark on stack\n");
457 	return invalidref;
458     }
459     len = ps->ostack_idx - n;
460     r.type = arraytype;
461     r.rsize = len;
462     r.value.arrayval = NULL;
463     if (len) {
464         array = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
465 	if (array)
466 	    r.value.arrayval = array;
467 	else
468 	    return invalidref;
469     }
470     ps->ostack_idx -= len + 1;
471     push_stack(ps, r);
472     return r;
473 }
474 
475 /* convert the items on the stack to a dictionary on the stack */
dict_to_mark(PDFSCAN * ps)476 static ref dict_to_mark(PDFSCAN *ps)
477 {
478     ref r;
479     ref *dict;
480     int n = ps->ostack_idx;
481     int len;
482     while ((n>0) && (ps->ostack[n].type != marktype))
483 	n--;
484     if (n == 0) {
485 	pdf_scan_msgf(ps, "dict_to_mark: no mark on stack\n");
486 	return invalidref;
487     }
488     len = ps->ostack_idx - n;
489     if (len & 1) {
490 	pdf_scan_msgf(ps, "dict_to_mark: must have name/value pairs\n");
491 	return invalidref;
492     }
493     r.type = dicttype;
494     r.rsize = len/2;
495     r.value.arrayval = NULL;
496     if (len) {
497         dict = pdf_scan_alloc(ps, &ps->ostack[n+1], len * sizeof(ref));
498 	if (dict)
499 	    r.value.arrayval = dict;
500 	else
501 	    return invalidref;
502     }
503     ps->ostack_idx -= len + 1;
504     push_stack(ps, r);
505     return r;
506 }
507 
508 /*****************************************************************/
509 
510 /* Push reference from a token */
push_token(PDFSCAN * ps)511 static ref push_token(PDFSCAN *ps)
512 {
513     ref r;
514     int len = ps->end - ps->begin;
515     const char *p = ps->buf + ps->begin;
516     r.type = ps->token_type;
517     r.rsize = 0;
518     r.value.voidval = NULL;
519     switch(r.type) {
520       case invalidtype:
521 	break;
522       case marktype:
523 	break;
524       case nulltype:
525 	break;
526       case booltype:
527 	if ((len == 4) && (memcmp(p, "true", 4)==0))
528 	    r.value.boolval = TRUE;
529 	else if ((len == 5) && (memcmp(p, "true", 5)==0))
530 	    r.value.boolval = FALSE;
531 	else
532 	    r = invalidref;
533 	break;
534       case integertype:
535 	{   char buf[64];
536 	    if (len > (int)sizeof(buf)-1)
537 		r = invalidref;
538 	    else {
539 		memcpy(buf, p, len);
540 		buf[len] = '\0';
541 		r.value.intval = atoi(buf);
542 	    }
543 	}
544 	break;
545       case realtype:
546 	{   char buf[64];
547 	    if (len > (int)sizeof(buf)-1)
548 		r = invalidref;
549 	    else {
550 		memcpy(buf, p, len);
551 		buf[len] = '\0';
552 		r.value.realval = (float)atof(buf);
553 	    }
554 	}
555 	break;
556       case nametype:
557 	r = make_name(ps, p+1, len-1);
558 	break;
559       case stringtype:
560 	r = make_string(ps, p, len);
561 	break;
562       case streamtype:
563       case commenttype:
564       case objtype:
565       case optype:
566       case arraytype:
567       case dicttype:
568 	/* Can't push these from a token */
569 	/* These are made by operators like stream, R, ], >> */
570 	return invalidref;
571       default:
572 	r.type = invalidtype;
573 	break;
574     }
575     push_stack(ps, r);
576     return r;
577 }
578 
579 /* Process known operators */
process_op(PDFSCAN * ps)580 static int process_op(PDFSCAN *ps)
581 {
582    ref r;
583    if (ps->token_type != optype)
584 	return 1;	/* not an op */
585    if (is_optoken(ps, "R")) {
586 	/* convert "n 0 R" to an indirect reference */
587 	ref r1 = index_stack(ps, 1);
588 	r = top_stack(ps);
589 	if ((r.type == integertype) && (r1.type == integertype)) {
590 	    r.type = objtype;
591 	    r.rsize = r.value.intval;
592 	    r.value.intval = r1.value.intval;
593 	    pop_stack(ps);
594 	    pop_stack(ps);
595 	    push_stack(ps, r);
596 	}
597    }
598    else if (is_optoken(ps, "]")) {
599 	array_to_mark(ps);
600    }
601    else if (is_optoken(ps, ">>")) {
602 	dict_to_mark(ps);
603    }
604    else if (is_optoken(ps, "null")) {
605 	r.type = nulltype;
606 	r.rsize = 0;
607 	r.value.voidval = NULL;
608 	push_stack(ps, r);
609    }
610    else if (is_optoken(ps, "obj")) {
611 	pdf_scan_msgf(ps, "ignoring obj token\n");
612 	/* ignore */
613    }
614    else if (is_optoken(ps, "endobj")) {
615 	pdf_scan_msgf(ps, "ignoring endobj token\n");
616 	/* ignore */
617    }
618    else if (is_optoken(ps, "stream")) {
619 	/* stream object contains offset to start of stream */
620 	r.type = streamtype;
621 	r.rsize = 0;
622 	r.value.streamval = ps->offset + ps->end;
623 	push_stack(ps, r);
624 	/* Now skip over stream */
625         pdf_scan_next_token(ps);
626    }
627    else {
628 	pdf_scan_msgf(ps, "process_op: unrecognised operator \042");
629 	pdf_scan_write(ps, ps->buf+ps->begin, ps->end-ps->begin);
630 	pdf_scan_msgf(ps, "\042\n");
631 	return -1;
632    }
633    return 0;
634 }
635 
636 /*****************************************************************/
637 /* Debugging and error messages */
638 
639 #ifdef NOTUSED
640 
641 /* Print a reference, returning number of characters written */
642 static int
print_ref(PDFSCAN * ps,ref * r)643 print_ref(PDFSCAN *ps, ref *r)
644 {
645     int n = 0;
646     switch(r->type) {
647       case invalidtype:
648 	n = pdf_scan_msgf(ps, "--invalid--");
649 	break;
650       case marktype:
651 	n = pdf_scan_msgf(ps, "--mark--");
652 	break;
653       case nulltype:
654 	n = pdf_scan_msgf(ps, "--null--");
655 	break;
656       case booltype:
657 	n = pdf_scan_msgf(ps, "%s", r->value.boolval ? "true" : "false");
658 	break;
659       case integertype:
660 	n = pdf_scan_msgf(ps, "%d", r->value.intval);
661 	break;
662       case realtype:
663 	n = pdf_scan_msgf(ps, "%g", r->value.realval);
664 	break;
665       case nametype:
666 	n = pdf_scan_write(ps, "/", 1);
667 	pdf_scan_write(ps, r->value.nameval, r->rsize);
668 	break;
669       case stringtype:
670 	n = pdf_scan_write(ps, "(", 1);
671 	n += pdf_scan_write(ps, r->value.strval, r->rsize);
672 	n += pdf_scan_write(ps, ")", 1);
673 	break;
674       case streamtype:
675 	n = pdf_scan_msgf(ps, "--stream:%d--", r->value.streamval);
676 	break;
677       case commenttype:
678 	n = pdf_scan_msgf(ps, "--comment--");
679 	break;
680       case objtype:
681 	n = pdf_scan_msgf(ps, "--obj:%d--", r->value.objval);
682 	break;
683       case optype:
684 	n = pdf_scan_msgf(ps, "--op:");
685 	n += pdf_scan_write(ps, r->value.opval, r->rsize);
686 	n += pdf_scan_write(ps, "--", 2);
687 	break;
688       case arraytype:
689 	n = pdf_scan_msgf(ps, "--array:%d--", r->rsize);
690 	break;
691       case dicttype:
692 	n = pdf_scan_msgf(ps, "--dict:%d--", r->rsize);
693 	break;
694       default:
695 	n = pdf_scan_msgf(ps, "--unknown--");
696 	break;
697     }
698     return n;
699 }
700 
701 /* print a reference, expanding array and dict */
702 static int
print_ref_expand(PDFSCAN * ps,ref * r)703 print_ref_expand(PDFSCAN *ps, ref *r)
704 {
705     int i;
706     int n = 0;;
707     if (r->type == arraytype) {
708 	n += pdf_scan_msgf(ps, "[ ");
709 	for (i=0; i<r->rsize; i++) {
710 	    n += print_ref(ps, &r->value.arrayval[i]);
711 	    n += pdf_scan_msgf(ps, " ");
712 	}
713 	n += pdf_scan_msgf(ps, "]");
714     }
715     else if (r->type == dicttype) {
716 	n += pdf_scan_msgf(ps, "<< ");
717 	for (i=0; i<r->rsize; i++) {
718 	    n += print_ref(ps, &r->value.dictval[i+i]);
719 	    n += pdf_scan_msgf(ps, " ");
720 	    n += print_ref(ps, &r->value.dictval[i+i+1]);
721 	    n += pdf_scan_msgf(ps, " ");
722 	}
723 	n += pdf_scan_msgf(ps, ">>");
724     }
725     else
726 	n += print_ref(ps, r);
727     return n;
728 }
729 
730 static void
print_stack(PDFSCAN * ps)731 print_stack(PDFSCAN *ps)
732 {
733     int i, n=ps->ostack_idx;
734     int col = 0;
735     pdf_scan_msgf(ps, "Stack: ");
736     for (i=1; i<=n; i++) {
737 	col += print_ref(ps, &ps->ostack[i]);
738 	if (col > 70) {
739             pdf_scan_msgf(ps, "\n");
740 	    col = 0;
741 	}
742 	else
743             col += pdf_scan_msgf(ps, " ");
744     }
745     pdf_scan_msgf(ps, "\n");
746 }
747 
748 static void
print_stack_expand(PDFSCAN * ps)749 print_stack_expand(PDFSCAN *ps)
750 {
751     int i, n=ps->ostack_idx;
752     pdf_scan_msgf(ps, "Stack:\n");
753     for (i=1; i<=n; i++) {
754         pdf_scan_msgf(ps, "%2d: ", i);
755 	print_ref_expand(ps, &ps->ostack[i]);
756         pdf_scan_msgf(ps, "\n");
757     }
758 }
759 
/* Report how many composite object blocks are allocated and their
 * total size, counting the list node as well as the data.
 */
static void pdf_scan_print_allocated(PDFSCAN *ps)
{
    PDFMEM *node;
    int blocks = 0;
    int bytes = 0;

    for (node = ps->memory_head; node; node = node->next) {
        bytes += sizeof(PDFMEM);
        bytes += node->len;
        blocks++;
    }
    pdf_scan_msgf(ps, "Allocated memory %d bytes in %d objects\n",
        bytes, blocks);
}
774 
775 #endif
776 
777 /*****************************************************************/
778 /* object reading and cache */
779 
/* Remember objref as the value of object number objnum.
 * The cache holds pairs: an integer reference with the object number
 * followed by the object itself.  It grows by 256 entries when full,
 * up to objs_maxlen.
 * Returns the new number of entries in use, which is > 0 on success.
 * Returns 0 if the cache may not grow further or memory is exhausted.
 */
static int obj_add(PDFSCAN *ps, int objnum, ref objref)
{
    if (ps->objs_count + 2 >= ps->objs_len) {
        /* not enough room for another pair, so move to a bigger table */
        int grown = ps->objs_len + 256;
        ref *bigger;

        if (grown > ps->objs_maxlen) {
            pdf_scan_msgf(ps, "obj_add: too many objects to cache\n");
            return 0;
        }
        bigger = (ref *)malloc(grown * sizeof(ref));
        if (bigger == NULL) {
            pdf_scan_msgf(ps, "obj_add: Out of memory\n");
            return 0;
        }
        memcpy(bigger, ps->objs, ps->objs_len * sizeof(ref));
        free(ps->objs);
        ps->objs = bigger;
        ps->objs_len = grown;
    }

    ps->objs[ps->objs_count] = make_int(objnum);
    ps->objs_count += 1;
    ps->objs[ps->objs_count] = objref;
    ps->objs_count += 1;
    return ps->objs_count;
}
804 
obj_find(PDFSCAN * ps,int objnum)805 static ref obj_find(PDFSCAN *ps, int objnum)
806 {
807     int i;
808     for (i=0; i<ps->objs_count; i+=2) {
809 	if (objnum == ps->objs[i].value.intval)
810 	    return ps->objs[i+1];
811     }
812     return invalidref;
813 }
814 
/* Forget all cached objects.  Only the count is reset; the cache
 * storage itself is left allocated.
 */
static void clear_objs(PDFSCAN *ps)
{
    ps->objs_count = 0;
}
819 
820 /*****************************************************************/
821 /* token parsing */
822 
/* Return 1 if ch is one of the characters treated as white space
 * between tokens (NUL, tab, line feed, form feed, carriage return,
 * space), otherwise 0.
 */
static int is_white(char ch)
{
    switch (ch) {
      case '\0':
      case '\t':
      case '\n':
      case '\f':
      case '\r':
      case ' ':
        return 1;
      default:
        return 0;
    }
}
828 
/* Return 1 if ch is one of the delimiter characters
 * ( ) < > [ ] { } / %  which end a token without needing white space,
 * otherwise 0.
 */
static int is_delimiter(char ch)
{
    switch (ch) {
      case '(': case ')':
      case '<': case '>':
      case '[': case ']':
      case '{': case '}':
      case '/':
      case '%':
        return 1;
      default:
        return 0;
    }
}
837 
838 
839 /* Scan next token from buffer, returning token type and offset to begin
840  * and end of token.
841  * Return 0 if OK, 1 if no token or not enough data, -1 on error
842  */
pdf_scan_token(const char * buf,int buflen,rtype * ttype,int * tbegin,int * tend)843 static int pdf_scan_token(const char *buf, int buflen,
844     rtype *ttype, int *tbegin, int *tend)
845 {
846     int code = -1;
847     int i = 0;
848     rtype type;
849     int begin, end;
850     *ttype = type = invalidtype;
851     *tbegin = begin = 0;
852     *tend = end = 0;
853     while ((i < buflen) && is_white(buf[i]))
854 	i++;
855     if (i == buflen)
856 	return 1;
857 
858     begin = i;
859     if (buf[i] == '%') {
860 	while (i < buflen) {
861 	    if ((buf[i] == '\n') || (buf[i] == '\r')) {
862 		type = commenttype;
863 		end = i;
864 		code = 0;
865 		break;
866 	    }
867 	    i++;
868 	}
869         if (i >= buflen)
870 	    code = 1;
871 
872     }
873     else if (buf[i] == '(') {
874 	/* string */
875 	int pcount = 0;
876 	type = stringtype;
877 	i++;
878 	while (i < buflen) {
879 	    if (buf[i] == '\\')
880 		i++;
881 	    else if (buf[i] == '(')
882 		pcount++;
883 	    else if (buf[i] == ')') {
884 		if (pcount <= 0) {
885 		    end = i+1;
886 		    code = 0;
887 		    break;
888 		}
889 		else
890 		    pcount--;
891 	    }
892 	    i++;
893 	}
894 	if (i >= buflen)
895 	    code = 1;
896     }
897     else if (buf[i] == '<') {
898 	i++;
899 	if (i >= buflen) {
900 	    code = 1;
901 	}
902 	else if (buf[i] == '<') {
903 	    /* marktype */
904 	    end = i+1;
905 	    type = marktype;
906 	    code = 0;
907 	}
908 	else {
909 	    /* hexadecimal string */
910 	    type = stringtype;
911 	    while (i < buflen) {
912 		if (buf[i] == '>') {
913 		    end = i+1;
914 		    code = 0;
915 		    break;
916 		}
917 		i++;
918 	    }
919 	    if (i >= buflen)
920 		code = 1;
921 	}
922     }
923     else if (buf[i] == '[') {
924 	code = 0;
925 	end = i+1;
926 	type = marktype;
927     }
928     else if (buf[i] == '/') {
929 	/* name */
930 	type = nametype;
931 	i++;
932 	while (i < buflen) {
933 	    if (is_white(buf[i]) || is_delimiter(buf[i])) {
934 		end = i;
935 		code = 0;
936 		break;
937 	    }
938 	    i++;
939 	}
940 	if (i >= buflen)
941 	    code = 1;
942     }
943     else if (is_delimiter(buf[i])) {
944 	/* skip over delimiter */
945 	if (buf[i] == '>') {
946 	    i++;
947 	    if (i < buflen) {
948 		if (buf[i] == '>') {
949 		    type = optype;
950 		    end = i+1;
951 		    code = 0;
952 		}
953 		else
954 		    code = -1;
955 	    }
956 	}
957 	else {
958 	    type = optype;
959 	    end = i+1;
960 	    code = 0;
961 	}
962 	if (i >= buflen)
963 	    code = 1;
964     }
965     else {
966 	/* First assume that it is an op */
967 	type = optype;
968 	while (i < buflen) {
969 	    if (is_white(buf[i]) || is_delimiter(buf[i])) {
970 		end = i;
971 		code = 0;
972 		break;
973 	    }
974 	    i++;
975 	}
976 	if (i >= buflen)
977 	    code = 1;
978 
979 	/* try to convert it into a bool */
980 	if ((code == 0) && (type == optype)) {
981 	    if ((end - begin == 4) &&
982 		(memcmp(buf+begin, "true", 4) == 0)) {
983 		type = booltype;
984 	    }
985 	    else if ((end - begin == 5) &&
986 		(memcmp(buf+begin, "false", 5) == 0)) {
987 		type = booltype;
988 	    }
989 	}
990 
991 	/* try to convert it into an integer */
992 	if ((code == 0) && (type == optype)) {
993 	    int j;
994 	    char ch;
995 	    BOOL isreal = FALSE;
996 	    BOOL isnum = TRUE;
997 	    for (j=begin; j<end; j++) {
998 		ch = buf[j];
999 		if (ch == '.')
1000 		    isreal = TRUE;
1001 		if (!((ch == '-') || (ch == '+') || (ch == '.') ||
1002 		    isdigit((int)ch)))
1003 		    isnum = FALSE;
1004 	    }
1005 	    if (isnum) {
1006 		if (isreal)
1007 		    type = realtype;
1008 		else
1009 		    type = integertype;
1010 	    }
1011 	}
1012     }
1013 
1014     *ttype = type;
1015     *tbegin = begin;
1016     *tend = end;
1017     return code;
1018 }
1019 
1020 /*****************************************************************/
1021 
/* Release everything owned by the scanner: close the file, free the
 * token buffer, xref table, operand stack and object cache, and then
 * zero the whole structure.
 * The composite object memory list is not freed here.
 */
static void pdf_scan_finish(PDFSCAN *ps)
{
    if (ps->file != NULL) {
        fclose(ps->file);
        ps->file = NULL;
    }

    /* free(NULL) is harmless, so no need to test each pointer */
    free(ps->buf);
    ps->buf = NULL;
    ps->buflen = 0;

    free(ps->xref);
    ps->xref = NULL;
    ps->xref_len = 0;

    free(ps->ostack);
    ps->ostack = NULL;
    ps->ostack_len = 0;
    ps->ostack_idx = 0;

    free(ps->objs);
    ps->objs = NULL;
    ps->objs_len = 0;
    ps->objs_count = 0;

    memset(ps, 0, sizeof(PDFSCAN));
}
1053 
/* Open ps->filename for binary reading and store the stream in
 * ps->file.  Returns 0 on success, -1 if the file cannot be opened.
 */
static int pdf_scan_open_file(PDFSCAN *ps)
{
    ps->file = csfopen(ps->filename, TEXT("rb"));
    return (ps->file != NULL) ? 0 : -1;
}
1061 
/* Prepare a scanner for the file called name: record the name, open
 * the file and allocate the token buffer, operand stack and object
 * cache.
 * Returns 0 on success, -1 if the name is too long or the file cannot
 * be opened, -2 if memory is exhausted (in which case everything is
 * released again by pdf_scan_finish).
 */
static int pdf_scan_init(PDFSCAN *ps, const TCHAR *name)
{
    /* size of the name in bytes, including its terminator */
    int namebytes = (int)(cslen(name)+1) * sizeof(TCHAR);

    if (namebytes > (int)sizeof(ps->filename))
        return -1;
    memcpy(ps->filename, name, namebytes);
    if (pdf_scan_open_file(ps) != 0)
        return -1;

    /* token buffer */
    ps->buflen = 256;
    ps->buf = (char *)malloc(ps->buflen);
    if (ps->buf == NULL)
        goto nomem;

    /* operand stack, initially empty */
    ps->ostack_maxlen = 4096;
    ps->ostack_len = 256;
    ps->ostack_idx = 0;
    ps->ostack = (ref *)malloc(ps->ostack_len * sizeof(ref));
    if (ps->ostack == NULL)
        goto nomem;
    /* slot 0 is never a real operand, so make it invalid */
    ps->ostack[0].type = invalidtype;
    ps->ostack[0].rsize = 0;
    ps->ostack[0].value.voidval = NULL;

    /* object cache, initially empty */
    ps->objs_maxlen = 1024;
    ps->objs_len = 256;
    ps->objs_count = 0;
    ps->objs = (ref *)malloc(ps->objs_len * sizeof(ref));
    if (ps->objs == NULL)
        goto nomem;

    ps->pagenum = -1;   /* no cached media info yet */
    return 0;

nomem:
    pdf_scan_finish(ps);
    return -2;
}
1103 
/* Move the read position of the PDF file and discard the contents of
 * the token buffer.
 *   PDFSEEK_SET  offset is an absolute file offset
 *   PDFSEEK_CUR  offset is relative to the end of the current token
 *   PDFSEEK_END  go to the end of the file; offset is ignored
 * Returns the result of fseek(), or -1 if whence is not recognised.
 */
static int pdf_scan_seek(PDFSCAN *ps, long offset, PDFSEEK whence)
{
    int code = -1;

    if (whence == PDFSEEK_END) {
        code = fseek(ps->file, 0, SEEK_END);
        ps->begin = ps->end = ps->len = 0;
        ps->offset = ftell(ps->file);
    }
    else if ((whence == PDFSEEK_CUR) || (whence == PDFSEEK_SET)) {
        /* a relative seek is turned into an absolute one first */
        if (whence == PDFSEEK_CUR)
            offset = ps->offset + ps->end + offset;
        ps->begin = ps->end = ps->len = 0;
        code = fseek(ps->file, offset, SEEK_SET);
        ps->offset = offset;
    }
    return code;
}
1123 
/* Read next token from PDF file */
/* Return 0 if OK, or -1 if EOF, -2 if error */
/* On success, sets ps->token_type to the token type and
 * ps->begin / ps->end to the token's extent within ps->buf.
 * If the token is the operator "stream", ps->instream is set so
 * that the next call skips forward to the matching "endstream".
 */
static int pdf_scan_next_token(PDFSCAN *ps)
{
    int code = 0;
    int count;
    rtype type=invalidtype;
    int begin=0, end=0;

    /* Within this loop code == 1 means "need more input":
     * it is set below while skipping a stream, and the loop also
     * repeats when pdf_scan_token() returns 1.
     */
    do {
	if ((code == 1) && ps->end) {
	    /* move characters to front of buffer */
	    if (ps->len - ps->end)
		memmove(ps->buf, ps->buf+ps->end, ps->len - ps->end);
	    /* ps->offset advances by the number of bytes discarded */
	    ps->offset += ps->end;
	    ps->len = ps->len - ps->end;
	    ps->begin = 0;
	    ps->end = 0;
	}

	if ((code == 1) && (ps->len >= ps->buflen)) {
	    /* increase buffer size */
	    /* The buffer is full and still not enough: double it. */
	    char *newbuf;
	    int newbuflen = 2 * ps->buflen;
	    newbuf = (char *)malloc(newbuflen);
	    if (newbuf) {
		memcpy(newbuf, ps->buf, ps->buflen);
		free(ps->buf);
		ps->buf = newbuf;
		ps->buflen = newbuflen;
	    }
	    else {
		pdf_scan_msgf(ps, "Out of memory in pdf_scan_next_token\n");
		pdf_scan_msgf(ps, "Tried to realloc %d to %d\n",
		    ps->buflen, newbuflen);
		code = -2;
		break;
	    }
	}

	if ((code == 1) || (ps->len == 0)) {
	    /* Append to the buffer from the file.  A zero-length read
	     * is reported as EOF.
	     */
	    count = (int)fread(ps->buf+ps->len, 1, ps->buflen-ps->len,
		ps->file);
	    if (count == 0) {
		pdf_scan_msgf(ps, "EOF in pdf_scan_next_token\n");
		code = -1;
		break;
	    }
	    ps->len += count;
	}

	while (ps->instream) {
	    /* We are in a stream.  Keep reading until we find
	     * the endstream.  This isn't robust. It can be fooled
	     * by "endstream" occurring within a stream.
	     */
	    /* advance ps->end to the next 'e', a candidate start */
	    while ((ps->end < ps->len) && (ps->buf[ps->end] != 'e'))
		ps->end++;
	    /* look for endstream */
	    if (ps->end + 9 >= ps->len) {
		code = 1;	/* need more */
		break;
	    }
	    if (memcmp(ps->buf+ps->end, "endstream", 9) == 0)
		ps->instream = FALSE;
	    else
		ps->end++;
	}
	/* Scan the unread part of the buffer.  begin and end are
	 * added to ps->end below, so they are offsets relative to it.
	 */
	if (!ps->instream)
	    code = pdf_scan_token(ps->buf+ps->end, ps->len - ps->end,
		&type, &begin, &end);
    } while (code == 1);


    if (code == 0) {
	/* got a token */
	ps->begin = ps->end + begin;
	ps->end = ps->end + end;
	ps->token_type = type;

	/* Remember that stream data follows, so the next call skips it */
	if ((type == optype) && (ps->end-ps->begin == 6) &&
		(memcmp(ps->buf+ps->begin, "stream", 6) == 0))
	    ps->instream = TRUE;
    }

    return code;
}
1212 
1213 /*****************************************************************/
/* Reading %%EOF, xref, trailer */
1215 
/* Find the start of the last line that ends before str[len].
 *
 * Trailing '\r' / '\n' characters at the end of str[0..len-1] are
 * skipped, then the search moves backwards to the preceding EOL.
 * Returns the index of the first character after that EOL, or -1
 * if no EOL was found (which includes a line that begins at
 * str[0]) or if there is nothing to search.
 */
static int
previous_line(const char *str, int len)
{
    int i;

    /* An empty range would otherwise read str[-1]. */
    if ((str == NULL) || (len <= 0))
        return -1;

    i = len - 1;
    /* first skip over EOL */
    while ((i > 0) && ((str[i]=='\r') || (str[i]=='\n')))
        i--;
    /* then skip back over the line itself */
    while ((i > 0) && !((str[i]=='\r') || (str[i]=='\n')))
        i--;
    if (!((str[i]=='\r') || (str[i]=='\n')))
        return -1; /* didn't find a line */
    return i+1;
}
1229 
1230 static int
pdf_scan_find_xref(PDFSCAN * ps)1231 pdf_scan_find_xref(PDFSCAN *ps)
1232 {
1233     char buf[4096];
1234     int i, j;
1235     int code = -1;
1236     int count;
1237     pdf_scan_seek(ps, 0, PDFSEEK_END);
1238     count = min((int)sizeof(buf), ps->offset);
1239     pdf_scan_seek(ps, -count, PDFSEEK_CUR);
1240     count = (int)fread(buf, 1, sizeof(buf), ps->file);
1241     pdf_scan_seek(ps, 0, PDFSEEK_SET);
1242     if (count == 0)
1243 	return -1;
1244     i = count - 5;
1245     while (i > 0) {
1246 	/* Find %%EOF */
1247 	if (memcmp(buf+i, "%%EOF", 5) == 0) {
1248 	    code = 0;
1249 	    break;
1250 	}
1251 	i--;
1252     }
1253     if (i == 0) {
1254 	pdf_scan_msgf(ps, "Failed to find %%EOF\n");
1255 	code = -1;
1256     }
1257     if (code == 0) {
1258 	/* Look for xref table offset */
1259 	j = previous_line(buf, i);
1260 	if (j >= 0)
1261 	    ps->xref_offset = atol(buf+j);
1262 	else
1263 	    code = -1;
1264 	i = j;
1265 	if (ps->xref_offset == 0)
1266 	    code = -1;
1267 	if (code != 0)
1268 	    pdf_scan_msgf(ps, "Failed to find cross reference table\n");
1269     }
1270 
1271     if (code == 0) {
1272 	/* Look for "startxref" */
1273 	j = previous_line(buf, i);
1274 	if (j >= 0) {
1275 	    if (memcmp(buf+j, "startxref", 9) != 0)
1276 		code = -1;
1277 	}
1278 	else {
1279 	    code = -1;
1280 	}
1281 	if (code != 0)
1282 	    pdf_scan_msgf(ps, "Failed to find startxref\n");
1283     }
1284     return code;
1285 }
1286 
1287 /* Read a cross reference table */
1288 /* This is called for each cross reference table */
1289 static int
pdf_scan_read_xref(PDFSCAN * ps,unsigned long xref_offset)1290 pdf_scan_read_xref(PDFSCAN *ps, unsigned long xref_offset)
1291 {
1292     int code;
1293     int i;
1294     int first = 0;
1295     int count = 0;
1296     unsigned long prev = 0;
1297     unsigned long offset = 0;
1298     int generation = 0;
1299     BOOL used = FALSE;
1300     pdf_scan_seek(ps, xref_offset, PDFSEEK_SET);
1301     code = pdf_scan_next_token(ps);
1302     if (code == 0)
1303 	code = op_check(ps, "xref");
1304     while (code == 0) {
1305         code = pdf_scan_next_token(ps);
1306         if ((code == 0) && is_optoken(ps, "trailer"))
1307 	    break;	/* finished this xref table */
1308 	if (code == 0) {
1309 	    first = atoi(ps->buf + ps->begin);
1310             code = pdf_scan_next_token(ps);
1311 	}
1312 	if (code == 0) {
1313 	    count = atoi(ps->buf + ps->begin);
1314 	}
1315 	if (code == 0) {
1316 	    /* make sure there is enough space in the table */
1317 	    if (first + count > ps->xref_len) {
1318 		int len = (first + count) * sizeof(PDFXREF);
1319 		PDFXREF *newxref = (PDFXREF *)malloc(len);
1320 		if (newxref) {
1321 		    memset(newxref, 0, len);
1322 		    memcpy(newxref, ps->xref, ps->xref_len * sizeof(PDFXREF));
1323 		    free(ps->xref);
1324 		    ps->xref = newxref;
1325 		    ps->xref_len = first + count;
1326 		}
1327 		else {
1328 		    pdf_scan_msgf(ps, "pdf_scan_read_xref: out of memory\n");
1329 		    code = -2;
1330 		    break;
1331 		}
1332 	    }
1333 	}
1334 	for (i=first; i<first+count; i++) {
1335             code = pdf_scan_next_token(ps);
1336 	    if (code == 0) {
1337 		offset = atol(ps->buf+ps->begin);
1338                 code = pdf_scan_next_token(ps);
1339 	    }
1340 	    if (code == 0) {
1341 		generation = atoi(ps->buf+ps->begin);
1342                 code = pdf_scan_next_token(ps);
1343 	    }
1344 	    if (code == 0) {
1345 		if (is_optoken(ps, "n"))
1346 		    used = TRUE;
1347 		else if (is_optoken(ps, "f"))
1348 		    used = FALSE;
1349 		else
1350 		    code = -1;
1351 	    }
1352 	    /* We don't deal correctly with generation.
1353 	     * We assume that the first xref table that marks an
1354 	     * object as used is the definitive reference.
1355 	     */
1356 	    if (code == 0) {
1357 		if (!(ps->xref[i].used)) {
1358 		    ps->xref[i].offset = offset;
1359 		    ps->xref[i].generation = generation;
1360 		    ps->xref[i].used = used;
1361 		}
1362 	    }
1363 	}
1364     }
1365 
1366     if (code == 0) {
1367 	code = pdf_scan_read_trailer(ps, &prev);
1368 	if ((code == 0) && prev && prev != ps->xref_offset) {
1369 	    /* read older xref and trailer */
1370 	    code = pdf_scan_read_xref(ps, prev);
1371 	}
1372     }
1373 
1374     return code;
1375 }
1376 
1377 /* Read a trailer */
1378 static int
pdf_scan_read_trailer(PDFSCAN * ps,unsigned long * prev)1379 pdf_scan_read_trailer(PDFSCAN *ps, unsigned long *prev)
1380 {
1381     int code = 0;
1382     ref p;
1383     code = pdf_scan_next_token(ps);
1384     if ((code == 0) && (ps->token_type != marktype))
1385 	code = -1;
1386     push_token(ps);
1387     while (code == 0) {
1388         code = pdf_scan_next_token(ps);
1389 	if (code != 0)
1390 	    break;
1391 	if (is_optoken(ps, "startxref")) {
1392 	    if (ps->root == 0) {
1393 	        p = dict_get(ps, "Root");
1394 	        if (p.type == objtype)
1395 		    ps->root = p.value.objval;
1396 		else {
1397 		    pdf_scan_msgf(ps,
1398 			"trailer /Root requires indirect reference\n");
1399 		    code = -1;
1400 		}
1401 	    }
1402 	    p = dict_get(ps, "Prev");
1403 	    if (p.type == integertype)
1404 		*prev = p.value.intval;
1405 	    else if (p.type != invalidtype) {
1406 		code = -1;
1407 		pdf_scan_msgf(ps, "trailer /Prev requires integer\n");
1408 	    }
1409 	    break;
1410 	}
1411 	if (process_op(ps) != 0)
1412 	    push_token(ps);
1413     }
1414     if (code != 0)
1415 	pdf_scan_msgf(ps, "Error reading trailer\n");
1416     return code;
1417 }
1418 
1419 
/* Seek to object objnum using the xref table and read the
 * "<number> <generation> obj" tokens that begin it.
 * Returns 0 on success.  Returns -1 if objnum is 0, out of range
 * or marked unused, or if the object number found in the file is
 * not objnum.  Other token errors are returned unchanged.
 */
static int pdf_scan_read_object_start(PDFSCAN *ps, int objnum)
{
    int code = 0;
    int value = 0;
    if (objnum == 0) {
        pdf_scan_msgf(ps, "Object 0 is always unused\n");
        return -1;
    }
    /* Object numbers come from the file, so a negative value must be
     * rejected here as well or ps->xref would be indexed out of range.
     */
    if ((objnum < 0) || (objnum >= ps->xref_len)) {
        pdf_scan_msgf(ps, "Object reference %d doesn't exist.  There are only %d objects\n", objnum, ps->xref_len);
        return -1;
    }
    if (!ps->xref[objnum].used) {
        pdf_scan_msgf(ps, "Object %d is unused\n", objnum);
        return -1;
    }
    pdf_scan_seek(ps, ps->xref[objnum].offset, PDFSEEK_SET);

    code = pdf_scan_next_token(ps);		/* object number */
    if (code == 0)
        code = type_check(ps, integertype);
    if (code == 0) {
        value = atoi(ps->buf+ps->begin);	/* object number */
        code = pdf_scan_next_token(ps); 	/* generation */
    }
    if (code == 0)
        code = type_check(ps, integertype);
    if (code == 0)
        code = pdf_scan_next_token(ps);   	/* obj */
    if (code == 0)
        code = op_check(ps, "obj");

    /* value is still 0 if the object number could not be read */
    if (value != objnum) {
        pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
        return -1;
    }
    return code;
}
1458 
1459 /*****************************************************************/
1460 
1461 /* Read an object, and leave it on the stack */
1462 static int
pdf_scan_read_object(PDFSCAN * ps,int objnum)1463 pdf_scan_read_object(PDFSCAN *ps, int objnum)
1464 {
1465     int code;
1466     ref objref = obj_find(ps, objnum);
1467 
1468     if (objref.type != invalidtype) {
1469 	/* found in cache */
1470 	push_stack(ps, objref);
1471 	return 0;
1472     }
1473 
1474     code = pdf_scan_read_object_start(ps, objnum);
1475     if (code) {
1476 	pdf_scan_msgf(ps, "Didn't find object %d\n", objnum);
1477 	return -1;
1478     }
1479 
1480     code = pdf_scan_next_token(ps);
1481     if ((code == 0) && (ps->token_type != marktype))
1482 	code = -1;
1483     push_token(ps);
1484     while (code == 0) {
1485         code = pdf_scan_next_token(ps);
1486 	if (code != 0)
1487 	    break;
1488 	if (is_optoken(ps, "endobj")) {
1489 	    obj_add(ps, objnum, top_stack(ps));
1490 	    break;
1491 	}
1492 	if (process_op(ps) != 0)
1493 	    push_token(ps);
1494     }
1495     return code;
1496 }
1497 
1498 /*****************************************************************/
1499 
1500 /* find the object number for a page */
1501 /* Return <= 0 if failure, or object number */
1502 /* First page is 0 */
pdf_scan_find_page(PDFSCAN * ps,int pagenum)1503 static int pdf_scan_find_page(PDFSCAN *ps, int pagenum)
1504 {
1505     int code;
1506     ref kids;
1507     ref r;
1508     int pageobj = 0;
1509     int count_base = 0;
1510     int count;
1511     ref *pref;
1512     int i;
1513     int inext;
1514 
1515     if (pagenum >= ps->page_count) {
1516 	pdf_scan_msgf(ps, "Not that many pages\n");
1517 	return -1;
1518     }
1519     code = pdf_scan_read_object(ps, ps->pages);
1520     if (code) {
1521 	pdf_scan_msgf(ps, "Didn't find Pages object\n");
1522 	return -1;
1523     }
1524     /* iterate through Kids, looking for the one that includes this page */
1525     kids = dict_get(ps, "Kids");
1526     if (kids.type != arraytype) {
1527 	pdf_scan_msgf(ps, "/Pages object %d must contain /Kids array\n",
1528 	    ps->pages);
1529 	return -1;
1530     }
1531     pop_stack(ps);	/* First Pages */
1532     for (i = 0; (i < kids.rsize) && (code == 0); i=inext) {
1533 	inext = i+1;
1534 	pref = &kids.value.arrayval[i];
1535 	if (pref->type == objtype)
1536         code = pdf_scan_read_object(ps, pref->value.objval);
1537 	if (code == 0) {
1538 	    r = dict_get(ps, "Type");
1539 	    if (nameref_equals(&r, "Page")) {
1540 		if (count_base + i == pagenum) {
1541 		    /* this is it */
1542 		    pageobj = pref->value.objval;
1543 		    pop_stack(ps);	/* the wanted page */
1544 		    break;
1545 		}
1546 	    }
1547 	    else if (nameref_equals(&r, "Pages")) {
1548 	        r = dict_get(ps, "Count");
1549 		if (r.type == integertype) {
1550 		    count = r.value.intval;
1551 		    if (pagenum < count_base + count) {
1552 			/* It's under this child */
1553 			inext = 0;
1554 		        pop_stack(ps);	/* The old /Pages */
1555 			code = pdf_scan_read_object(ps, pref->value.objval);
1556 			if (code == 0) {
1557 			    kids = dict_get(ps, "Kids");
1558 			    if (kids.type != arraytype) {
1559 				pdf_scan_msgf(ps,
1560 				"/Pages object %d must contain /Kids array\n",
1561 				    pref->value.objval);
1562 				code = -1;
1563 			    }
1564 			}
1565 		    }
1566 		    else {
1567 			count_base += count;
1568 		    }
1569 		}
1570 		else {
1571 		    pdf_scan_msgf(ps, "/Pages /Count must be integer\n");
1572 		    code = -1;
1573 		}
1574 	    }
1575 	    else {
1576 		pdf_scan_msgf(ps,
1577 		    "pdf_scan_find_page: object %d isn't Pages or Page\n",
1578 		    pref->value.objval);
1579 		code = -1;
1580 	    }
1581 	    pop_stack(ps);
1582 	}
1583     }
1584 
1585     if (pageobj <= 0) {
1586 	pdf_scan_msgf(ps, "Failed to find page %d\n", pagenum+1);
1587 	code = -1;
1588     }
1589 
1590     if (code)
1591 	return -1;
1592 
1593     /* Don't clean up, since we will use the cached objects
1594      * when extracting the page media.
1595      */
1596 
1597     return pageobj;
1598 }
1599 
1600 
1601 static int
pdf_scan_read_page_count(PDFSCAN * ps)1602 pdf_scan_read_page_count(PDFSCAN *ps)
1603 {
1604     int code;
1605     ref p;
1606     code = pdf_scan_read_object(ps, ps->pages);
1607     if (code) {
1608 	pdf_scan_msgf(ps, "Didn't find Pages object\n");
1609 	return -1;
1610     }
1611 
1612     p = dict_get(ps, "Type");
1613     if (!nameref_equals(&p, "Pages")) {
1614 	pdf_scan_msgf(ps, "Pages object didn't have /Type /Pages\n");
1615 	return -1;
1616     }
1617     p = dict_get(ps, "Count");
1618     if (p.type != integertype) {
1619 	pdf_scan_msgf(ps, "Pages object didn't integer /Count\n");
1620 	return -1;
1621     }
1622     ps->page_count = p.value.intval;
1623 
1624     return code;
1625 }
1626 
/* Store the numeric value of r in *f.
 * Returns 0 for a real or integer ref.  Returns -1, leaving *f
 * untouched, for any other type.
 */
static int convert_float(ref r, float *f)
{
    switch (r.type) {
        case realtype:
            *f = r.value.realval;
            return 0;
        case integertype:
            *f = (float)r.value.intval;
            return 0;
        default:
            return -1;
    }
}
1637 
1638 static int
pdf_scan_read_bbox(PDFBBOX * box,ref array)1639 pdf_scan_read_bbox(PDFBBOX *box, ref array)
1640 {
1641     int code = 0;
1642     if (array.type != arraytype)
1643 	code = -1;
1644     if (array.rsize != 4)
1645 	code = -1;
1646     if (code == 0)
1647         code = convert_float(array.value.arrayval[0], &box->llx);
1648     if (code == 0)
1649 	code = convert_float(array.value.arrayval[1], &box->lly);
1650     if (code == 0)
1651 	code = convert_float(array.value.arrayval[2], &box->urx);
1652     if (code == 0)
1653 	code = convert_float(array.value.arrayval[3], &box->ury);
1654     return code;
1655 }
1656 
1657 /* Read catalog and leave on stack */
1658 static int
pdf_scan_read_catalog(PDFSCAN * ps)1659 pdf_scan_read_catalog(PDFSCAN *ps)
1660 {
1661     int code;
1662     ref p;
1663     /* Read root object, making sure it is /Type /Catalog,
1664      * and that /Pages is an indirect reference
1665      */
1666     code = pdf_scan_read_object(ps, ps->root);
1667     if (code) {
1668 	pdf_scan_msgf(ps, "Didn't find Root object\n");
1669 	return -1;
1670     }
1671 
1672     p = dict_get(ps, "Type");
1673     if (!nameref_equals(&p, "Catalog")) {
1674 	pdf_scan_msgf(ps, "Root object didn't have /Type /Catalog\n");
1675 	return -1;
1676     }
1677     p = dict_get(ps, "Pages");
1678     if (p.type != objtype) {
1679 	pdf_scan_msgf(ps, "Root object didn't indirect reference to /Pages\n");
1680 	return -1;
1681     }
1682     ps->pages = p.value.intval;
1683     return 0;
1684 }
1685 
1686 /*****************************************************************/
1687 /* public functions */
1688 
1689 
1690 void
pdf_scan_close(PDFSCAN * ps)1691 pdf_scan_close(PDFSCAN *ps)
1692 {
1693     pdf_scan_cleanup(ps);
1694     pdf_scan_finish(ps);
1695     free(ps);
1696 }
1697 
1698 
1699 PDFSCAN *
pdf_scan_open(const TCHAR * filename,void * handle,int (* fn)(void * handle,const char * ptr,int len))1700 pdf_scan_open(const TCHAR *filename, void *handle,
1701     int (*fn)(void *handle, const char *ptr, int len))
1702 {
1703     int code;
1704     int rotate;
1705     PDFBBOX mediabox, cropbox;
1706     PDFSCAN *ps = (PDFSCAN *)malloc(sizeof(PDFSCAN));
1707     if (ps == NULL)
1708 	return NULL;
1709     memset(ps, 0, sizeof(PDFSCAN));
1710     ps->handle = handle;
1711     ps->print_fn = fn;
1712     code = pdf_scan_init(ps, filename);
1713     if (code == -1)
1714 	pdf_scan_msgf(ps, "Couldn't open PDF file\n");
1715     else if (code != 0)
1716 	pdf_scan_msgf(ps, "Error initialising PDF scanner\n");
1717 
1718     if (code == 0)
1719         code = pdf_scan_find_xref(ps);
1720     if (code == 0)
1721 	code = pdf_scan_read_xref(ps, ps->xref_offset);
1722     if (code == 0)
1723 	code = pdf_scan_read_catalog(ps);
1724     if (code == 0)
1725 	code = pdf_scan_read_page_count(ps);
1726     if (code == 0)
1727 	code = pdf_scan_page_media(ps, 0, &rotate, &mediabox, &cropbox);
1728 
1729     pdf_scan_cleanup(ps);
1730     if (code != 0) {
1731 	pdf_scan_close(ps);
1732 	ps = NULL;
1733     }
1734     return ps;
1735 }
1736 
1737 int
pdf_scan_page_count(PDFSCAN * ps)1738 pdf_scan_page_count(PDFSCAN *ps)
1739 {
1740     if (ps == NULL)
1741 	return 0;
1742     return ps->page_count;
1743 }
1744 
1745 int
pdf_scan_page_media(PDFSCAN * ps,int pagenum,int * rotate,PDFBBOX * mediabox,PDFBBOX * cropbox)1746 pdf_scan_page_media(PDFSCAN *ps, int pagenum, int *rotate,
1747     PDFBBOX *mediabox, PDFBBOX *cropbox)
1748 {
1749     BOOL found_rotate = FALSE;
1750     BOOL found_mediabox = FALSE;
1751     BOOL found_cropbox = FALSE;
1752     BOOL has_parent = TRUE;
1753     ref p, objref;
1754     int objnum;
1755 
1756     if (ps == NULL)
1757 	return -1;
1758 
1759     if (pagenum == ps->pagenum) {
1760 	/* Used cached values */
1761 	*rotate = ps->rotate;
1762 	*mediabox = ps->mediabox;
1763 	*cropbox = ps->cropbox;
1764 	return 0;
1765     }
1766 
1767     if (ps->file == NULL) {
1768 	if (pdf_scan_open_file(ps) != 0)
1769 	    return -1;
1770     }
1771     objnum = pdf_scan_find_page(ps, pagenum);
1772     if (objnum <= 0) {
1773 	pdf_scan_cleanup(ps);
1774 	return -1;
1775     }
1776     if (pdf_scan_read_object(ps, objnum) < 0) {
1777 	pdf_scan_cleanup(ps);
1778 	return -1;
1779     }
1780 
1781     while (has_parent) {
1782 	if (!found_rotate) {
1783 	    p = dict_get(ps, "Rotate");
1784 	    if (p.type == integertype) {
1785 		*rotate = p.value.intval;
1786 		found_rotate = TRUE;
1787 	    }
1788 	}
1789 	if (!found_mediabox) {
1790 	    p = dict_get(ps, "MediaBox");
1791 	    if (pdf_scan_read_bbox(mediabox, p) == 0)
1792 		found_mediabox = TRUE;
1793 	}
1794 	if (!found_cropbox) {
1795 	    p = dict_get(ps, "CropBox");
1796 	    if (pdf_scan_read_bbox(cropbox, p) == 0)
1797 		found_cropbox = TRUE;
1798 	}
1799         if (found_rotate && found_mediabox && found_cropbox)
1800 	    break;
1801 
1802 	p = dict_get(ps, "Parent");
1803 	if (p.type == objtype) {
1804 	    objref = pop_stack(ps);
1805 	    if (pdf_scan_read_object(ps, p.value.objval) < 0) {
1806 		push_stack(ps, objref);
1807 		has_parent = FALSE;
1808 	    }
1809 	}
1810 	else
1811 	    has_parent = FALSE;
1812     }
1813     pop_stack(ps);
1814     if (!found_cropbox) {
1815 	*cropbox = *mediabox;
1816 	found_cropbox = TRUE;
1817     }
1818     if (!found_rotate) {
1819 	*rotate = 0;
1820 	found_rotate = TRUE;
1821     }
1822 
1823     pdf_scan_cleanup(ps);
1824 
1825     if (found_rotate && found_mediabox && found_cropbox) {
1826 	/* cache these values */
1827 	ps->pagenum = pagenum;
1828 	ps->rotate = *rotate;
1829 	ps->mediabox = *mediabox;
1830 	ps->cropbox = *cropbox;
1831         return 0;
1832     }
1833 
1834     return -1;
1835 }
1836 
1837 /*****************************************************************/
1838 
1839 #ifdef DEMO_PDFSCAN
1840 
/* Message callback for the demo program: writes len bytes from ptr
 * to stdout.  handle is not used.  Always returns len.
 */
int test_print_fn(void *handle, const char *ptr, int len)
{
    size_t nbytes = len;

    (void)handle;
    fwrite(ptr, sizeof(char), nbytes, stdout);
    return len;
}
1846 
main(int argc,char * argv[])1847 int main(int argc, char *argv[])
1848 {
1849     PDFSCAN *ps;
1850     int i, count;
1851     int code;
1852     PDFBBOX mediabox, cropbox;
1853     int rotate;
1854 
1855     if (argc < 2) {
1856 	fprintf(stdout, "Usage: cpdfscan filename\n");
1857 	return 1;
1858     }
1859 
1860     ps = pdf_scan_open(argv[1], NULL, test_print_fn);
1861     if (ps) {
1862 	count = pdf_scan_page_count(ps);
1863 	pdf_scan_msgf(ps, "Page count is %d\n", count);
1864 	for (i=0; i<count; i++) {
1865 	    code = pdf_scan_page_media(ps, i, &rotate, &mediabox, &cropbox);
1866 	    if (code == 0) {
1867 	        fprintf(stdout, "Page %d /Rotate %d ", i+1, rotate);
1868 	        fprintf(stdout, "/MediaBox [%g %g %g %g] /CropBox [%g %g %g %g]\n",
1869 		    mediabox.llx, mediabox.lly, mediabox.urx, mediabox.ury,
1870 		    cropbox.llx, cropbox.lly, cropbox.urx, cropbox.ury);
1871 	    }
1872 	    else
1873 	        fprintf(stdout, "Page %d media unknown\n", i+1);
1874 	}
1875 	pdf_scan_close(ps);
1876     }
1877     return 0;
1878 }
1879 
1880 #endif
1881