1// Copyright 2014 The Go Authors.  All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Reading of PDF tokens and objects from a raw byte stream.
6
7package pdf
8
9import (
10	"fmt"
11	"io"
12	"strconv"
13)
14
// A token is a PDF token in the input stream, one of the following Go types:
//
//	bool, a PDF boolean
//	int64, a PDF integer
//	float64, a PDF real
//	string, a PDF string literal
//	keyword, a PDF keyword
//	name, a PDF name without the leading slash
//
// When the buffer's allowEOF flag is set and the input is exhausted,
// readToken returns the error value io.EOF in place of a token.
type token interface{}
25
// A name is a PDF name, without the leading slash.
// readName stores it with any #xx hexadecimal escapes already decoded.
type name string
28
// A keyword is a PDF keyword.
// Delimiter tokens used in higher-level syntax,
// such as "<<", ">>", "[", "]", "{", "}", are also treated as keywords.
// Any bare word that readKeyword does not recognize as true, false,
// an integer, or a real is returned as a keyword as well.
type keyword string
33
// A buffer holds buffered input bytes from the PDF file,
// together with the tokenizer and object-reader state that goes with them.
type buffer struct {
	r           io.Reader // source of data
	buf         []byte    // buffered data
	pos         int       // read index in buf
	offset      int64     // offset at end of buf; aka offset of next read
	tmp         []byte    // scratch space for accumulating token
	unread      []token   // read-but-then-unread tokens; used as a stack (last unread is returned first)
	allowEOF    bool      // if set, reload records io.EOF in eof instead of panicking
	allowObjptr bool      // if set, readObject recognizes "id gen R" and "id gen obj"
	allowStream bool      // if set, readDict looks for a "stream" keyword after the dictionary
	eof         bool      // set by reload once io.EOF has been seen (only when allowEOF is set)
	key         []byte    // passed to decryptString by readObject; nil disables string decryption
	useAES      bool      // passed to decryptString together with key
	objptr      objptr    // object being defined; set by readObject while it reads an "id gen obj" body
}
50
51// newBuffer returns a new buffer reading from r at the given offset.
52func newBuffer(r io.Reader, offset int64) *buffer {
53	return &buffer{
54		r:           r,
55		offset:      offset,
56		buf:         make([]byte, 0, 4096),
57		allowObjptr: true,
58		allowStream: true,
59	}
60}
61
62func (b *buffer) seek(offset int64) {
63	b.offset = offset
64	b.buf = b.buf[:0]
65	b.pos = 0
66	b.unread = b.unread[:0]
67}
68
69func (b *buffer) readByte() byte {
70	if b.pos >= len(b.buf) {
71		b.reload()
72		if b.pos >= len(b.buf) {
73			return '\n'
74		}
75	}
76	c := b.buf[b.pos]
77	b.pos++
78	return c
79}
80
81func (b *buffer) errorf(format string, args ...interface{}) {
82	panic(fmt.Errorf(format, args...))
83}
84
85func (b *buffer) reload() bool {
86	n := cap(b.buf) - int(b.offset%int64(cap(b.buf)))
87	n, err := b.r.Read(b.buf[:n])
88	if n == 0 && err != nil {
89		b.buf = b.buf[:0]
90		b.pos = 0
91		if b.allowEOF && err == io.EOF {
92			b.eof = true
93			return false
94		}
95		b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err)
96		return false
97	}
98	b.offset += int64(n)
99	b.buf = b.buf[:n]
100	b.pos = 0
101	return true
102}
103
104func (b *buffer) seekForward(offset int64) {
105	for b.offset < offset {
106		if !b.reload() {
107			return
108		}
109	}
110	b.pos = len(b.buf) - int(b.offset-offset)
111}
112
113func (b *buffer) readOffset() int64 {
114	return b.offset - int64(len(b.buf)) + int64(b.pos)
115}
116
117func (b *buffer) unreadByte() {
118	if b.pos > 0 {
119		b.pos--
120	}
121}
122
123func (b *buffer) unreadToken(t token) {
124	b.unread = append(b.unread, t)
125}
126
127func (b *buffer) readToken() token {
128	if n := len(b.unread); n > 0 {
129		t := b.unread[n-1]
130		b.unread = b.unread[:n-1]
131		return t
132	}
133
134	// Find first non-space, non-comment byte.
135	c := b.readByte()
136	for {
137		if isSpace(c) {
138			if b.eof {
139				return io.EOF
140			}
141			c = b.readByte()
142		} else if c == '%' {
143			for c != '\r' && c != '\n' {
144				c = b.readByte()
145			}
146		} else {
147			break
148		}
149	}
150
151	switch c {
152	case '<':
153		if b.readByte() == '<' {
154			return keyword("<<")
155		}
156		b.unreadByte()
157		return b.readHexString()
158
159	case '(':
160		return b.readLiteralString()
161
162	case '[', ']', '{', '}':
163		return keyword(string(c))
164
165	case '/':
166		return b.readName()
167
168	case '>':
169		if b.readByte() == '>' {
170			return keyword(">>")
171		}
172		b.unreadByte()
173		fallthrough
174
175	default:
176		if isDelim(c) {
177			b.errorf("unexpected delimiter %#q", rune(c))
178			return nil
179		}
180		b.unreadByte()
181		return b.readKeyword()
182	}
183}
184
185func (b *buffer) readHexString() token {
186	tmp := b.tmp[:0]
187	for {
188	Loop:
189		c := b.readByte()
190		if c == '>' {
191			break
192		}
193		if isSpace(c) {
194			goto Loop
195		}
196	Loop2:
197		c2 := b.readByte()
198		if isSpace(c2) {
199			goto Loop2
200		}
201		x := unhex(c)<<4 | unhex(c2)
202		if x < 0 {
203			b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:])
204			break
205		}
206		tmp = append(tmp, byte(x))
207	}
208	b.tmp = tmp
209	return string(tmp)
210}
211
// unhex returns the value (0 through 15) of the hexadecimal digit b,
// accepting both upper- and lower-case letters.
// It returns -1 if b is not a hexadecimal digit.
func unhex(b byte) int {
	if '0' <= b && b <= '9' {
		return int(b) - '0'
	}
	if 'a' <= b && b <= 'f' {
		return 10 + int(b) - 'a'
	}
	if 'A' <= b && b <= 'F' {
		return 10 + int(b) - 'A'
	}
	return -1
}
223
224func (b *buffer) readLiteralString() token {
225	tmp := b.tmp[:0]
226	depth := 1
227Loop:
228	for {
229		c := b.readByte()
230		switch c {
231		default:
232			tmp = append(tmp, c)
233		case '(':
234			depth++
235			tmp = append(tmp, c)
236		case ')':
237			if depth--; depth == 0 {
238				break Loop
239			}
240			tmp = append(tmp, c)
241		case '\\':
242			switch c = b.readByte(); c {
243			default:
244				b.errorf("invalid escape sequence \\%c", c)
245				tmp = append(tmp, '\\', c)
246			case 'n':
247				tmp = append(tmp, '\n')
248			case 'r':
249				tmp = append(tmp, '\r')
250			case 'b':
251				tmp = append(tmp, '\b')
252			case 't':
253				tmp = append(tmp, '\t')
254			case 'f':
255				tmp = append(tmp, '\f')
256			case '(', ')', '\\':
257				tmp = append(tmp, c)
258			case '\r':
259				if b.readByte() != '\n' {
260					b.unreadByte()
261				}
262				fallthrough
263			case '\n':
264				// no append
265			case '0', '1', '2', '3', '4', '5', '6', '7':
266				x := int(c - '0')
267				for i := 0; i < 2; i++ {
268					c = b.readByte()
269					if c < '0' || c > '7' {
270						b.unreadByte()
271						break
272					}
273					x = x*8 + int(c-'0')
274				}
275				if x > 255 {
276					b.errorf("invalid octal escape \\%03o", x)
277				}
278				tmp = append(tmp, byte(x))
279			}
280		}
281	}
282	b.tmp = tmp
283	return string(tmp)
284}
285
286func (b *buffer) readName() token {
287	tmp := b.tmp[:0]
288	for {
289		c := b.readByte()
290		if isDelim(c) || isSpace(c) {
291			b.unreadByte()
292			break
293		}
294		if c == '#' {
295			x := unhex(b.readByte())<<4 | unhex(b.readByte())
296			if x < 0 {
297				b.errorf("malformed name")
298			}
299			tmp = append(tmp, byte(x))
300			continue
301		}
302		tmp = append(tmp, c)
303	}
304	b.tmp = tmp
305	return name(string(tmp))
306}
307
308func (b *buffer) readKeyword() token {
309	tmp := b.tmp[:0]
310	for {
311		c := b.readByte()
312		if isDelim(c) || isSpace(c) {
313			b.unreadByte()
314			break
315		}
316		tmp = append(tmp, c)
317	}
318	b.tmp = tmp
319	s := string(tmp)
320	switch {
321	case s == "true":
322		return true
323	case s == "false":
324		return false
325	case isInteger(s):
326		x, err := strconv.ParseInt(s, 10, 64)
327		if err != nil {
328			b.errorf("invalid integer %s", s)
329		}
330		return x
331	case isReal(s):
332		x, err := strconv.ParseFloat(s, 64)
333		if err != nil {
334			b.errorf("invalid real %s", s)
335		}
336		return x
337	}
338	return keyword(string(tmp))
339}
340
// isInteger reports whether s has the form of a PDF integer:
// an optional '+' or '-' sign followed by one or more decimal digits.
func isInteger(s string) bool {
	digits := s
	if digits != "" && (digits[0] == '+' || digits[0] == '-') {
		digits = digits[1:]
	}
	if digits == "" {
		return false
	}
	for i := 0; i < len(digits); i++ {
		if d := digits[i]; d < '0' || d > '9' {
			return false
		}
	}
	return true
}
355
// isReal reports whether s has the form of a PDF real: an optional '+'
// or '-' sign followed by decimal digits containing exactly one '.',
// which may be leading, trailing or embedded. At least one digit is
// required, so ".", "+." and "-." are not reals; without that check
// readKeyword would pass them to strconv.ParseFloat and fail.
func isReal(s string) bool {
	if len(s) > 0 && (s[0] == '+' || s[0] == '-') {
		s = s[1:]
	}
	ndot, ndigit := 0, 0
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case c == '.':
			ndot++
		case '0' <= c && c <= '9':
			ndigit++
		default:
			return false
		}
	}
	return ndot == 1 && ndigit > 0
}
375
// An object is a PDF syntax object, one of the following Go types:
//
//	bool, a PDF boolean
//	int64, a PDF integer
//	float64, a PDF real
//	string, a PDF string literal
//	name, a PDF name without the leading slash
//	dict, a PDF dictionary
//	array, a PDF array
//	stream, a PDF stream
//	objptr, a PDF object reference
//	objdef, a PDF object definition
//
// An object may also be nil, to represent the PDF null.
// readObject passes through any token it does not otherwise interpret,
// so a value returned by readToken can also appear as an object.
type object interface{}
391
// A dict is a PDF dictionary, mapping names (without the leading slash)
// to objects.
type dict map[name]object
393
// An array is a PDF array: a list of objects in the order they were read.
type array []object
395
// A stream is a PDF stream as produced by readDict. The stream's data is
// not read; only its location is recorded.
type stream struct {
	hdr    dict   // dictionary that preceded the "stream" keyword
	ptr    objptr // value of the buffer's objptr when the stream was read
	offset int64  // read offset just past the end-of-line that follows "stream"
}
401
// An objptr is a PDF object reference, written "id gen R" in the file.
// readObject also uses it to identify the object being defined by
// "id gen obj".
type objptr struct {
	id  uint32 // first integer of the reference
	gen uint16 // second integer of the reference
}
406
// An objdef is a PDF indirect object definition, written
// "id gen obj ... endobj" in the file.
type objdef struct {
	ptr objptr // the id and gen that precede "obj"
	obj object // the object read after "obj"
}
411
412func (b *buffer) readObject() object {
413	tok := b.readToken()
414	if kw, ok := tok.(keyword); ok {
415		switch kw {
416		case "null":
417			return nil
418		case "<<":
419			return b.readDict()
420		case "[":
421			return b.readArray()
422		}
423		b.errorf("unexpected keyword %q parsing object", kw)
424		return nil
425	}
426
427	if str, ok := tok.(string); ok && b.key != nil && b.objptr.id != 0 {
428		tok = decryptString(b.key, b.useAES, b.objptr, str)
429	}
430
431	if !b.allowObjptr {
432		return tok
433	}
434
435	if t1, ok := tok.(int64); ok && int64(uint32(t1)) == t1 {
436		tok2 := b.readToken()
437		if t2, ok := tok2.(int64); ok && int64(uint16(t2)) == t2 {
438			tok3 := b.readToken()
439			switch tok3 {
440			case keyword("R"):
441				return objptr{uint32(t1), uint16(t2)}
442			case keyword("obj"):
443				old := b.objptr
444				b.objptr = objptr{uint32(t1), uint16(t2)}
445				obj := b.readObject()
446				if _, ok := obj.(stream); !ok {
447					tok4 := b.readToken()
448					if tok4 != keyword("endobj") {
449						b.errorf("missing endobj after indirect object definition")
450						b.unreadToken(tok4)
451					}
452				}
453				b.objptr = old
454				return objdef{objptr{uint32(t1), uint16(t2)}, obj}
455			}
456			b.unreadToken(tok3)
457		}
458		b.unreadToken(tok2)
459	}
460	return tok
461}
462
463func (b *buffer) readArray() object {
464	var x array
465	for {
466		tok := b.readToken()
467		if tok == nil || tok == keyword("]") {
468			break
469		}
470		b.unreadToken(tok)
471		x = append(x, b.readObject())
472	}
473	return x
474}
475
476func (b *buffer) readDict() object {
477	x := make(dict)
478	for {
479		tok := b.readToken()
480		if tok == nil || tok == keyword(">>") {
481			break
482		}
483		n, ok := tok.(name)
484		if !ok {
485			b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok)
486			continue
487		}
488		x[n] = b.readObject()
489	}
490
491	if !b.allowStream {
492		return x
493	}
494
495	tok := b.readToken()
496	if tok != keyword("stream") {
497		b.unreadToken(tok)
498		return x
499	}
500
501	switch b.readByte() {
502	case '\r':
503		if b.readByte() != '\n' {
504			b.unreadByte()
505		}
506	case '\n':
507		// ok
508	default:
509		b.errorf("stream keyword not followed by newline")
510	}
511
512	return stream{x, b.objptr, b.readOffset()}
513}
514
// isSpace reports whether b is one of the bytes treated as white space:
// NUL, tab, line feed, form feed, carriage return or space.
func isSpace(b byte) bool {
	return b == '\x00' || b == '\t' || b == '\n' || b == '\f' || b == '\r' || b == ' '
}
522
// isDelim reports whether b is one of the delimiter bytes
// < > ( ) [ ] { } / and %.
func isDelim(b byte) bool {
	const delims = "<>()[]{}/%"
	for i := 0; i < len(delims); i++ {
		if delims[i] == b {
			return true
		}
	}
	return false
}
530