// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Reading of PDF tokens and objects from a raw byte stream.

package pdf

import (
	"fmt"
	"io"
	"strconv"
)

// A token is a PDF token in the input stream, one of the following Go types:
//
//	bool, a PDF boolean
//	int64, a PDF integer
//	float64, a PDF real
//	string, a PDF string literal
//	keyword, a PDF keyword
//	name, a PDF name without the leading slash
//
// readToken additionally returns the value io.EOF as a token when the
// input is exhausted and the buffer's allowEOF flag is set.
type token interface{}

// A name is a PDF name, without the leading slash.
type name string

// A keyword is a PDF keyword.
// Delimiter tokens used in higher-level syntax,
// such as "<<", ">>", "[", "]", "{", "}", are also treated as keywords.
type keyword string

// A buffer holds buffered input bytes from the PDF file,
// together with the lexer state used while tokenizing them.
type buffer struct {
	r           io.Reader // source of data
	buf         []byte    // buffered data
	pos         int       // read index in buf
	offset      int64     // offset at end of buf; aka offset of next read
	tmp         []byte    // scratch space for accumulating token
	unread      []token   // queue of read but then unread tokens
	allowEOF    bool      // if set, io.EOF from r sets eof instead of raising an error
	allowObjptr bool      // if set, readObject recognizes "N G R" and "N G obj" forms
	allowStream bool      // if set, readDict recognizes a trailing "stream" keyword
	eof         bool      // set by reload when r reported io.EOF and allowEOF is set
	key         []byte    // key passed to decryptString; nil disables string decryption
	useAES      bool      // passed through to decryptString
	objptr      objptr    // object being defined; used for decryption and recorded in streams
}

// newBuffer returns a new buffer reading from r at the given offset.
52func newBuffer(r io.Reader, offset int64) *buffer { 53 return &buffer{ 54 r: r, 55 offset: offset, 56 buf: make([]byte, 0, 4096), 57 allowObjptr: true, 58 allowStream: true, 59 } 60} 61 62func (b *buffer) seek(offset int64) { 63 b.offset = offset 64 b.buf = b.buf[:0] 65 b.pos = 0 66 b.unread = b.unread[:0] 67} 68 69func (b *buffer) readByte() byte { 70 if b.pos >= len(b.buf) { 71 b.reload() 72 if b.pos >= len(b.buf) { 73 return '\n' 74 } 75 } 76 c := b.buf[b.pos] 77 b.pos++ 78 return c 79} 80 81func (b *buffer) errorf(format string, args ...interface{}) { 82 panic(fmt.Errorf(format, args...)) 83} 84 85func (b *buffer) reload() bool { 86 n := cap(b.buf) - int(b.offset%int64(cap(b.buf))) 87 n, err := b.r.Read(b.buf[:n]) 88 if n == 0 && err != nil { 89 b.buf = b.buf[:0] 90 b.pos = 0 91 if b.allowEOF && err == io.EOF { 92 b.eof = true 93 return false 94 } 95 b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err) 96 return false 97 } 98 b.offset += int64(n) 99 b.buf = b.buf[:n] 100 b.pos = 0 101 return true 102} 103 104func (b *buffer) seekForward(offset int64) { 105 for b.offset < offset { 106 if !b.reload() { 107 return 108 } 109 } 110 b.pos = len(b.buf) - int(b.offset-offset) 111} 112 113func (b *buffer) readOffset() int64 { 114 return b.offset - int64(len(b.buf)) + int64(b.pos) 115} 116 117func (b *buffer) unreadByte() { 118 if b.pos > 0 { 119 b.pos-- 120 } 121} 122 123func (b *buffer) unreadToken(t token) { 124 b.unread = append(b.unread, t) 125} 126 127func (b *buffer) readToken() token { 128 if n := len(b.unread); n > 0 { 129 t := b.unread[n-1] 130 b.unread = b.unread[:n-1] 131 return t 132 } 133 134 // Find first non-space, non-comment byte. 
135 c := b.readByte() 136 for { 137 if isSpace(c) { 138 if b.eof { 139 return io.EOF 140 } 141 c = b.readByte() 142 } else if c == '%' { 143 for c != '\r' && c != '\n' { 144 c = b.readByte() 145 } 146 } else { 147 break 148 } 149 } 150 151 switch c { 152 case '<': 153 if b.readByte() == '<' { 154 return keyword("<<") 155 } 156 b.unreadByte() 157 return b.readHexString() 158 159 case '(': 160 return b.readLiteralString() 161 162 case '[', ']', '{', '}': 163 return keyword(string(c)) 164 165 case '/': 166 return b.readName() 167 168 case '>': 169 if b.readByte() == '>' { 170 return keyword(">>") 171 } 172 b.unreadByte() 173 fallthrough 174 175 default: 176 if isDelim(c) { 177 b.errorf("unexpected delimiter %#q", rune(c)) 178 return nil 179 } 180 b.unreadByte() 181 return b.readKeyword() 182 } 183} 184 185func (b *buffer) readHexString() token { 186 tmp := b.tmp[:0] 187 for { 188 Loop: 189 c := b.readByte() 190 if c == '>' { 191 break 192 } 193 if isSpace(c) { 194 goto Loop 195 } 196 Loop2: 197 c2 := b.readByte() 198 if isSpace(c2) { 199 goto Loop2 200 } 201 x := unhex(c)<<4 | unhex(c2) 202 if x < 0 { 203 b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]) 204 break 205 } 206 tmp = append(tmp, byte(x)) 207 } 208 b.tmp = tmp 209 return string(tmp) 210} 211 212func unhex(b byte) int { 213 switch { 214 case '0' <= b && b <= '9': 215 return int(b) - '0' 216 case 'a' <= b && b <= 'f': 217 return int(b) - 'a' + 10 218 case 'A' <= b && b <= 'F': 219 return int(b) - 'A' + 10 220 } 221 return -1 222} 223 224func (b *buffer) readLiteralString() token { 225 tmp := b.tmp[:0] 226 depth := 1 227Loop: 228 for { 229 c := b.readByte() 230 switch c { 231 default: 232 tmp = append(tmp, c) 233 case '(': 234 depth++ 235 tmp = append(tmp, c) 236 case ')': 237 if depth--; depth == 0 { 238 break Loop 239 } 240 tmp = append(tmp, c) 241 case '\\': 242 switch c = b.readByte(); c { 243 default: 244 b.errorf("invalid escape sequence \\%c", c) 245 tmp = append(tmp, '\\', c) 246 case 
'n': 247 tmp = append(tmp, '\n') 248 case 'r': 249 tmp = append(tmp, '\r') 250 case 'b': 251 tmp = append(tmp, '\b') 252 case 't': 253 tmp = append(tmp, '\t') 254 case 'f': 255 tmp = append(tmp, '\f') 256 case '(', ')', '\\': 257 tmp = append(tmp, c) 258 case '\r': 259 if b.readByte() != '\n' { 260 b.unreadByte() 261 } 262 fallthrough 263 case '\n': 264 // no append 265 case '0', '1', '2', '3', '4', '5', '6', '7': 266 x := int(c - '0') 267 for i := 0; i < 2; i++ { 268 c = b.readByte() 269 if c < '0' || c > '7' { 270 b.unreadByte() 271 break 272 } 273 x = x*8 + int(c-'0') 274 } 275 if x > 255 { 276 b.errorf("invalid octal escape \\%03o", x) 277 } 278 tmp = append(tmp, byte(x)) 279 } 280 } 281 } 282 b.tmp = tmp 283 return string(tmp) 284} 285 286func (b *buffer) readName() token { 287 tmp := b.tmp[:0] 288 for { 289 c := b.readByte() 290 if isDelim(c) || isSpace(c) { 291 b.unreadByte() 292 break 293 } 294 if c == '#' { 295 x := unhex(b.readByte())<<4 | unhex(b.readByte()) 296 if x < 0 { 297 b.errorf("malformed name") 298 } 299 tmp = append(tmp, byte(x)) 300 continue 301 } 302 tmp = append(tmp, c) 303 } 304 b.tmp = tmp 305 return name(string(tmp)) 306} 307 308func (b *buffer) readKeyword() token { 309 tmp := b.tmp[:0] 310 for { 311 c := b.readByte() 312 if isDelim(c) || isSpace(c) { 313 b.unreadByte() 314 break 315 } 316 tmp = append(tmp, c) 317 } 318 b.tmp = tmp 319 s := string(tmp) 320 switch { 321 case s == "true": 322 return true 323 case s == "false": 324 return false 325 case isInteger(s): 326 x, err := strconv.ParseInt(s, 10, 64) 327 if err != nil { 328 b.errorf("invalid integer %s", s) 329 } 330 return x 331 case isReal(s): 332 x, err := strconv.ParseFloat(s, 64) 333 if err != nil { 334 b.errorf("invalid real %s", s) 335 } 336 return x 337 } 338 return keyword(string(tmp)) 339} 340 341func isInteger(s string) bool { 342 if len(s) > 0 && (s[0] == '+' || s[0] == '-') { 343 s = s[1:] 344 } 345 if len(s) == 0 { 346 return false 347 } 348 for _, c := range s { 349 
if c < '0' || '9' < c { 350 return false 351 } 352 } 353 return true 354} 355 356func isReal(s string) bool { 357 if len(s) > 0 && (s[0] == '+' || s[0] == '-') { 358 s = s[1:] 359 } 360 if len(s) == 0 { 361 return false 362 } 363 ndot := 0 364 for _, c := range s { 365 if c == '.' { 366 ndot++ 367 continue 368 } 369 if c < '0' || '9' < c { 370 return false 371 } 372 } 373 return ndot == 1 374} 375 376// An object is a PDF syntax object, one of the following Go types: 377// 378// bool, a PDF boolean 379// int64, a PDF integer 380// float64, a PDF real 381// string, a PDF string literal 382// name, a PDF name without the leading slash 383// dict, a PDF dictionary 384// array, a PDF array 385// stream, a PDF stream 386// objptr, a PDF object reference 387// objdef, a PDF object definition 388// 389// An object may also be nil, to represent the PDF null. 390type object interface{} 391 392type dict map[name]object 393 394type array []object 395 396type stream struct { 397 hdr dict 398 ptr objptr 399 offset int64 400} 401 402type objptr struct { 403 id uint32 404 gen uint16 405} 406 407type objdef struct { 408 ptr objptr 409 obj object 410} 411 412func (b *buffer) readObject() object { 413 tok := b.readToken() 414 if kw, ok := tok.(keyword); ok { 415 switch kw { 416 case "null": 417 return nil 418 case "<<": 419 return b.readDict() 420 case "[": 421 return b.readArray() 422 } 423 b.errorf("unexpected keyword %q parsing object", kw) 424 return nil 425 } 426 427 if str, ok := tok.(string); ok && b.key != nil && b.objptr.id != 0 { 428 tok = decryptString(b.key, b.useAES, b.objptr, str) 429 } 430 431 if !b.allowObjptr { 432 return tok 433 } 434 435 if t1, ok := tok.(int64); ok && int64(uint32(t1)) == t1 { 436 tok2 := b.readToken() 437 if t2, ok := tok2.(int64); ok && int64(uint16(t2)) == t2 { 438 tok3 := b.readToken() 439 switch tok3 { 440 case keyword("R"): 441 return objptr{uint32(t1), uint16(t2)} 442 case keyword("obj"): 443 old := b.objptr 444 b.objptr = 
objptr{uint32(t1), uint16(t2)} 445 obj := b.readObject() 446 if _, ok := obj.(stream); !ok { 447 tok4 := b.readToken() 448 if tok4 != keyword("endobj") { 449 b.errorf("missing endobj after indirect object definition") 450 b.unreadToken(tok4) 451 } 452 } 453 b.objptr = old 454 return objdef{objptr{uint32(t1), uint16(t2)}, obj} 455 } 456 b.unreadToken(tok3) 457 } 458 b.unreadToken(tok2) 459 } 460 return tok 461} 462 463func (b *buffer) readArray() object { 464 var x array 465 for { 466 tok := b.readToken() 467 if tok == nil || tok == keyword("]") { 468 break 469 } 470 b.unreadToken(tok) 471 x = append(x, b.readObject()) 472 } 473 return x 474} 475 476func (b *buffer) readDict() object { 477 x := make(dict) 478 for { 479 tok := b.readToken() 480 if tok == nil || tok == keyword(">>") { 481 break 482 } 483 n, ok := tok.(name) 484 if !ok { 485 b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok) 486 continue 487 } 488 x[n] = b.readObject() 489 } 490 491 if !b.allowStream { 492 return x 493 } 494 495 tok := b.readToken() 496 if tok != keyword("stream") { 497 b.unreadToken(tok) 498 return x 499 } 500 501 switch b.readByte() { 502 case '\r': 503 if b.readByte() != '\n' { 504 b.unreadByte() 505 } 506 case '\n': 507 // ok 508 default: 509 b.errorf("stream keyword not followed by newline") 510 } 511 512 return stream{x, b.objptr, b.readOffset()} 513} 514 515func isSpace(b byte) bool { 516 switch b { 517 case '\x00', '\t', '\n', '\f', '\r', ' ': 518 return true 519 } 520 return false 521} 522 523func isDelim(b byte) bool { 524 switch b { 525 case '<', '>', '(', ')', '[', ']', '{', '}', '/', '%': 526 return true 527 } 528 return false 529} 530