#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Tokenizer and stack-based object parser for PostScript/PDF syntax."""

import re
import logging


from . import settings
from .utils import choplist

log = logging.getLogger(__name__)


class PSException(Exception):
    """Base class of every exception defined in this module."""


class PSEOF(PSException):
    """Raised by PSBaseParser.fillbuf() when the input is exhausted."""


class PSSyntaxError(PSException):
    """Raised on malformed input, e.g. a dictionary with an odd item count."""


class PSTypeError(PSException):
    """Raised when an object or a closing delimiter has the wrong type."""


class PSValueError(PSException):
    """Value-related error (declared here; not raised within this module)."""


class PSObject:
    """Base class for all PS or PDF-related data types."""


class PSLiteral(PSObject):

    """A class that represents a PostScript literal.

    Postscript literals are used as identifiers, such as
    variable names, property names and dictionary keys.
    Literals are case sensitive and denoted by a preceding
    slash sign (e.g. "/Name")

    Note: Do not create an instance of PSLiteral directly.
    Always use PSLiteralTable.intern().
    """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return '/%r' % self.name


class PSKeyword(PSObject):

    """A class that represents a PostScript keyword.

    PostScript keywords are a dozen of predefined words.
    Commands and directives in PostScript are expressed by keywords.
    They are also used to denote the content boundaries.

    Note: Do not create an instance of PSKeyword directly.
    Always use PSKeywordTable.intern().
    """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return '/%r' % self.name


class PSSymbolTable:
    """A utility class for storing PSLiteral/PSKeyword objects.

    Interned objects can be compared by identity with the "is" operator,
    because intern() returns the same instance for the same name.
    """

    def __init__(self, klass):
        # Maps a name to the single ``klass`` instance created for it.
        self.dict = {}
        self.klass = klass

    def intern(self, name):
        """Return the unique ``klass`` instance for ``name``.

        The instance is created and cached on first use.
        """
        try:
            return self.dict[name]
        except KeyError:
            lit = self.dict[name] = self.klass(name)
            return lit


PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
KEYWORD_PROC_BEGIN = KWD(b'{')
KEYWORD_PROC_END = KWD(b'}')
KEYWORD_ARRAY_BEGIN = KWD(b'[')
KEYWORD_ARRAY_END = KWD(b']')
KEYWORD_DICT_BEGIN = KWD(b'<<')
KEYWORD_DICT_END = KWD(b'>>')


def literal_name(x):
    """Return the name of the literal ``x``.

    For a PSLiteral the name is decoded as UTF-8 when it is a decodable
    bytes object; otherwise the name is returned unchanged.

    :raises PSTypeError: if ``x`` is not a PSLiteral and settings.STRICT
        is true. When STRICT is false, ``x`` itself is returned.
    """
    if not isinstance(x, PSLiteral):
        if settings.STRICT:
            raise PSTypeError('Literal required: {!r}'.format(x))
        return x
    name = x.name
    try:
        name = str(name, 'utf-8')
    except (TypeError, UnicodeDecodeError):
        # Already a str (TypeError) or not valid UTF-8: keep it as it is.
        pass
    return name


def keyword_name(x):
    """Return the name of the keyword ``x`` decoded as UTF-8.

    Undecodable bytes are dropped ('ignore' error handler).

    :raises PSTypeError: if ``x`` is not a PSKeyword and settings.STRICT
        is true. When STRICT is false, ``x`` itself is returned.
    """
    if not isinstance(x, PSKeyword):
        if settings.STRICT:
            raise PSTypeError('Keyword required: %r' % x)
        return x
    return str(x.name, 'utf-8', 'ignore')


EOL = re.compile(br'[\r\n]')
SPC = re.compile(br'\s')
NONSPC = re.compile(br'\S')
HEX = re.compile(br'[0-9a-fA-F]')
END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')
END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')
HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')
END_NUMBER = re.compile(br'[^0-9]')
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(br'[()\134]')
OCT_STRING = re.compile(br'[0-7]')
# Single-character escapes inside a literal string -> resulting byte value.
ESC_STRING = {
    b'b': 8,
    b't': 9,
    b'n': 10,
    b'f': 12,
    b'r': 13,
    b'(': 40,
    b')': 41,
    b'\\': 92
}


class PSBaseParser:

    """Most basic PostScript parser that performs only tokenization.

    The tokenizer is a state machine: ``self._parse1`` holds the handler
    for the current state. Each handler takes ``(s, i)`` (the buffer and
    the offset to continue from), may switch ``self._parse1`` and/or emit
    tokens, and returns the offset at which parsing should resume.
    """
    BUFSIZ = 4096

    def __init__(self, fp):
        self.fp = fp
        self.seek(0)

    def __repr__(self):
        return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp,
                                        self.bufpos)

    def flush(self):
        """Hook called by PSStackParser.nextobject(); does nothing here."""
        return

    def close(self):
        self.flush()

    def tell(self):
        """Return the current position (buffer start + offset in buffer)."""
        return self.bufpos+self.charpos

    def poll(self, pos=None, n=80):
        """Log (at INFO level) ``n`` bytes of the input starting at ``pos``.

        The position of the underlying file is restored afterwards.

        :param pos: offset to peek at; defaults to the parser position.
        :param n: number of bytes to read.
        """
        pos0 = self.fp.tell()
        # Compare against None so that an explicit pos=0 is honoured.
        if pos is None:
            pos = self.bufpos+self.charpos
        self.fp.seek(pos)
        log.info('poll(%d): %r', pos, self.fp.read(n))
        self.fp.seek(pos0)

    def seek(self, pos):
        """Seeks the parser to the given position.
        """
        log.debug('seek: %r', pos)
        self.fp.seek(pos)
        # reset the status for nextline()
        self.bufpos = pos
        self.buf = b''
        self.charpos = 0
        # reset the status for nexttoken()
        self._parse1 = self._parse_main
        self._curtoken = b''
        self._curtokenpos = 0
        self._tokens = []

    def fillbuf(self):
        """Read the next BUFSIZ chunk once the current buffer is consumed.

        :raises PSEOF: if the underlying file returns no more data.
        """
        if self.charpos < len(self.buf):
            return
        # fetch next chunk.
        self.bufpos = self.fp.tell()
        self.buf = self.fp.read(self.BUFSIZ)
        if not self.buf:
            raise PSEOF('Unexpected EOF')
        self.charpos = 0

    def nextline(self):
        """Fetches a next line that ends either with \\r or \\n.

        :return: a ``(position, line)`` tuple; the line includes its
            end-of-line characters.
        :raises PSEOF: if the input ends before the line is terminated.
        """
        linebuf = b''
        linepos = self.bufpos + self.charpos
        eol = False
        while True:
            self.fillbuf()
            if eol:
                c = self.buf[self.charpos:self.charpos+1]
                # handle b'\r\n'
                if c == b'\n':
                    linebuf += c
                    self.charpos += 1
                break
            m = EOL.search(self.buf, self.charpos)
            if m:
                linebuf += self.buf[self.charpos:m.end(0)]
                self.charpos = m.end(0)
                if linebuf[-1:] == b'\r':
                    # A b'\n' may follow; look at the next byte first.
                    eol = True
                else:
                    break
            else:
                linebuf += self.buf[self.charpos:]
                self.charpos = len(self.buf)
        log.debug('nextline: %r, %r', linepos, linebuf)

        return (linepos, linebuf)

    def revreadlines(self):
        """Fetches lines backward, starting from the end of the file.

        This is used to locate the trailers at the end of a file.
        Each yielded chunk starts with the end-of-line byte that
        precedes it in the file.
        """
        self.fp.seek(0, 2)
        pos = self.fp.tell()
        buf = b''
        while 0 < pos:
            prevpos = pos
            pos = max(0, pos-self.BUFSIZ)
            self.fp.seek(pos)
            s = self.fp.read(prevpos-pos)
            if not s:
                break
            while True:
                n = max(s.rfind(b'\r'), s.rfind(b'\n'))
                if n == -1:
                    # No line break left: carry over to the previous chunk.
                    buf = s + buf
                    break
                yield s[n:] + buf
                s = s[:n]
                buf = b''

    def _parse_main(self, s, i):
        """Skip whitespace and dispatch on the first byte of a token."""
        m = NONSPC.search(s, i)
        if not m:
            return len(s)
        j = m.start(0)
        c = s[j:j+1]
        self._curtokenpos = self.bufpos+j
        if c == b'%':
            self._curtoken = b'%'
            self._parse1 = self._parse_comment
            return j+1
        elif c == b'/':
            self._curtoken = b''
            self._parse1 = self._parse_literal
            return j+1
        elif c in b'-+' or c.isdigit():
            self._curtoken = c
            self._parse1 = self._parse_number
            return j+1
        elif c == b'.':
            self._curtoken = c
            self._parse1 = self._parse_float
            return j+1
        elif c.isalpha():
            self._curtoken = c
            self._parse1 = self._parse_keyword
            return j+1
        elif c == b'(':
            self._curtoken = b''
            # Nesting depth of parentheses inside the literal string.
            self.paren = 1
            self._parse1 = self._parse_string
            return j+1
        elif c == b'<':
            self._curtoken = b''
            self._parse1 = self._parse_wopen
            return j+1
        elif c == b'>':
            self._curtoken = b''
            self._parse1 = self._parse_wclose
            return j+1
        else:
            # Any other byte is a one-character keyword (e.g. b'[', b'{').
            self._add_token(KWD(c))
            return j+1

    def _add_token(self, obj):
        """Queue ``obj`` together with the position where it started."""
        self._tokens.append((self._curtokenpos, obj))

    def _parse_comment(self, s, i):
        """Consume a '%' comment up to (not including) the end of line."""
        m = EOL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        self._parse1 = self._parse_main
        # Comments are discarded: no token is emitted for them.
        return j

    def _parse_literal(self, s, i):
        """Accumulate a '/Name' literal and emit it as an interned literal."""
        m = END_LITERAL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j:j+1]
        if c == b'#':
            # '#xx' hex escape inside a name.
            self.hex = b''
            self._parse1 = self._parse_literal_hex
            return j+1
        try:
            self._curtoken = str(self._curtoken, 'utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: the literal is interned under its raw bytes.
            pass
        self._add_token(LIT(self._curtoken))
        self._parse1 = self._parse_main
        return j

    def _parse_literal_hex(self, s, i):
        """Collect up to two hex digits of a '#xx' escape in a literal."""
        c = s[i:i+1]
        if HEX.match(c) and len(self.hex) < 2:
            self.hex += c
            return i+1
        if self.hex:
            self._curtoken += bytes((int(self.hex, 16),))
        self._parse1 = self._parse_literal
        return i

    def _parse_number(self, s, i):
        """Accumulate the integer part of a number; emit an int token."""
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j:j+1]
        if c == b'.':
            self._curtoken += c
            self._parse1 = self._parse_float
            return j+1
        try:
            self._add_token(int(self._curtoken))
        except ValueError:
            # Not a valid integer (e.g. a lone sign): emit nothing.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_float(self, s, i):
        """Accumulate the fractional part of a number; emit a float token."""
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        try:
            self._add_token(float(self._curtoken))
        except ValueError:
            # Not a valid float (e.g. a lone '.'): emit nothing.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_keyword(self, s, i):
        """Accumulate a keyword; b'true'/b'false' become Python booleans."""
        m = END_KEYWORD.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        if self._curtoken == b'true':
            token = True
        elif self._curtoken == b'false':
            token = False
        else:
            token = KWD(self._curtoken)
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def _parse_string(self, s, i):
        """Accumulate a '(...)' literal string and emit it as bytes."""
        m = END_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j:j+1]
        if c == b'\\':
            self.oct = b''
            self._parse1 = self._parse_string_1
            return j+1
        if c == b'(':
            self.paren += 1
            self._curtoken += c
            return j+1
        if c == b')':
            self.paren -= 1
            if self.paren:
                # Still nested: a balanced inner ')' belongs to the string.
                self._curtoken += c
                return j+1
        self._add_token(self._curtoken)
        self._parse1 = self._parse_main
        return j+1

    def _parse_string_1(self, s, i):
        """Handle the character(s) following a backslash in a string."""
        c = s[i:i+1]
        if OCT_STRING.match(c) and len(self.oct) < 3:
            self.oct += c
            return i+1
        if self.oct:
            # Up to three octal digits can encode values above 255; the
            # high-order overflow is ignored so bytes() never fails.
            self._curtoken += bytes((int(self.oct, 8) & 0xFF,))
            self._parse1 = self._parse_string
            return i
        if c in ESC_STRING:
            self._curtoken += bytes((ESC_STRING[c],))
            self._parse1 = self._parse_string
            return i+1
        if c == b'\r':
            # Line continuation; a following b'\n' must be skipped too,
            # and it may only arrive with the next buffer.
            self._parse1 = self._parse_string_eol
            return i+1
        self._parse1 = self._parse_string
        if c == b'\n':
            # Line continuation: drop both the backslash and the newline.
            return i+1
        # Unknown escape: only the backslash is dropped; the character
        # itself is left for _parse_string to append.
        return i

    def _parse_string_eol(self, s, i):
        """Skip the b'\\n' of a backslash + CRLF line continuation."""
        self._parse1 = self._parse_string
        if s[i:i+1] == b'\n':
            return i+1
        return i

    def _parse_wopen(self, s, i):
        """After '<': either '<<' (dictionary begin) or a hex string."""
        c = s[i:i+1]
        if c == b'<':
            self._add_token(KEYWORD_DICT_BEGIN)
            self._parse1 = self._parse_main
            i += 1
        else:
            self._parse1 = self._parse_hexstring
        return i

    def _parse_wclose(self, s, i):
        """After '>': emit dictionary end on '>>'; a lone '>' is dropped."""
        c = s[i:i+1]
        if c == b'>':
            self._add_token(KEYWORD_DICT_END)
            i += 1
        self._parse1 = self._parse_main
        return i

    def _parse_hexstring(self, s, i):
        """Accumulate a '<...>' hex string and emit the decoded bytes."""
        m = END_HEX_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        # Strip whitespace, then turn every hex pair (or a trailing single
        # digit) into one byte.
        token = HEX_PAIR.sub(lambda m: bytes((int(m.group(0), 16),)),
                             SPC.sub(b'', self._curtoken))
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def nexttoken(self):
        """Return the next ``(position, token)`` pair.

        :raises PSEOF: if the input ends before a token is completed.
        """
        while not self._tokens:
            self.fillbuf()
            self.charpos = self._parse1(self.buf, self.charpos)
        token = self._tokens.pop(0)
        log.debug('nexttoken: %r', token)
        return token


class PSStackParser(PSBaseParser):
    """Parser that assembles tokens into arrays, dictionaries and procs.

    Objects are collected on ``curstack``; opening a compound object
    saves the current stack on ``context`` and starts a fresh one.
    """

    def __init__(self, fp):
        PSBaseParser.__init__(self, fp)
        self.reset()

    def reset(self):
        """Discard all pending objects, open compounds and results."""
        self.context = []
        self.curtype = None
        self.curstack = []
        self.results = []

    def seek(self, pos):
        PSBaseParser.seek(self, pos)
        self.reset()

    def push(self, *objs):
        """Append ``objs`` to the current stack."""
        self.curstack.extend(objs)

    def pop(self, n):
        """Remove and return the last ``n`` entries of the current stack.

        For ``n <= 0`` nothing is removed and an empty list is returned
        (a plain ``[-0:]`` slice would otherwise take the whole stack).
        """
        if n <= 0:
            return []
        objs = self.curstack[-n:]
        self.curstack[-n:] = []
        return objs

    def popall(self):
        """Remove and return every entry of the current stack."""
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs):
        """Queue ``objs`` to be returned by nextobject()."""
        try:
            log.debug('add_results: %r', objs)
        except Exception:
            log.debug('add_results: (unprintable object)')
        self.results.extend(objs)

    def start_type(self, pos, type):
        """Open a compound object of kind ``type`` starting at ``pos``."""
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])
        log.debug('start_type: pos=%r, type=%r', pos, type)

    def end_type(self, type):
        """Close the innermost compound object.

        :return: ``(pos, objs)``: the start position given to
            start_type() and the objects collected since then.
        :raises PSTypeError: if the open compound is not of kind ``type``.
        """
        if self.curtype != type:
            raise PSTypeError('Type mismatch: {!r} != {!r}'
                              .format(self.curtype, type))
        objs = [obj for (_, obj) in self.curstack]
        (pos, self.curtype, self.curstack) = self.context.pop()
        log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
        return (pos, objs)

    def do_keyword(self, pos, token):
        """Hook called for every keyword that is not a delimiter.

        Does nothing here; nextobject() only returns once ``results``
        has been filled (see add_results()).
        """
        return

    def nextobject(self):
        """Returns the next object queued in ``results``.

        Tokens are consumed until add_results() has queued something.
        Arrays and dictionaries are represented as Python lists and
        dictionaries.

        :return: keywords, literals, strings, numbers, arrays and
            dictionaries.
        :raises PSSyntaxError: on a dictionary with an odd number of
            items (raised regardless of settings.STRICT).
        :raises PSTypeError: on a mismatched closing delimiter when
            settings.STRICT is true.
        :raises PSException: on a token of an unknown type.
        """
        while not self.results:
            (pos, token) = self.nexttoken()
            if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, 'a')
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type('a'))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, 'd')
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type('d')
                    if len(objs) % 2 != 0:
                        error_msg = 'Invalid dictionary construct: %r' % objs
                        raise PSSyntaxError(error_msg)
                    # Entries whose value is None are left out.
                    d = {literal_name(k): v
                         for (k, v) in choplist(2, objs) if v is not None}
                    self.push((pos, d))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_PROC_BEGIN:
                # begin proc
                self.start_type(pos, 'p')
            elif token == KEYWORD_PROC_END:
                # end proc
                try:
                    self.push(self.end_type('p'))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif isinstance(token, PSKeyword):
                log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos,
                          token, self.curstack)
                self.do_keyword(pos, token)
            else:
                log.error('unknown token: pos=%r, token=%r, stack=%r', pos,
                          token, self.curstack)
                self.do_keyword(pos, token)
                # A bare ``raise`` has no active exception to re-raise
                # here, so raise an explicit parser error instead.
                raise PSException('Unknown token: %r' % (token,))
            if self.context:
                # Inside an open compound object: keep reading tokens.
                continue
            else:
                self.flush()
        obj = self.results.pop(0)
        try:
            log.debug('nextobject: %r', obj)
        except Exception:
            log.debug('nextobject: (unprintable object)')
        return obj