1#!/usr/bin/env python3
2
3# -*- coding: utf-8 -*-
4
5import re
6import logging
7
8
9from . import settings
10from .utils import choplist
11
# Module-level logger shared by the tokenizer and the stack parser below.
log = logging.getLogger(__name__)
13
14
class PSException(Exception):
    """Root of the exception hierarchy used by the PostScript parser."""
17
18
class PSEOF(PSException):
    """Signals that the input ran out while more data was expected.

    Raised by ``PSBaseParser.fillbuf`` when a read returns no bytes.
    """
21
22
class PSSyntaxError(PSException):
    """Signals structurally invalid input.

    ``PSStackParser.nextobject`` raises it for a dictionary that holds an
    odd number of items.
    """
25
26
class PSTypeError(PSException):
    """Signals an object of an unexpected kind.

    Raised by ``literal_name``/``keyword_name`` in strict mode and by
    ``PSStackParser.end_type`` when a closing delimiter does not match the
    composite object currently being built.
    """
29
30
class PSValueError(PSException):
    """Signals an invalid value.

    Nothing in this module raises it; it is presumably used by other
    modules of the package.
    """
33
34
class PSObject:
    """Common ancestor of every PS- or PDF-related data type."""
39
40
class PSLiteral(PSObject):
    """A PostScript literal (name object).

    Literals serve as identifiers: variable names, property names and
    dictionary keys. They are case sensitive and are written with a
    leading slash in the input (e.g. "/Name").

    Note: never instantiate PSLiteral directly; obtain instances through
    PSLiteralTable.intern() so that equal names share one object.
    """

    def __init__(self, name):
        # ``name`` is stored as given (the tokenizer passes str or bytes).
        self.name = name

    def __repr__(self):
        return '/%r' % self.name
60
61
class PSKeyword(PSObject):
    """A PostScript keyword.

    Keywords are the bare words of the language: they express commands
    and directives and also mark content boundaries.

    Note: never instantiate PSKeyword directly; obtain instances through
    PSKeywordTable.intern() so that equal names share one object.
    """

    def __init__(self, name):
        # ``name`` is stored as given (the tokenizer passes bytes).
        self.name = name

    def __repr__(self):
        return '/%r' % self.name
81
82
class PSSymbolTable:
    """Intern table for PSLiteral/PSKeyword objects.

    Each distinct name maps to exactly one object, so interned objects can
    be compared for identity with the "is" operator.
    """

    def __init__(self, klass):
        # klass: callable invoked as klass(name) to build a missing entry.
        self.klass = klass
        # dict: name -> the single object created for that name.
        self.dict = {}

    def intern(self, name):
        """Return the object registered for *name*, creating it on first use."""
        table = self.dict
        if name not in table:
            table[name] = self.klass(name)
        return table[name]
101
102
# Module-wide intern tables: one for literals, one for keywords.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)

# Short aliases for interning a literal / keyword by name.
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern

# Pre-interned delimiter keywords that the stack parser compares against.
KEYWORD_PROC_BEGIN, KEYWORD_PROC_END = KWD(b'{'), KWD(b'}')
KEYWORD_ARRAY_BEGIN, KEYWORD_ARRAY_END = KWD(b'['), KWD(b']')
KEYWORD_DICT_BEGIN, KEYWORD_DICT_END = KWD(b'<<'), KWD(b'>>')
113
114
def literal_name(x):
    """Return the name carried by the literal *x*.

    :param x: a PSLiteral; any other object is tolerated unless
        ``settings.STRICT`` is set.
    :return: for a PSLiteral, its name decoded as UTF-8 when it is
        bytes-like and decodable, otherwise the name unchanged. For a
        non-literal in non-strict mode, *x* itself.
    :raises PSTypeError: if *x* is not a PSLiteral and ``settings.STRICT``
        is true.
    """
    if not isinstance(x, PSLiteral):
        if settings.STRICT:
            raise PSTypeError('Literal required: {!r}'.format(x))
        return x
    name = x.name
    try:
        return str(name, 'utf-8')
    except (TypeError, UnicodeDecodeError):
        # TypeError: the name is already a str (or is not bytes-like).
        # UnicodeDecodeError: the bytes are not valid UTF-8.
        # Either way the name is handed back untouched.
        return name
128
129
def keyword_name(x):
    """Return the name carried by the keyword *x* as a str.

    :param x: a PSKeyword; any other object is tolerated unless
        ``settings.STRICT`` is set.
    :return: for a PSKeyword, its name decoded as UTF-8 with undecodable
        bytes dropped. For a non-keyword in non-strict mode, *x* itself.
    :raises PSTypeError: if *x* is not a PSKeyword and ``settings.STRICT``
        is true.
    """
    if not isinstance(x, PSKeyword):
        if settings.STRICT:
            # str.format is used instead of '%r' % x so that a tuple operand
            # cannot be mistaken for a tuple of format arguments.
            raise PSTypeError('Keyword required: {!r}'.format(x))
        return x
    return str(x.name, 'utf-8', 'ignore')
139
140
# --- Byte-level patterns used by the tokenizer ---------------------------

# Line terminators and whitespace classes.
EOL = re.compile(br'[\r\n]')
SPC = re.compile(br'\s')
NONSPC = re.compile(br'\S')

# A single hexadecimal digit.
HEX = re.compile(br'[0-9a-fA-F]')

# First byte that ends a literal name (a '#' introduces a hex escape).
END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')

# First byte that is neither whitespace nor a hex digit.
END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')

# Two hex digits, or failing that any single byte.
HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')

# First non-digit byte.
END_NUMBER = re.compile(br'[^0-9]')

# First byte that ends a keyword (same set as END_LITERAL).
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')

# Parentheses and backslash (\134 octal): the bytes that need special
# handling inside a parenthesised string.
END_STRING = re.compile(br'[()\134]')

# A single octal digit.
OCT_STRING = re.compile(br'[0-7]')

# Byte following a backslash -> code of the character it stands for.
ESC_STRING = {
    b'b': 8,
    b't': 9,
    b'n': 10,
    b'f': 12,
    b'r': 13,
    b'(': 40,
    b')': 41,
    b'\\': 92,
}
162
163
class PSBaseParser:
    """Most basic PostScript parser that performs only tokenization.

    The input is read from ``fp`` in chunks of ``BUFSIZ`` bytes. Tokenizing
    is a small state machine: ``self._parse1`` always refers to the handler
    for the current state. Each handler takes the current buffer ``s`` and
    an offset ``i``, consumes what it can, optionally switches state or
    emits a token, and returns the offset at which scanning resumes.
    """

    BUFSIZ = 4096

    def __init__(self, fp):
        """Attach to the file object *fp* and position the parser at 0."""
        self.fp = fp
        self.seek(0)

    def __repr__(self):
        return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp,
                                        self.bufpos)

    def flush(self):
        """Hook with no effect here; subclasses may override it."""
        return

    def close(self):
        """Flush the parser. The underlying file object is not closed."""
        self.flush()

    def tell(self):
        """Return the absolute offset of the next unread byte."""
        return self.bufpos+self.charpos

    def poll(self, pos=None, n=80):
        """Log (at INFO level) *n* bytes found at *pos* without moving ``fp``.

        :param pos: absolute offset to peek at; ``None`` means the current
            parser position. An explicit ``0`` is honoured.
        :param n: number of bytes to read for the log message.
        """
        pos0 = self.fp.tell()
        if pos is None:
            pos = self.bufpos+self.charpos
        self.fp.seek(pos)
        log.info('poll(%d): %r', pos, self.fp.read(n))
        # Restore the file position so the peek stays invisible to callers.
        self.fp.seek(pos0)

    def seek(self, pos):
        """Seeks the parser to the given position.

        All buffered data, the tokenizer state and any pending tokens are
        discarded.
        """
        log.debug('seek: %r', pos)
        self.fp.seek(pos)
        # reset the status for nextline()
        self.bufpos = pos
        self.buf = b''
        self.charpos = 0
        # reset the status for nexttoken()
        self._parse1 = self._parse_main
        self._curtoken = b''
        self._curtokenpos = 0
        self._tokens = []

    def fillbuf(self):
        """Load the next chunk once the current buffer is fully consumed.

        :raises PSEOF: if the read returns no data.
        """
        if self.charpos < len(self.buf):
            return
        # fetch next chunk.
        self.bufpos = self.fp.tell()
        self.buf = self.fp.read(self.BUFSIZ)
        if not self.buf:
            raise PSEOF('Unexpected EOF')
        self.charpos = 0

    def nextline(self):
        """Fetches a next line that ends either with \\r or \\n.

        A "\\r\\n" pair is treated as one terminator.

        :return: ``(linepos, linebuf)`` - the absolute offset where the line
            starts and its bytes including the terminator.
        :raises PSEOF: propagated from fillbuf() when the input runs out
            before the line is complete.
        """
        linebuf = b''
        linepos = self.bufpos + self.charpos
        eol = False
        while True:
            self.fillbuf()
            if eol:
                # The previous byte was '\r'; swallow a directly following
                # '\n' (it may sit in the next chunk) so that b'\r\n'
                # counts as a single terminator.
                c = self.buf[self.charpos:self.charpos+1]
                if c == b'\n':
                    linebuf += c
                    self.charpos += 1
                break
            m = EOL.search(self.buf, self.charpos)
            if m:
                linebuf += self.buf[self.charpos:m.end(0)]
                self.charpos = m.end(0)
                if linebuf[-1:] == b'\r':
                    eol = True
                else:
                    break
            else:
                # No terminator in this chunk: take it all and read on.
                linebuf += self.buf[self.charpos:]
                self.charpos = len(self.buf)
        log.debug('nextline: %r, %r', linepos, linebuf)

        return (linepos, linebuf)

    def revreadlines(self):
        """Fetches lines backward, starting from the end of the file.

        This is used to locate the trailers at the end of a file.

        Each yielded chunk begins with the \\r or \\n byte that precedes
        it in the file. Bytes located before the first line terminator of
        the file are accumulated but never yielded.
        """
        self.fp.seek(0, 2)
        pos = self.fp.tell()
        buf = b''
        while 0 < pos:
            prevpos = pos
            pos = max(0, pos-self.BUFSIZ)
            self.fp.seek(pos)
            s = self.fp.read(prevpos-pos)
            if not s:
                break
            while True:
                n = max(s.rfind(b'\r'), s.rfind(b'\n'))
                if n == -1:
                    # No terminator left in this chunk: keep the remainder
                    # and prepend the next (earlier) chunk to it.
                    buf = s + buf
                    break
                yield s[n:] + buf
                s = s[:n]
                buf = b''

    def _parse_main(self, s, i):
        """Skip whitespace and dispatch on the first byte of the next token."""
        m = NONSPC.search(s, i)
        if not m:
            return len(s)
        j = m.start(0)
        c = s[j:j+1]
        self._curtokenpos = self.bufpos+j
        if c == b'%':
            self._curtoken = b'%'
            self._parse1 = self._parse_comment
            return j+1
        elif c == b'/':
            self._curtoken = b''
            self._parse1 = self._parse_literal
            return j+1
        elif c in b'-+' or c.isdigit():
            self._curtoken = c
            self._parse1 = self._parse_number
            return j+1
        elif c == b'.':
            self._curtoken = c
            self._parse1 = self._parse_float
            return j+1
        elif c.isalpha():
            self._curtoken = c
            self._parse1 = self._parse_keyword
            return j+1
        elif c == b'(':
            self._curtoken = b''
            # Nesting depth of unescaped parentheses inside the string.
            self.paren = 1
            self._parse1 = self._parse_string
            return j+1
        elif c == b'<':
            self._curtoken = b''
            self._parse1 = self._parse_wopen
            return j+1
        elif c == b'>':
            self._curtoken = b''
            self._parse1 = self._parse_wclose
            return j+1
        else:
            # Any other single byte becomes a keyword of its own.
            self._add_token(KWD(c))
            return j+1

    def _add_token(self, obj):
        """Queue *obj* together with the offset where its token started."""
        self._tokens.append((self._curtokenpos, obj))

    def _parse_comment(self, s, i):
        """Consume a comment up to (not including) the line terminator."""
        m = EOL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        self._parse1 = self._parse_main
        # We ignore comments: the collected text is not emitted as a token.
        return j

    def _parse_literal(self, s, i):
        """Consume a literal name and emit it as an interned PSLiteral."""
        m = END_LITERAL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j:j+1]
        if c == b'#':
            # '#' introduces a hex-escaped byte inside the name.
            self.hex = b''
            self._parse1 = self._parse_literal_hex
            return j+1
        try:
            self._curtoken = str(self._curtoken, 'utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: keep the raw bytes as the literal's name.
            pass
        self._add_token(LIT(self._curtoken))
        self._parse1 = self._parse_main
        return j

    def _parse_literal_hex(self, s, i):
        """Collect up to two hex digits after '#' and append the byte."""
        c = s[i:i+1]
        if HEX.match(c) and len(self.hex) < 2:
            self.hex += c
            return i+1
        if self.hex:
            self._curtoken += bytes((int(self.hex, 16),))
        self._parse1 = self._parse_literal
        return i

    def _parse_number(self, s, i):
        """Consume digits; emit an int, or switch to float parsing on '.'."""
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j:j+1]
        if c == b'.':
            self._curtoken += c
            self._parse1 = self._parse_float
            return j+1
        try:
            self._add_token(int(self._curtoken))
        except ValueError:
            # e.g. a lone sign character: no token is produced.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_float(self, s, i):
        """Consume the fractional digits and emit a float."""
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        try:
            self._add_token(float(self._curtoken))
        except ValueError:
            # e.g. a lone '.': no token is produced.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_keyword(self, s, i):
        """Consume a keyword; ``true``/``false`` become Python booleans."""
        m = END_KEYWORD.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        if self._curtoken == b'true':
            token = True
        elif self._curtoken == b'false':
            token = False
        else:
            token = KWD(self._curtoken)
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def _parse_string(self, s, i):
        """Consume a parenthesised string and emit its content as bytes."""
        m = END_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j:j+1]
        if c == b'\\':
            self.oct = b''
            self._parse1 = self._parse_string_1
            return j+1
        if c == b'(':
            self.paren += 1
            self._curtoken += c
            return j+1
        if c == b')':
            self.paren -= 1
            if self.paren:
                # A balanced inner parenthesis is part of the string.
                self._curtoken += c
                return j+1
        self._add_token(self._curtoken)
        self._parse1 = self._parse_main
        return j+1

    def _parse_string_1(self, s, i):
        """Handle the byte(s) that follow a backslash inside a string.

        Up to three octal digits form one byte. A known escape letter is
        replaced by the byte it stands for. Any other byte is dropped,
        which is also how a backslash-newline line continuation vanishes.
        """
        c = s[i:i+1]
        if OCT_STRING.match(c) and len(self.oct) < 3:
            self.oct += c
            return i+1
        if self.oct:
            # Three octal digits can encode up to 511; the high-order
            # overflow is ignored instead of letting bytes() raise.
            self._curtoken += bytes((int(self.oct, 8) & 0xFF,))
            self._parse1 = self._parse_string
            return i
        if c in ESC_STRING:
            self._curtoken += bytes((ESC_STRING[c],))
        elif c == b'\r' and s[i+1:i+2] == b'\n':
            # Backslash followed by CRLF: skip the LF as well so that the
            # whole line continuation disappears. (A CRLF split across two
            # buffer chunks is not detected here.)
            i += 1
        self._parse1 = self._parse_string
        return i+1

    def _parse_wopen(self, s, i):
        """After '<': emit '<<' as dict-begin, else start a hex string."""
        c = s[i:i+1]
        if c == b'<':
            self._add_token(KEYWORD_DICT_BEGIN)
            self._parse1 = self._parse_main
            i += 1
        else:
            self._parse1 = self._parse_hexstring
        return i

    def _parse_wclose(self, s, i):
        """After '>': emit '>>' as dict-end; a single '>' yields no token."""
        c = s[i:i+1]
        if c == b'>':
            self._add_token(KEYWORD_DICT_END)
            i += 1
        self._parse1 = self._parse_main
        return i

    def _parse_hexstring(self, s, i):
        """Consume hex digits/whitespace and emit the decoded bytes.

        The terminating byte is left in the buffer for _parse_main().
        """
        m = END_HEX_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        # Strip whitespace, then turn each pair of digits (or a trailing
        # single digit) into one byte.
        token = HEX_PAIR.sub(lambda m: bytes((int(m.group(0), 16),)),
                             SPC.sub(b'', self._curtoken))
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def nexttoken(self):
        """Return the next ``(position, token)`` pair.

        :raises PSEOF: propagated from fillbuf() when the input runs out.
        """
        while not self._tokens:
            self.fillbuf()
            self.charpos = self._parse1(self.buf, self.charpos)
        token = self._tokens.pop(0)
        log.debug('nexttoken: %r', token)
        return token
498
499
class PSStackParser(PSBaseParser):
    """Tokenizer plus an operand stack that assembles composite objects.

    Tokens from nexttoken() are pushed on ``curstack`` as ``(pos, obj)``
    pairs. Array, dictionary and procedure delimiters open and close
    nested stacks, whose enclosing state is saved on ``context``. Finished
    objects are handed out from ``results``, which is filled only through
    add_results(); nothing in this class calls it, so subclasses are
    presumably expected to do so from flush() or do_keyword().
    """

    def __init__(self, fp):
        PSBaseParser.__init__(self, fp)
        self.reset()

    def reset(self):
        """Drop every partially built object and all pending results."""
        # Saved (pos, curtype, curstack) of the enclosing composites.
        self.context = []
        # Tag of the composite being built: 'a', 'd', 'p' or None.
        self.curtype = None
        # Items collected for the composite being built.
        self.curstack = []
        # Completed objects waiting to be returned by nextobject().
        self.results = []

    def seek(self, pos):
        """Seek the tokenizer to *pos* and reset the stack state."""
        PSBaseParser.seek(self, pos)
        self.reset()

    def push(self, *objs):
        """Append *objs* to the current stack."""
        self.curstack.extend(objs)

    def pop(self, n):
        """Remove and return the last *n* items of the current stack.

        Fewer items are returned if the stack is shorter than *n*; a
        non-positive *n* removes nothing and returns an empty list.
        """
        if n <= 0:
            # Without this guard the slice [-0:] would select the WHOLE
            # stack rather than nothing.
            return []
        objs = self.curstack[-n:]
        del self.curstack[-n:]
        return objs

    def popall(self):
        """Remove and return every item of the current stack."""
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs):
        """Queue *objs* to be returned by nextobject()."""
        try:
            log.debug('add_results: %r', objs)
        except Exception:
            log.debug('add_results: (unprintable object)')
        self.results.extend(objs)

    def start_type(self, pos, type):
        """Begin a composite object tagged *type* that starts at *pos*.

        The current type and stack are saved on ``context`` and a fresh,
        empty stack becomes current.
        """
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])
        log.debug('start_type: pos=%r, type=%r', pos, type)

    def end_type(self, type):
        """Finish the composite object tagged *type*.

        :return: ``(pos, objs)`` - the start position recorded by
            start_type() and the collected objects without their positions.
        :raises PSTypeError: if the composite being built has another type.
        """
        if self.curtype != type:
            raise PSTypeError('Type mismatch: {!r} != {!r}'
                              .format(self.curtype, type))
        objs = [obj for (_, obj) in self.curstack]
        (pos, self.curtype, self.curstack) = self.context.pop()
        log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
        return (pos, objs)

    def do_keyword(self, pos, token):
        """Hook called for every keyword that is not a delimiter.

        It does nothing here; subclasses may override it.
        """
        return

    def nextobject(self):
        """Return the next completed object.

        Tokens are consumed until ``results`` is non-empty. Arrays and
        procedures are represented as Python lists, dictionaries as Python
        dicts keyed by literal_name() of each key; entries whose value is
        None are left out.

        :return: the first entry of ``results``.
        :raises PSEOF: propagated from nexttoken() when the input runs out.
        :raises PSSyntaxError: if a dictionary holds an odd number of items.
        :raises PSTypeError: on a mismatched closing delimiter, but only
            when ``settings.STRICT`` is set; otherwise the delimiter is
            ignored.
        :raises PSException: if a token of an unknown kind is encountered.
        """
        while not self.results:
            (pos, token) = self.nexttoken()
            if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, 'a')
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type('a'))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, 'd')
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type('d')
                    if len(objs) % 2 != 0:
                        error_msg = 'Invalid dictionary construct: %r' % objs
                        raise PSSyntaxError(error_msg)
                    d = {literal_name(k): v
                         for (k, v) in choplist(2, objs) if v is not None}
                    self.push((pos, d))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_PROC_BEGIN:
                # begin proc
                self.start_type(pos, 'p')
            elif token == KEYWORD_PROC_END:
                # end proc
                try:
                    self.push(self.end_type('p'))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif isinstance(token, PSKeyword):
                log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos,
                          token, self.curstack)
                self.do_keyword(pos, token)
            else:
                log.error('unknown token: pos=%r, token=%r, stack=%r', pos,
                          token, self.curstack)
                self.do_keyword(pos, token)
                # A bare ``raise`` has no active exception to re-raise here,
                # so an explicit parser error is raised instead.
                raise PSException('Unknown token: pos={!r}, token={!r}'
                                  .format(pos, token))
            if not self.context:
                # Outside any composite object: give flush() a chance to
                # deliver what has been collected so far.
                self.flush()
        obj = self.results.pop(0)
        try:
            log.debug('nextobject: %r', obj)
        except Exception:
            log.debug('nextobject: (unprintable object)')
        return obj
626