1import logging
2import struct
3import sys
4from io import BytesIO
5
6from . import settings
7from .cmapdb import CMap
8from .cmapdb import CMapDB
9from .cmapdb import CMapParser
10from .cmapdb import FileUnicodeMap
11from .encodingdb import EncodingDB
12from .encodingdb import name2unicode
13from .fontmetrics import FONT_METRICS
14from .pdftypes import PDFException
15from .pdftypes import PDFStream
16from .pdftypes import dict_value
17from .pdftypes import int_value
18from .pdftypes import list_value
19from .pdftypes import num_value
20from .pdftypes import resolve1, resolve_all
21from .pdftypes import stream_value
22from .psparser import KWD
23from .psparser import LIT
24from .psparser import PSEOF
25from .psparser import PSLiteral
26from .psparser import PSStackParser
27from .psparser import literal_name
28from .utils import apply_matrix_norm
29from .utils import choplist
30from .utils import isnumber
31from .utils import nunpack
32
33log = logging.getLogger(__name__)
34
35
def get_widths(seq):
    """Build a ``{character code: width}`` mapping from a flat width array.

    Two entry shapes are recognised while scanning ``seq``:

    * ``c [w1 w2 ...]`` -- the list assigns consecutive widths starting at
      the most recently seen number ``c``;
    * ``c_first c_last w`` -- three numbers assign ``w`` to every code in
      the inclusive range ``c_first..c_last``.

    A list that is not preceded by a pending number is skipped, and items
    that are neither lists nor numbers are ignored.

    :param seq: iterable of numbers and lists.
    :returns: dict mapping each code to its width.
    """
    table = {}
    pending = []
    for item in seq:
        if isinstance(item, list):
            if not pending:
                continue
            start = pending[-1]
            table.update((start + offset, width)
                         for (offset, width) in enumerate(item))
            pending = []
            continue
        if not isnumber(item):
            continue
        pending.append(item)
        if len(pending) < 3:
            continue
        (first, last, width) = pending
        table.update(dict.fromkeys(range(first, last + 1), width))
        pending = []
    return table
54
55
def get_widths2(seq):
    """Build a ``{code: (w, (vx, vy))}`` mapping from a flat metrics array.

    Two entry shapes are recognised while scanning ``seq``:

    * ``c [w vx vy  w vx vy ...]`` -- the list is split into groups of
      three by ``choplist`` and assigned to consecutive codes starting at
      the most recently seen number ``c``;
    * ``c_first c_last w vx vy`` -- five numbers assign the same triple to
      every code in the inclusive range ``c_first..c_last``.

    A list that is not preceded by a pending number is skipped, and items
    that are neither lists nor numbers are ignored.

    :param seq: iterable of numbers and lists.
    :returns: dict mapping each code to ``(w, (vx, vy))``.
    """
    table = {}
    pending = []
    for item in seq:
        if isinstance(item, list):
            if pending:
                start = pending[-1]
                for (offset, triple) in enumerate(choplist(3, item)):
                    (width, vx, vy) = triple
                    table[start + offset] = (width, (vx, vy))
                pending = []
        elif isnumber(item):
            pending.append(item)
            if len(pending) == 5:
                (first, last, width, vx, vy) = pending
                for code in range(first, last + 1):
                    table[code] = (width, (vx, vy))
                pending = []
    return table
74
75
class FontMetricsDB:
    """Lookup facade over the imported ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname):
        """Return the ``FONT_METRICS`` entry stored under ``fontname``.

        A failed lookup is not handled here; ``PDFType1Font`` relies on the
        resulting ``KeyError`` to fall back to the font dictionary.
        """
        metrics = FONT_METRICS[fontname]
        return metrics
81
82
class Type1FontHeaderParser(PSStackParser):
    """Stack parser that recovers the encoding from a Type1 font header.

    Every ``put`` keyword whose two operands are an integer and a literal
    name is recorded as a result; :meth:`get_encoding` turns those pairs
    into a character-code-to-unicode mapping.
    """

    KEYWORD_BEGIN = KWD(b'begin')
    KEYWORD_END = KWD(b'end')
    KEYWORD_DEF = KWD(b'def')
    KEYWORD_PUT = KWD(b'put')
    KEYWORD_DICT = KWD(b'dict')
    KEYWORD_ARRAY = KWD(b'array')
    KEYWORD_READONLY = KWD(b'readonly')
    KEYWORD_FOR = KWD(b'for')

    def __init__(self, data):
        """Create a parser over ``data`` with an empty encoding mapping."""
        PSStackParser.__init__(self, data)
        self._cid2unicode = {}

    def get_encoding(self):
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names.
        Each ``(code, name)`` pair produced by the parser is converted with
        ``name2unicode``; a name for which that conversion raises
        ``KeyError`` is logged at debug level and left out of the mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                (cid, name) = self.nextobject()
            except PSEOF:
                return mapping
            try:
                char = name2unicode(name)
            except KeyError as exc:
                log.debug(str(exc))
            else:
                mapping[cid] = char

    def do_keyword(self, pos, token):
        """Record ``<int> <literal> put`` as a ``(code, name)`` result.

        Any other keyword is ignored.
        """
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))
130
131
# Text fragment appended for each nibble value 0-14 when getdict() decodes a
# packed real number; index 13 has no fragment (None).
NIBBLES = tuple('0123456789') + ('.', 'e', 'e-', None, '-')
134
# Aliases for cmap names: a name found here is replaced by its mapped value,
# any other cmap name is kept as it is.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = dict([
    ('DLIdent-H', 'Identity-H'),
    ('DLIdent-V', 'Identity-V'),
])
141
142
def getdict(data):
    """Decode DICT data into a mapping of operator byte -> operand list.

    The byte string is scanned left to right. Operands are collected until
    a byte ``<= 21`` is met; that byte becomes the key and the operands
    collected so far become its value. Operands are decoded by their
    leading byte:

    * ``30``       -- packed real number, built from ``NIBBLES`` fragments
                      up to the terminating ``0xf`` nibble;
    * ``32..246``  -- one-byte integer ``b0 - 139``;
    * ``247..250`` -- two-byte positive integer;
    * ``251..254`` -- two-byte negative integer;
    * ``28``       -- signed 16-bit integer in the next two bytes;
    * any other    -- signed 32-bit integer in the next four bytes.

    Operands that are not followed by an operator are dropped.

    :param data: bytes of the DICT.
    :returns: dict mapping operator byte to its list of operands.
    """
    result = {}
    stream = BytesIO(data)
    operands = []
    while True:
        lead = stream.read(1)
        if not lead:
            break
        b0 = ord(lead)
        if b0 <= 21:
            # Operator: bind the pending operands to it and start afresh.
            result[b0] = operands
            operands = []
            continue
        if b0 == 30:
            text = ''
            terminated = False
            while not terminated:
                packed = ord(stream.read(1))
                # High nibble first, then low nibble.
                for nibble in divmod(packed, 16):
                    if nibble == 15:
                        terminated = True
                    else:
                        text += NIBBLES[nibble]
            operand = float(text)
        elif 32 <= b0 <= 246:
            operand = b0 - 139
        elif 247 <= b0 <= 250:
            b1 = ord(stream.read(1))
            operand = ((b0 - 247) << 8) + b1 + 108
        elif 251 <= b0 <= 254:
            b1 = ord(stream.read(1))
            operand = -((b0 - 251) << 8) - b1 - 108
        else:
            b1 = ord(stream.read(1))
            b2 = ord(stream.read(1))
            if b1 >= 128:
                # The first byte carries the sign of the integer.
                b1 -= 256
            if b0 == 28:
                operand = b1 << 8 | b2
            else:
                (low,) = struct.unpack('>H', stream.read(2))
                operand = b1 << 24 | b2 << 16 | low
        operands.append(operand)
    return result
186
187
class CFFFont:
    """Reader for a Compact Font Format (CFF) font program.

    Construction parses the header, the Name / Top DICT / String / Global
    Subr INDEXes, the CharStrings INDEX, the Encoding and the charset, and
    exposes the results as attributes:

    * ``name_index``, ``dict_index``, ``string_index``, ``subr_index``,
      ``charstring`` -- :class:`INDEX` views over the file;
    * ``top_dict`` -- the first Top DICT as decoded by ``getdict``;
    * ``nglyphs`` -- number of entries in the CharStrings INDEX;
    * ``code2gid`` / ``gid2code`` -- the font's custom Encoding;
    * ``name2gid`` / ``gid2name`` -- the font's custom charset.

    Predefined Encodings (ids 0 and 1) and predefined charsets (ids 0, 1
    and 2) are not expanded; the corresponding maps are left empty.
    NOTE(review): charset entries are always looked up as string ids; for
    CID-keyed fonts they would be CIDs instead -- confirm before relying on
    the name maps for such fonts.
    """

    STANDARD_STRINGS = (
      '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
      'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
      'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
      'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
      'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
      'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
      'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
      'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
      'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
      'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
      'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
      'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
      'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
      'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
      'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
      'quotesinglbase', 'quotedblbase', 'quotedblright',
      'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
      'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
      'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
      'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
      'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
      'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
      'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
      'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
      'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
      'multiply', 'threesuperior', 'copyright', 'Aacute',
      'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
      'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
      'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
      'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
      'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
      'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
      'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
      'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
      'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
      'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
      'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
      'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
      'dollarsuperior', 'ampersandsmall', 'Acutesmall',
      'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
      'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
      'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
      'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
      'commasuperior', 'threequartersemdash', 'periodsuperior',
      'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
      'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
      'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
      'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
      'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
      'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
      'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
      'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
      'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
      'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
      'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
      'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
      'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
      'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
      'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
      'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
      'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
      'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
      'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
      'seveninferior', 'eightinferior', 'nineinferior',
      'centinferior', 'dollarinferior', 'periodinferior',
      'commainferior', 'Agravesmall', 'Aacutesmall',
      'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
      'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
      'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
      'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
      'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
      'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
      'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
      'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
      'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
      '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
      'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
    )

    class INDEX:
        """Random-access view of one CFF INDEX structure.

        The constructor reads the INDEX header found at the current
        position of ``fp`` and leaves ``fp`` positioned just past the
        INDEX. Items are read lazily from ``fp`` on access, which moves
        the file position.
        """

        def __init__(self, fp):
            """Read count, offset size and offset array from ``fp``.

            :raises struct.error: if the header is truncated.
            """
            self.fp = fp
            (count,) = struct.unpack('>H', self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the two-byte count alone: there
                # is no offSize byte and no offset array to consume.
                self.offsets = [1]
                self.base = self.fp.tell()-1
                return
            (offsize,) = struct.unpack('B', self.fp.read(1))
            # Offsets are big-endian unsigned integers of ``offsize`` bytes.
            self.offsets = [int.from_bytes(self.fp.read(offsize), 'big')
                            for _ in range(count+1)]
            # Offsets are relative to the byte preceding the object data.
            self.base = self.fp.tell()-1
            self.fp.seek(self.base+self.offsets[-1])

        def __repr__(self):
            return '<INDEX: size=%d>' % len(self)

        def __len__(self):
            return len(self.offsets)-1

        def __getitem__(self, i):
            """Return the bytes of item ``i`` (negative indexes allowed).

            :raises IndexError: if ``i`` is out of range.
            """
            size = len(self)
            if i < 0:
                i += size
            if not 0 <= i < size:
                raise IndexError('INDEX item out of range')
            self.fp.seek(self.base+self.offsets[i])
            return self.fp.read(self.offsets[i+1]-self.offsets[i])

        def __iter__(self):
            return (self[i] for i in range(len(self)))

    def __init__(self, name, fp):
        """Parse the CFF data readable from ``fp``.

        :param name: stored as ``self.name``; not interpreted.
        :param fp: seekable binary stream. Offsets found in the Top DICT
            are used as absolute positions in ``fp``, so the CFF data must
            start at position 0 and ``fp`` must be positioned there.
        :raises ValueError: on an unsupported Encoding or charset format.
        :raises struct.error: if a table is truncated.
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB',
                                                           self.fp.read(4))
        # Skip any header bytes beyond the four just read.
        self.fp.read(max(hdrsize-4, 0))
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA (operators 15, 16, 17: charset, Encoding,
        # CharStrings offsets; each defaults to 0 when absent)
        self.top_dict = getdict(self.dict_index[0])
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(charstring_pos)
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        (self.code2gid, self.gid2code) = self._read_encoding(encoding_pos)
        # Charsets: all string ids are read before any name is resolved,
        # because getstr() may move the file position.
        self.name2gid = {}
        self.gid2name = {}
        sids = self._read_charset_sids(charset_pos)
        for (gid, sid) in enumerate(sids, 1):
            glyph_name = self.getstr(sid)
            self.name2gid[glyph_name] = gid
            self.gid2name[gid] = glyph_name

    def _read_encoding(self, pos):
        """Return ``(code2gid, gid2code)`` for the Encoding table at ``pos``.

        Glyph ids are implicit in an Encoding table: entries are listed in
        glyph order starting with glyph 1, and the stored values are the
        character codes.
        """
        code2gid = {}
        gid2code = {}
        if pos in (0, 1):
            # 0 and 1 select the predefined Standard / Expert encodings
            # rather than a file offset; those tables are not available
            # here, so the maps stay empty.
            return (code2gid, gid2code)
        self.fp.seek(pos)
        fmt = self.fp.read(1)
        if fmt == b'\x00':
            # Format 0: one code per glyph.
            (ncodes,) = struct.unpack('B', self.fp.read(1))
            codes = struct.unpack('%dB' % ncodes, self.fp.read(ncodes))
            for (gid, code) in enumerate(codes, 1):
                code2gid[code] = gid
                gid2code[gid] = code
        elif fmt == b'\x01':
            # Format 1: ranges of consecutive codes.
            (nranges,) = struct.unpack('B', self.fp.read(1))
            gid = 1
            for _ in range(nranges):
                (first, nleft) = struct.unpack('BB', self.fp.read(2))
                for code in range(first, first+nleft+1):
                    code2gid[code] = gid
                    gid2code[gid] = code
                    gid += 1
        else:
            raise ValueError('unsupported encoding format: %r' % fmt)
        return (code2gid, gid2code)

    def _read_charset_sids(self, pos):
        """Return the string ids of glyphs ``1..nglyphs-1`` from the charset
        table at ``pos`` (glyph 0 is never listed in a charset)."""
        if pos in (0, 1, 2):
            # 0, 1 and 2 select predefined charsets rather than a file
            # offset; they are not expanded here.
            return []
        remaining = max(self.nglyphs-1, 0)
        self.fp.seek(pos)
        fmt = self.fp.read(1)
        if fmt == b'\x00':
            # Format 0: one two-byte SID per glyph.
            return list(struct.unpack('>%dH' % remaining,
                                      self.fp.read(2*remaining)))
        if fmt in (b'\x01', b'\x02'):
            # Formats 1 and 2: (first SID, nLeft) ranges, repeated until
            # every glyph is covered; nLeft is one byte in format 1 and two
            # bytes in format 2.
            range_struct = '>HB' if fmt == b'\x01' else '>HH'
            range_size = struct.calcsize(range_struct)
            sids = []
            while len(sids) < remaining:
                (first, nleft) = struct.unpack(range_struct,
                                               self.fp.read(range_size))
                sids.extend(range(first, first+nleft+1))
            # A final range may overshoot the glyph count; drop the excess.
            return sids[:remaining]
        raise ValueError('unsupported charset format: %r' % fmt)

    def getstr(self, sid):
        """Return the string for string id ``sid``.

        Ids below ``len(STANDARD_STRINGS)`` yield the standard string (a
        ``str``); higher ids are read from the String INDEX and returned
        as the raw ``bytes`` stored there.

        :raises IndexError: if ``sid`` is beyond the String INDEX.
        """
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid-len(self.STANDARD_STRINGS)]
380
381
class TrueTypeFont:
    """Minimal reader for the table directory and the ``cmap`` table of a
    TrueType font program.

    ``tables`` maps each four-byte table tag (``bytes``, e.g. ``b'cmap'``)
    to its ``(offset, length)`` pair.
    """

    class CMapNotFound(Exception):
        """Raised when the font has no ``cmap`` table, or when none of its
        subtables produced a character mapping."""

    def __init__(self, name, fp):
        """Read the font type and the table directory from ``fp``.

        :param name: stored as ``self.name``; not interpreted.
        :param fp: seekable binary stream positioned at the font start.
        """
        self.name = name
        self.fp = fp
        self.tables = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
            for _ in range(ntables):
                (tag, _checksum, offset, length) = struct.unpack(
                    '>4sLLL', fp.read(16))
                self.tables[tag] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self):
        """Build a ``FileUnicodeMap`` from the font's ``cmap`` table.

        :returns: map populated with ``add_cid2unichr(gid, char)`` for
            every character code found.
        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or
            no supported subtable yielded a mapping.
        """
        char2gid = self._read_char2gid()
        unicode_map = FileUnicodeMap()
        for (char, gid) in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

    def _read_char2gid(self):
        """Parse the cmap subtables into a ``{character code: glyph id}``
        dict, merging all subtables of format 0, 2 and 4."""
        # Table tags are read as bytes, so the lookup key must be bytes too.
        if b'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b'cmap']
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = struct.unpack('>HH', fp.read(4))
        subtables = [struct.unpack('>HHL', fp.read(8))
                     for _ in range(nsubtables)]
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_platform, _encoding, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, _fmtlen, _fmtlang) = struct.unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(struct.unpack('>256B',
                                                        fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = struct.unpack('>256H', fp.read(512))
                firstbytes = [0]*8192
                for (i, k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = \
                        struct.unpack('>HHhH', fp.read(8))
                    # ``offset`` is relative to its own two-byte field,
                    # which ends at the current position.
                    hdrs.append((i, firstcode, entcount, delta,
                                 fp.tell()-2+offset))
                for (i, firstcode, entcount, delta, pos) in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        (gid,) = struct.unpack('>H', fp.read(2))
                        if gid:
                            # idDelta arithmetic is modulo 65536.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)  # reservedPad
                scs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                segments = zip(ecs, scs, idds, idrs)
                for (seg, (ec, sc, idd, idr)) in enumerate(segments):
                    if idr:
                        # idRangeOffset counts from the position of this
                        # segment's own entry in the idRangeOffset array.
                        fp.seek(pos+2*seg+idr)
                        for c in range(sc, ec+1):
                            (b,) = struct.unpack('>H', fp.read(2))
                            # A stored 0 means "missing glyph"; idDelta is
                            # applied to non-zero entries only.
                            char2gid[c] = (b + idd) & 0xffff if b else 0
                    else:
                        for c in range(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
            else:
                # Skip subtable formats that are not understood instead of
                # discarding what the supported subtables provide.
                log.debug('unsupported cmap subtable format %r skipped',
                          fmttype)
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        return char2gid
470
471
class PDFFontError(PDFException):
    """Error raised by the font classes of this module."""
474
475
class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""
478
479
# Literal names used by the font classes in this module.
LITERAL_TYPE1C = LIT('Type1C')
# Encoding assumed by PDFSimpleFont when a font names none.
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
482
483
class PDFFont:
    """State and metric helpers shared by the PDF font classes.

    Subclasses are expected to provide ``to_unichr``, which
    :meth:`char_width` uses as a fallback key into ``widths``.
    """

    def __init__(self, descriptor, widths, default_width=None):
        """Copy the metrics found in ``descriptor``.

        :param descriptor: font descriptor dictionary.
        :param widths: width table; stored after ``resolve_all``.
        :param default_width: width used for characters missing from
            ``widths``; when ``None`` the descriptor's ``MissingWidth``
            (default 0) is used.
        """
        self.descriptor = descriptor
        self.widths = resolve_all(widths)

        fontname = resolve1(descriptor.get('FontName', 'unknown'))
        if isinstance(fontname, PSLiteral):
            fontname = literal_name(fontname)
        self.fontname = fontname

        self.flags = int_value(descriptor.get('Flags', 0))
        self.ascent = num_value(descriptor.get('Ascent', 0))
        descent = num_value(descriptor.get('Descent', 0))
        self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
        if default_width is None:
            default_width = num_value(descriptor.get('MissingWidth', 0))
        self.default_width = default_width
        self.leading = num_value(descriptor.get('Leading', 0))
        raw_bbox = descriptor.get('FontBBox', (0, 0, 0, 0))
        self.bbox = list_value(resolve_all(raw_bbox))
        self.hscale = self.vscale = .001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        self.descent = -descent if descent > 0 else descent

    def __repr__(self):
        return '<PDFFont>'

    def is_vertical(self):
        """Return False: the base font is laid out horizontally."""
        return False

    def is_multibyte(self):
        """Return False: the base font uses one byte per character."""
        return False

    def decode(self, bytes):
        """Return the character codes of ``bytes`` as a ``bytearray``."""
        codes = bytearray(bytes)
        return codes

    def get_ascent(self):
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self):
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self):
        """Return the bounding-box width scaled by ``hscale``; a zero-width
        box yields the negated default width instead."""
        box_width = self.bbox[2]-self.bbox[0]
        width = -self.default_width if box_width == 0 else box_width
        return width * self.hscale

    def get_height(self):
        """Return the bounding-box height scaled by ``vscale``; a
        zero-height box yields ``ascent - descent`` instead."""
        box_height = self.bbox[3]-self.bbox[1]
        height = self.ascent - self.descent if box_height == 0 else box_height
        return height * self.vscale

    def char_width(self, cid):
        """Return the scaled width of character ``cid``.

        ``widths`` is consulted with ``cid`` first, then with
        ``to_unichr(cid)``; if both fail the default width is used.
        """
        try:
            return self.widths[cid] * self.hscale
        except KeyError:
            pass
        try:
            return self.widths[self.to_unichr(cid)] * self.hscale
        except (KeyError, PDFUnicodeNotDefined):
            return self.default_width * self.hscale

    def char_disp(self, cid):
        """Return 0 for every character."""
        return 0

    def string_width(self, s):
        """Return the sum of the widths of the characters decoded from
        ``s``."""
        return sum(map(self.char_width, self.decode(s)))
559
560
class PDFSimpleFont(PDFFont):
    """Font whose character codes map to unicode through an encoding table
    and, when present, a ``ToUnicode`` CMap."""

    def __init__(self, descriptor, widths, spec):
        """Set up ``cid2unicode`` and ``unicode_map`` from ``spec``, then
        initialise the common font state.

        The encoding is given either by the name of a built-in encoding or
        by a dictionary describing the differences from a base encoding;
        ``StandardEncoding`` is used when none is specified.
        """
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        else:
            encoding = LITERAL_STANDARD_ENCODING

        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(encoding.get('BaseEncoding',
                                                  LITERAL_STANDARD_ENCODING))
            differences = list_value(encoding.get('Differences', []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)

        self.unicode_map = None
        if 'ToUnicode' in spec:
            tounicode = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            parser = CMapParser(self.unicode_map,
                                BytesIO(tounicode.get_data()))
            parser.run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid):
        """Return the unicode text for character code ``cid``.

        The ``ToUnicode`` map is tried first, then the encoding table.

        :raises PDFUnicodeNotDefined: if neither knows ``cid``.
        """
        tounicode = self.unicode_map
        if tounicode:
            try:
                return tounicode.get_unichr(cid)
            except KeyError:
                # Not covered by ToUnicode; fall back to the encoding.
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
596
597
class PDFType1Font(PDFSimpleFont):
    """Simple font described by a Type1 font dictionary."""

    def __init__(self, rsrcmgr, spec):
        """Build the font from its dictionary ``spec``.

        Metrics come from ``FontMetricsDB`` when the base font is known
        there, otherwise from ``FontDescriptor``/``FirstChar``/``Widths``
        in ``spec``. ``rsrcmgr`` is accepted but not used here.

        :raises PDFFontError: if ``BaseFont`` is missing and
            ``settings.STRICT`` is set.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'

        try:
            (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            descriptor = dict_value(spec.get('FontDescriptor', {}))
            firstchar = int_value(spec.get('FirstChar', 0))
            width_list = list_value(spec.get('Widths', [0]*256))
            # Widths are listed for consecutive codes starting at FirstChar.
            widths = dict(enumerate(width_list, firstchar))
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if 'Encoding' in spec or 'FontFile' not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get('FontFile'))
        length1 = int_value(self.fontfile['Length1'])
        header = self.fontfile.get_data()[:length1]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self):
        template = '<PDFType1Font: basefont=%r>'
        return template % self.basefont
627
628
class PDFTrueTypeFont(PDFType1Font):
    """TrueType simple font; behaves like ``PDFType1Font`` apart from its
    ``repr``."""

    def __repr__(self):
        template = '<PDFTrueTypeFont: basefont=%r>'
        return template % self.basefont
633
634
class PDFType3Font(PDFSimpleFont):
    """Simple font described by a Type3 font dictionary."""

    def __init__(self, rsrcmgr, spec):
        """Build the font from its dictionary ``spec``.

        Without a ``FontDescriptor`` a minimal descriptor is made from the
        font's ``FontBBox``. Ascent and descent are taken from the bounding
        box, and the scales are derived from ``FontMatrix``. ``rsrcmgr`` is
        accepted but not used here.
        """
        firstchar = int_value(spec.get('FirstChar', 0))
        width_list = list_value(spec.get('Widths', [0]*256))
        # Widths are listed for consecutive codes starting at FirstChar.
        widths = dict(enumerate(width_list, firstchar))
        if 'FontDescriptor' not in spec:
            descriptor = {'Ascent': 0, 'Descent': 0,
                          'FontBBox': spec['FontBBox']}
        else:
            descriptor = dict_value(spec['FontDescriptor'])
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        self.matrix = tuple(list_value(spec.get('FontMatrix')))
        (_llx, lly, _urx, ury) = self.bbox
        self.descent = lly
        self.ascent = ury
        scales = apply_matrix_norm(self.matrix, (1, 1))
        (self.hscale, self.vscale) = scales

    def __repr__(self):
        return '<PDFType3Font>'
655
656
class PDFCIDFont(PDFFont):
    """CID-keyed font: character codes are decoded through a CMap and
    mapped to unicode through ``ToUnicode``, an embedded TrueType ``cmap``
    or the unicode map registered for the CID collection."""

    def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
        """Build the font from its dictionary ``spec``.

        :param rsrcmgr: accepted but not used here.
        :param spec: font dictionary.
        :param strict: raise ``PDFFontError`` for missing ``BaseFont``,
            ``FontDescriptor`` or encoding information instead of falling
            back to defaults. The default is the value ``settings.STRICT``
            had when this class was defined.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if strict:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        cid_registry = resolve1(
            self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1")
        cid_ordering = resolve1(
            self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")
        self.cidcoding = '{}-{}'.format(cid_registry, cid_ordering)
        self.cmap = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if strict:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               BytesIO(self.fontfile.get_data()))
        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
        elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding, self.cmap.is_vertical())
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths = get_widths2(list_value(spec.get('W2', [])))
            self.disps = {cid: (vx, vy)
                          for (cid, (_, (vx, vy))) in widths.items()}
            # DW2 and its elements may be indirect references; resolve them
            # before unpacking.
            (vy, w) = resolve_all(spec.get('DW2', [880, -1000]))
            self.default_disp = (None, vy)
            widths = {cid: w for (cid, (w, _)) in widths.items()}
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get('W', [])))
            # DW may be an indirect reference; resolve it to the number.
            default_width = resolve1(spec.get('DW', 1000))
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec, strict):
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        :returns: the CMap registered under the resolved name, or an empty
            ``CMap`` when it is unknown and ``strict`` is false.
        :raises PDFFontError: when the CMap is unknown and ``strict`` is
            true.
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e) from e
            return CMap()

    @staticmethod
    def _get_cmap_name(spec, strict):
        """Get cmap name from font specification

        The name is taken from ``spec['Encoding']`` itself when that is a
        name object, otherwise from its ``CMapName`` entry; names listed in
        ``IDENTITY_ENCODER`` are replaced by their alias. ``'unknown'`` is
        used when ``strict`` is false and no encoding is given.
        """
        cmap_name = 'unknown'  # default value

        try:
            # The encoding may be given as an indirect reference.
            spec_encoding = resolve1(spec['Encoding'])
            if hasattr(spec_encoding, 'name'):
                cmap_name = literal_name(spec_encoding)
            else:
                cmap_name = literal_name(spec_encoding['CMapName'])
        except KeyError:
            if strict:
                raise PDFFontError('Encoding is unspecified')

        if isinstance(cmap_name, PDFStream):
            if 'CMapName' in cmap_name:
                cmap_name = cmap_name.get('CMapName').name
            elif strict:
                raise PDFFontError('CMapName unspecified for encoding')

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self):
        return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\
            .format(self.basefont, self.cidcoding)

    def is_vertical(self):
        """Return whether the font's CMap selects vertical writing."""
        return self.vertical

    def is_multibyte(self):
        """Return True: codes are decoded through the CMap."""
        return True

    def decode(self, bytes):
        """Decode ``bytes`` into character ids using the font's CMap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        "Returns an integer for horizontal fonts, a tuple for vertical fonts."
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid):
        """Return the unicode text for character id ``cid``.

        :raises PDFUnicodeNotDefined: if there is no unicode map or it has
            no entry for ``cid``.
        """
        try:
            if not self.unicode_map:
                raise KeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
789
790
def main(argv):
    """Parse every file named in ``argv[1:]`` as a CFF font and print the
    resulting ``CFFFont`` object.

    :param argv: argument list; ``argv[0]`` (the program name) is skipped.
    :returns: ``None``.
    """
    for fname in argv[1:]:
        # The context manager closes the file even when parsing fails.
        with open(fname, 'rb') as fp:
            font = CFFFont(fname, fp)
            print(font)
    return
798
799
if __name__ == '__main__':
    # Run on the command-line arguments and use main()'s result as the
    # process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)
802