import logging
import struct
import sys
from io import BytesIO

from . import settings
from .cmapdb import CMap
from .cmapdb import CMapDB
from .cmapdb import CMapParser
from .cmapdb import FileUnicodeMap
from .encodingdb import EncodingDB
from .encodingdb import name2unicode
from .fontmetrics import FONT_METRICS
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import dict_value
from .pdftypes import int_value
from .pdftypes import list_value
from .pdftypes import num_value
from .pdftypes import resolve1, resolve_all
from .pdftypes import stream_value
from .psparser import KWD
from .psparser import LIT
from .psparser import PSEOF
from .psparser import PSLiteral
from .psparser import PSStackParser
from .psparser import literal_name
from .utils import apply_matrix_norm
from .utils import choplist
from .utils import isnumber
from .utils import nunpack

log = logging.getLogger(__name__)


def get_widths(seq):
    """Expand a width array into a ``{code: width}`` dictionary.

    Two entry shapes are recognised while scanning *seq*:

    * ``c [w1 w2 ...]`` -- the number seen just before a list is the
      starting code; the list elements are widths of consecutive codes.
    * ``c_first c_last w`` -- three numbers in a row assign ``w`` to every
      code in the inclusive range.

    Items that are neither lists nor numbers are ignored.
    """
    widths = {}
    pending = []  # numbers collected since the last completed entry
    for item in seq:
        if isinstance(item, list):
            if pending:
                start = pending[-1]
                widths.update(
                    (start + offset, width)
                    for (offset, width) in enumerate(item))
                pending = []
        elif isnumber(item):
            pending.append(item)
            if len(pending) == 3:
                (first, last, width) = pending
                for code in range(first, last+1):
                    widths[code] = width
                pending = []
    return widths


def get_widths2(seq):
    """Expand a vertical-metrics array into ``{code: (w, (vx, vy))}``.

    Same scanning scheme as :func:`get_widths`, except that each entry
    carries three values per code:

    * ``c [w vx vy  w vx vy ...]`` -- the list is consumed in triples.
    * ``c_first c_last w vx vy`` -- five numbers in a row assign the
      triple to every code in the inclusive range.
    """
    widths = {}
    pending = []  # numbers collected since the last completed entry
    for item in seq:
        if isinstance(item, list):
            if pending:
                start = pending[-1]
                triples = choplist(3, item)
                for (offset, (w, vx, vy)) in enumerate(triples):
                    widths[start+offset] = (w, (vx, vy))
                pending = []
        elif isnumber(item):
            pending.append(item)
            if len(pending) == 5:
                (first, last, w, vx, vy) = pending
                for code in range(first, last+1):
                    widths[code] = (w, (vx, vy))
                pending = []
    return widths


class FontMetricsDB:
    """Lookup facade over the bundled ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname):
        """Return the ``FONT_METRICS`` entry for *fontname*.

        :raises KeyError: if *fontname* has no entry.
        """
        return FONT_METRICS[fontname]


class Type1FontHeaderParser(PSStackParser):
    """Stack parser that collects ``<int> <literal> put`` pairs."""

    KEYWORD_BEGIN = KWD(b'begin')
    KEYWORD_END = KWD(b'end')
    KEYWORD_DEF = KWD(b'def')
    KEYWORD_PUT = KWD(b'put')
    KEYWORD_DICT = KWD(b'dict')
    KEYWORD_ARRAY = KWD(b'array')
    KEYWORD_READONLY = KWD(b'readonly')
    KEYWORD_FOR = KWD(b'for')

    def __init__(self, data):
        super().__init__(data)
        # Filled in by get_encoding(): character code -> unicode text.
        self._cid2unicode = {}

    def get_encoding(self):
        """Parse the font encoding.

        A Type1 font encoding maps character codes to character names.
        Those names are either standard Adobe glyph names or names of
        custom CharStrings defined by the font itself (a CharString being
        the sequence of drawing operations for one character).

        Each ``(code, name)`` pair produced by the parser is translated
        with ``name2unicode``; names for which that raises ``KeyError``
        are logged at debug level and left out of the mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode
            characters
        """
        while True:
            try:
                (code, glyph_name) = self.nextobject()
            except PSEOF:
                break
            try:
                self._cid2unicode[code] = name2unicode(glyph_name)
            except KeyError as exc:
                log.debug(str(exc))
        return self._cid2unicode

    def do_keyword(self, pos, token):
        """Handle a keyword; only ``put`` produces a result.

        On ``put`` the top two stack items are popped, and when they are
        an ``int`` followed by a ``PSLiteral`` the pair
        ``(int, literal name)`` is added to the parser results.
        """
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))


# Text for each nibble value of a packed real number (used by getdict()).
# Nibble 15 terminates the number and is handled by the caller.
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-',
           None, '-')

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    'DLIdent-H': 'Identity-H',
    'DLIdent-V': 'Identity-V',
}


def getdict(data):
    """Decode a byte string of operands/operators into a dictionary.

    Bytes ``<= 21`` are treated as operators: the operands collected so
    far are stored under that byte value and the operand stack is reset.
    Every other leading byte starts an operand:

    * ``30``        -- packed real number, two nibbles per byte, ended by
      nibble ``0xf``; decoded through ``NIBBLES`` and ``float``.
    * ``32..246``   -- one-byte integer ``b0 - 139``.
    * ``247..250``  -- two-byte positive integer.
    * ``251..254``  -- two-byte negative integer.
    * ``28``        -- signed 16-bit integer in the next two bytes.
    * anything else -- signed 32-bit integer in the next four bytes.

    :param data: raw bytes to decode.
    :returns: ``{operator byte: [operands]}``.
    """
    d = {}
    fp = BytesIO(data)
    stack = []
    while True:
        c = fp.read(1)
        if not c:
            break
        b0 = ord(c)
        if b0 <= 21:
            d[b0] = stack
            stack = []
            continue
        if b0 == 30:
            s = ''
            loop = True
            while loop:
                b = ord(fp.read(1))
                for n in (b >> 4, b & 15):
                    if n == 15:
                        loop = False
                    else:
                        s += NIBBLES[n]
            value = float(s)
        elif 32 <= b0 <= 246:
            value = b0-139
        else:
            b1 = ord(fp.read(1))
            if 247 <= b0 <= 250:
                value = ((b0-247) << 8)+b1+108
            elif 251 <= b0 <= 254:
                value = -((b0-251) << 8)-b1-108
            else:
                b2 = ord(fp.read(1))
                if 128 <= b1:
                    # The most significant byte carries the sign.
                    b1 -= 256
                if b0 == 28:
                    value = b1 << 8 | b2
                else:
                    value = b1 << 24 | b2 << 16 | \
                        struct.unpack('>H', fp.read(2))[0]
        stack.append(value)
    return d


class CFFFont:
    """Reader for a CFF font program.

    After construction the instance exposes ``code2gid``/``gid2code``
    (from the encoding table) and ``name2gid``/``gid2name`` (from the
    charset table), plus the raw INDEX structures.
    """

    STANDARD_STRINGS = (
        '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
        'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
        'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
        'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
        'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
        'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
        'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
        'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
        'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
        'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
        'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
        'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
        'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
        'quotesinglbase', 'quotedblbase', 'quotedblright',
        'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
        'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
        'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
        'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
        'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
        'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
        'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
        'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
        'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
        'multiply', 'threesuperior', 'copyright', 'Aacute',
        'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
        'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
        'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
        'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
        'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
        'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
        'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
        'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
        'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
        'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
        'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
        'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
        'dollarsuperior', 'ampersandsmall', 'Acutesmall',
        'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
        'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
        'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
        'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
        'commasuperior', 'threequartersemdash', 'periodsuperior',
        'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
        'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
        'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
        'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
        'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
        'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
        'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
        'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
        'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
        'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
        'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
        'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
        'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
        'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
        'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
        'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
        'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
        'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
        'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
        'seveninferior', 'eightinferior', 'nineinferior',
        'centinferior', 'dollarinferior', 'periodinferior',
        'commainferior', 'Agravesmall', 'Aacutesmall',
        'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
        'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
        'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
        'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
        'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
        'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
        'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
        'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
        'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
        '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
        'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
    )

    class INDEX:
        """Lazy, list-like view over one INDEX structure in the file.

        The constructor reads the count, the offset size and the offset
        array from the current position of *fp*, then leaves *fp*
        positioned just past the INDEX data.
        """

        def __init__(self, fp):
            self.fp = fp
            self.offsets = []
            (count, offsize) = struct.unpack('>HB', self.fp.read(3))
            for i in range(count+1):
                self.offsets.append(nunpack(self.fp.read(offsize)))
            # Offsets are applied relative to the byte before the data.
            self.base = self.fp.tell()-1
            self.fp.seek(self.base+self.offsets[-1])

        def __repr__(self):
            return '<INDEX: size=%d>' % len(self)

        def __len__(self):
            return len(self.offsets)-1

        def __getitem__(self, i):
            """Return the raw bytes of entry *i* (moves the file position)."""
            self.fp.seek(self.base+self.offsets[i])
            return self.fp.read(self.offsets[i+1]-self.offsets[i])

        def __iter__(self):
            return iter(self[i] for i in range(len(self)))

    def __init__(self, name, fp):
        """Parse the font program available from *fp*.

        :param name: label stored on the instance as ``self.name``.
        :param fp: seekable binary file object positioned at the start
            of the font data.
        :raises ValueError: for an encoding or charset format byte other
            than 0, 1 (or 2 for charsets).
        :raises AssertionError: for charset format 2, which is recognised
            but not implemented.
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB',
                                                           self.fp.read(4))
        self.fp.read(hdrsize-4)
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(charstring_pos)
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        self.code2gid = {}
        self.gid2code = {}
        self.fp.seek(encoding_pos)
        fmt = self.fp.read(1)
        if fmt == b'\x00':
            # Format 0
            (n,) = struct.unpack('B', self.fp.read(1))
            for (code, gid) in enumerate(struct.unpack('B'*n,
                                                       self.fp.read(n))):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif fmt == b'\x01':
            # Format 1
            (n,) = struct.unpack('B', self.fp.read(1))
            code = 0
            for i in range(n):
                (first, nleft) = struct.unpack('BB', self.fp.read(2))
                for gid in range(first, first+nleft+1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    code += 1
        else:
            raise ValueError('unsupported encoding format: %r' % fmt)
        # Charsets
        self.name2gid = {}
        self.gid2name = {}
        self.fp.seek(charset_pos)
        fmt = self.fp.read(1)
        if fmt == b'\x00':
            # Format 0
            n = self.nglyphs-1
            for (gid, sid) in enumerate(struct.unpack('>'+'H'*n,
                                                      self.fp.read(2*n))):
                gid += 1
                name = self.getstr(sid)
                self.name2gid[name] = gid
                self.gid2name[gid] = name
        elif fmt == b'\x01':
            # Format 1
            (n,) = struct.unpack('B', self.fp.read(1))
            sid = 0
            for i in range(n):
                (first, nleft) = struct.unpack('BB', self.fp.read(2))
                for gid in range(first, first+nleft+1):
                    name = self.getstr(sid)
                    self.name2gid[name] = gid
                    self.gid2name[gid] = name
                    sid += 1
        elif fmt == b'\x02':
            # Format 2
            # Raised explicitly (not via ``assert``) so the failure is not
            # silently skipped when Python runs with -O.
            raise AssertionError(str(('Unhandled', fmt)))
        else:
            raise ValueError('unsupported charset format: %r' % fmt)

    def getstr(self, sid):
        """Return the string for *sid*.

        Ids below ``len(STANDARD_STRINGS)`` come from the built-in table;
        larger ids index the font's own String INDEX (raw bytes).
        """
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid-len(self.STANDARD_STRINGS)]


class TrueTypeFont:
    """Minimal TrueType reader: table directory plus ``cmap`` decoding."""

    class CMapNotFound(Exception):
        pass

    def __init__(self, name, fp):
        """Read the table directory from *fp*.

        ``self.tables`` maps each 4-byte table tag (``bytes``) to an
        ``(offset, length)`` pair.  A truncated directory is tolerated:
        whatever entries were read before the data ran out are kept.
        """
        self.name = name
        self.fp = fp
        self.tables = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
            for _ in range(ntables):
                (tag, _checksum, offset, length) = struct.unpack(
                    '>4sLLL', fp.read(16))
                self.tables[tag] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self):
        """Build a ``FileUnicodeMap`` (glyph id -> character code).

        Subtable formats 0, 2 and 4 are decoded; other formats are
        skipped with a debug log entry.  A truncated table yields the
        mappings decoded up to the point of truncation.

        :raises TrueTypeFont.CMapNotFound: if the font has no ``cmap``
            table or no mapping could be decoded from it.
        """
        # Table tags are stored as bytes by __init__; a str key is also
        # accepted in case ``tables`` was populated by other code.
        for tag in (b'cmap', 'cmap'):
            if tag in self.tables:
                (base_offset, _length) = self.tables[tag]
                break
        else:
            raise TrueTypeFont.CMapNotFound

        char2gid = {}
        try:
            self._read_cmap_subtables(base_offset, char2gid)
        except struct.error:
            # Same policy as __init__: keep what was read from a
            # truncated/corrupt table instead of failing the whole font.
            log.debug('Truncated cmap table in font %r', self.name)
        if not char2gid:
            raise TrueTypeFont.CMapNotFound

        # create unicode map
        unicode_map = FileUnicodeMap()
        for (char, gid) in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

    def _read_cmap_subtables(self, base_offset, char2gid):
        """Decode every supported cmap subtable into *char2gid* in place."""
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = struct.unpack('>HH', fp.read(4))
        subtables = [struct.unpack('>HHL', fp.read(8))
                     for _ in range(nsubtables)]
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, _fmtlen, _fmtlang) = struct.unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(struct.unpack('>256B',
                                                        fp.read(256))))
            elif fmttype == 2:
                self._read_cmap_format2(fp, char2gid)
            elif fmttype == 4:
                self._read_cmap_format4(fp, char2gid)
            else:
                log.debug('Skipping unsupported cmap subtable format %r',
                          fmttype)

    @staticmethod
    def _read_cmap_format2(fp, char2gid):
        """Decode a format 2 (high-byte mapping) subtable body."""
        subheaderkeys = struct.unpack('>256H', fp.read(512))
        firstbytes = [0]*8192
        for (i, k) in enumerate(subheaderkeys):
            firstbytes[k//8] = i
        nhdrs = max(subheaderkeys)//8 + 1
        hdrs = []
        for i in range(nhdrs):
            (firstcode, entcount, delta, offset) = \
                struct.unpack('>HHhH', fp.read(8))
            hdrs.append((i, firstcode, entcount, delta,
                         fp.tell()-2+offset))
        for (i, firstcode, entcount, delta, pos) in hdrs:
            if not entcount:
                continue
            first = firstcode + (firstbytes[i] << 8)
            fp.seek(pos)
            for c in range(entcount):
                # unpack() returns a 1-tuple; take the integer out of it.
                gid = struct.unpack('>H', fp.read(2))[0]
                if gid:
                    gid = (gid + delta) & 0xffff
                char2gid[first+c] = gid

    @staticmethod
    def _read_cmap_format4(fp, char2gid):
        """Decode a format 4 (segment mapping) subtable body."""
        (segcount, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
        segcount //= 2
        ecs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
        fp.read(2)
        scs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
        idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
        pos = fp.tell()
        idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
        segments = zip(ecs, scs, idds, idrs)
        for (seg, (ec, sc, idd, idr)) in enumerate(segments):
            if idr:
                # The range offset is relative to the location of this
                # segment's own entry in the range-offset array, i.e.
                # ``pos + 2*seg``, not to the start of the array.
                fp.seek(pos + 2*seg + idr)
                for c in range(sc, ec+1):
                    b = struct.unpack('>H', fp.read(2))[0]
                    # A stored 0 means "missing glyph"; the delta is only
                    # applied to non-zero entries.
                    char2gid[c] = (b + idd) & 0xffff if b else 0
            else:
                for c in range(sc, ec+1):
                    char2gid[c] = (c + idd) & 0xffff


class PDFFontError(PDFException):
    pass


class PDFUnicodeNotDefined(PDFFontError):
    pass


LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
LITERAL_TYPE1C = LIT('Type1C')


class PDFFont:
    """Base class holding font descriptor values and width lookup.

    Subclasses are expected to provide ``to_unichr``; ``char_width``
    falls back to it when a width is not keyed by cid.
    """

    def __init__(self, descriptor, widths, default_width=None):
        """
        :param descriptor: font descriptor dictionary.
        :param widths: mapping of character id to width.
        :param default_width: width used for ids missing from *widths*;
            when ``None`` the descriptor's ``MissingWidth`` (default 0)
            is used.
        """
        self.descriptor = descriptor
        self.widths = resolve_all(widths)
        self.fontname = resolve1(descriptor.get('FontName', 'unknown'))
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.flags = int_value(descriptor.get('Flags', 0))
        self.ascent = num_value(descriptor.get('Ascent', 0))
        self.descent = num_value(descriptor.get('Descent', 0))
        self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
        if default_width is None:
            self.default_width = num_value(descriptor.get('MissingWidth', 0))
        else:
            self.default_width = default_width
        self.leading = num_value(descriptor.get('Leading', 0))
        self.bbox = list_value(resolve_all(descriptor.get('FontBBox',
                                                          (0, 0, 0, 0))))
        self.hscale = self.vscale = .001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self):
        return '<PDFFont>'

    def is_vertical(self):
        return False

    def is_multibyte(self):
        return False

    def decode(self, bytes):
        """Split a byte string into a sequence of integer codes."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self):
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self):
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self):
        """Scaled bounding-box width; falls back to ``-default_width``
        when the box has zero width."""
        w = self.bbox[2]-self.bbox[0]
        if w == 0:
            w = -self.default_width
        return w * self.hscale

    def get_height(self):
        """Scaled bounding-box height; falls back to ascent - descent
        when the box has zero height."""
        h = self.bbox[3]-self.bbox[1]
        if h == 0:
            h = self.ascent - self.descent
        return h * self.vscale

    def char_width(self, cid):
        """Return the scaled width for *cid*.

        Lookup order: ``widths[cid]``, then ``widths[to_unichr(cid)]``,
        then ``default_width``.
        """
        try:
            return self.widths[cid] * self.hscale
        except KeyError:
            try:
                return self.widths[self.to_unichr(cid)] * self.hscale
            except (KeyError, PDFUnicodeNotDefined):
                return self.default_width * self.hscale

    def char_disp(self, cid):
        return 0

    def string_width(self, s):
        """Sum of ``char_width`` over every code decoded from *s*."""
        return sum(self.char_width(cid) for cid in self.decode(s))


class PDFSimpleFont(PDFFont):
    """Single-byte font whose codes map to text through an encoding
    table and, when present, a ``ToUnicode`` stream."""

    def __init__(self, descriptor, widths, spec):
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        else:
            encoding = LITERAL_STANDARD_ENCODING
        if isinstance(encoding, dict):
            name = literal_name(encoding.get('BaseEncoding',
                                             LITERAL_STANDARD_ENCODING))
            diff = list_value(encoding.get('Differences', []))
            self.cid2unicode = EncodingDB.get_encoding(name, diff)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid):
        """Return the text for *cid*.

        The ``ToUnicode`` map is consulted first (when there is one),
        then the encoding table.

        :raises PDFUnicodeNotDefined: if neither source knows *cid*.
        """
        if self.unicode_map:
            try:
                return self.unicode_map.get_unichr(cid)
            except KeyError:
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)


class PDFType1Font(PDFSimpleFont):

    def __init__(self, rsrcmgr, spec):
        """
        Metrics come from ``FontMetricsDB`` when ``BaseFont`` is known
        there; otherwise from the spec's ``FontDescriptor``,
        ``FirstChar`` and ``Widths`` entries.

        :raises PDFFontError: if ``BaseFont`` is missing and
            ``settings.STRICT`` is set.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        try:
            (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            descriptor = dict_value(spec.get('FontDescriptor', {}))
            firstchar = int_value(spec.get('FirstChar', 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            widths = list_value(spec.get('Widths', [0]*256))
            widths = {i+firstchar: w for (i, w) in enumerate(widths)}
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        if 'Encoding' not in spec and 'FontFile' in descriptor:
            # try to recover the missing encoding info from the font file.
            self.fontfile = stream_value(descriptor.get('FontFile'))
            length1 = int_value(self.fontfile['Length1'])
            data = self.fontfile.get_data()[:length1]
            parser = Type1FontHeaderParser(BytesIO(data))
            self.cid2unicode = parser.get_encoding()

    def __repr__(self):
        return '<PDFType1Font: basefont=%r>' % self.basefont


class PDFTrueTypeFont(PDFType1Font):

    def __repr__(self):
        return '<PDFTrueTypeFont: basefont=%r>' % self.basefont


class PDFType3Font(PDFSimpleFont):

    def __init__(self, rsrcmgr, spec):
        """
        Ascent/descent are taken from the font bounding box and the
        horizontal/vertical scales from ``FontMatrix``.
        """
        firstchar = int_value(spec.get('FirstChar', 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        widths = list_value(spec.get('Widths', [0]*256))
        widths = {i+firstchar: w for (i, w) in enumerate(widths)}
        if 'FontDescriptor' in spec:
            descriptor = dict_value(spec['FontDescriptor'])
        else:
            descriptor = {'Ascent': 0, 'Descent': 0,
                          'FontBBox': spec['FontBBox']}
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        self.matrix = tuple(list_value(spec.get('FontMatrix')))
        (_, self.descent, _, self.ascent) = self.bbox
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self):
        return '<PDFType3Font>'


class PDFCIDFont(PDFFont):
    """Multi-byte (CID-keyed) font."""

    def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
        """
        :param rsrcmgr: resource manager (not used by this constructor).
        :param spec: font dictionary.
        :param strict: when true, missing ``BaseFont``/``FontDescriptor``
            and unknown cmaps raise ``PDFFontError`` instead of falling
            back to defaults.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if strict:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        cid_registry = resolve1(
            self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1")
        cid_ordering = resolve1(
            self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")
        self.cidcoding = '{}-{}'.format(cid_registry, cid_ordering)
        self.cmap = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if strict:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               BytesIO(self.fontfile.get_data()))
        # Unicode map priority: explicit ToUnicode stream, then the
        # embedded TrueType cmap (identity/UCS codings only), then the
        # predefined map for the CID coding.
        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
        elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding, self.cmap.is_vertical())
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths = get_widths2(list_value(spec.get('W2', [])))
            self.disps = {cid: (vx, vy)
                          for (cid, (_, (vx, vy))) in widths.items()}
            (vy, w) = spec.get('DW2', [880, -1000])
            self.default_disp = (None, vy)
            widths = {cid: w for (cid, (w, _)) in widths.items()}
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get('W', [])))
            default_width = spec.get('DW', 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec, strict):
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        :returns: the cmap from ``CMapDB``, or an empty ``CMap`` when it
            is not found and *strict* is false.
        :raises PDFFontError: when the cmap is not found and *strict* is
            true.
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            return CMap()

    @staticmethod
    def _get_cmap_name(spec, strict):
        """Get cmap name from font specification

        Returns ``'unknown'`` when no encoding is given and *strict* is
        false.  Names listed in ``IDENTITY_ENCODER`` are translated.
        """
        cmap_name = 'unknown'  # default value

        try:
            spec_encoding = spec['Encoding']
            if hasattr(spec_encoding, 'name'):
                cmap_name = literal_name(spec['Encoding'])
            else:
                cmap_name = literal_name(spec_encoding['CMapName'])
        except KeyError:
            if strict:
                raise PDFFontError('Encoding is unspecified')

        if isinstance(cmap_name, PDFStream):
            if 'CMapName' in cmap_name:
                cmap_name = cmap_name.get('CMapName').name
            else:
                if strict:
                    raise PDFFontError('CMapName unspecified for encoding')

        cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
        return cmap_name

    def __repr__(self):
        return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\
            .format(self.basefont, self.cidcoding)

    def is_vertical(self):
        return self.vertical

    def is_multibyte(self):
        return True

    def decode(self, bytes):
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        "Returns an integer for horizontal fonts, a tuple for vertical fonts."
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid):
        """Return the text for *cid*.

        :raises PDFUnicodeNotDefined: if there is no unicode map or it
            has no entry for *cid*.
        """
        try:
            if not self.unicode_map:
                raise KeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)


def main(argv):
    """Parse each file named in ``argv[1:]`` as a CFF font and print it."""
    for fname in argv[1:]:
        # ``with`` guarantees the file is closed even if parsing fails.
        with open(fname, 'rb') as fp:
            font = CFFFont(fname, fp)
            print(font)
    return


if __name__ == '__main__':
    sys.exit(main(sys.argv))