#!/usr/bin/env python3

import sys
import io
import struct
from .cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
from .encodingdb import EncodingDB, name2unicode
from .psparser import PSStackParser
from .psparser import PSEOF
from .psparser import LIT, KWD, handle_error
from .psparser import PSLiteral, literal_name
from .pdftypes import (PDFException, resolve1, int_value, num_value, list_value, dict_value,
                       stream_value)
from .fontmetrics import FONT_METRICS
from .utils import apply_matrix_norm, nunpack, choplist


def get_widths(seq):
    """Build a ``{code: width}`` dict from a flat width array.

    Two entry shapes are recognised while scanning *seq*:

    * ``c [w1 w2 ...]`` -- the integer immediately before a list is the
      first code; consecutive codes receive the listed widths.
    * ``c_first c_last w`` -- three integers assign *w* to every code in
      the inclusive range.

    Items that are neither lists nor ints are ignored, and a trailing
    incomplete run of integers is dropped.
    """
    widths = {}
    r = []
    for v in seq:
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for (i, w) in enumerate(v):
                    widths[char1+i] = w
                r = []
        elif isinstance(v, int):
            r.append(v)
            if len(r) == 3:
                (char1, char2, w) = r
                for i in range(char1, char2+1):
                    widths[i] = w
                r = []
    return widths
#assert get_widths([1]) == {}
#assert get_widths([1,2,3]) == {1:3, 2:3}
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}


def get_widths2(seq):
    """Build a ``{code: (w, (vx, vy))}`` dict from a flat metrics array.

    Works like :func:`get_widths`, but every entry carries three numbers:

    * ``c [w vx vy  w vx vy ...]`` -- the list is split into triples with
      ``choplist(3, ...)``; consecutive codes receive one triple each.
    * ``c_first c_last w vx vy`` -- five integers assign the same triple
      to every code in the inclusive range.
    """
    widths = {}
    r = []
    for v in seq:
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for (i, (w, vx, vy)) in enumerate(choplist(3, v)):
                    widths[char1+i] = (w, (vx, vy))
                r = []
        elif isinstance(v, int):
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                for i in range(char1, char2+1):
                    widths[i] = (w, (vx, vy))
                r = []
    return widths
#assert get_widths2([1]) == {}
#assert get_widths2([1,2,3,4,5]) == {1:(3,(4,5)), 2:(3,(4,5))}
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2,(3,4)), 6:(7,(8,9))}


class FontMetricsDB:
    """Lookup facade over the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(klass, fontname):
        """Return the ``FONT_METRICS`` entry for *fontname*.

        Raises KeyError when the font name is not in the table.
        """
        return FONT_METRICS[fontname]


class Type1FontHeaderParser(PSStackParser):
    """Collects ``code -> glyph name`` pairs from ``put`` operations."""

    KEYWORD_BEGIN = KWD('begin')
    KEYWORD_END = KWD('end')
    KEYWORD_DEF = KWD('def')
    KEYWORD_PUT = KWD('put')
    KEYWORD_DICT = KWD('dict')
    KEYWORD_ARRAY = KWD('array')
    KEYWORD_READONLY = KWD('readonly')
    KEYWORD_FOR = KWD('for')

    def __init__(self, data):
        PSStackParser.__init__(self, data)
        self._cid2unicode = {}

    def get_encoding(self):
        """Parse until PSEOF and return the ``{code: unicode}`` dict.

        Each object produced by the parser is a ``(code, name)`` pair;
        names for which ``name2unicode`` raises KeyError are skipped.
        """
        while True:
            try:
                (cid, name) = self.nextobject()
            except PSEOF:
                break
            try:
                self._cid2unicode[cid] = name2unicode(name)
            except KeyError:
                pass
        return self._cid2unicode

    def do_keyword(self, pos, token):
        """On ``put``, emit ``(key, name)`` if the operands are int/literal."""
        if token is self.KEYWORD_PUT:
            ((_, key), (_, value)) = self.pop(2)
            if (isinstance(key, int) and
                isinstance(value, PSLiteral)):
                self.add_results((key, literal_name(value)))


## CFFFont
## (Format specified in Adobe Technical Note: #5176
##  "The Compact Font Format Specification")
##
NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
def getdict(data):
    """Decode DICT *data* (bytes) into ``{operator: [operands]}``.

    Bytes ``<= 21`` are treated as operators: the operands collected so
    far are stored under that byte value and the operand stack is reset.
    Every other byte starts an operand (nibble-encoded real for 30,
    otherwise an integer of one to five bytes).
    """
    d = {}
    fp = io.BytesIO(data)
    stack = []
    while True:
        c = fp.read(1)
        if not c:
            break
        b0 = ord(c)
        if b0 <= 21:
            d[b0] = stack
            stack = []
            continue
        if b0 == 30:
            # Real number: two nibbles per byte, nibble 15 terminates.
            s = ''
            loop = True
            while loop:
                b = ord(fp.read(1))
                for n in (b >> 4, b & 15):
                    if n == 15:
                        loop = False
                    else:
                        s += NIBBLES[n]
            value = float(s)
        elif 32 <= b0 and b0 <= 246:
            value = b0-139
        else:
            b1 = ord(fp.read(1))
            if 247 <= b0 and b0 <= 250:
                value = ((b0-247)<<8)+b1+108
            elif 251 <= b0 and b0 <= 254:
                value = -((b0-251)<<8)-b1-108
            else:
                b2 = ord(fp.read(1))
                # The leading byte carries the sign.
                if 128 <= b1:
                    b1 -= 256
                if b0 == 28:
                    value = b1<<8 | b2
                else:
                    value = b1<<24 | b2<<16 | struct.unpack('>H', fp.read(2))[0]
        stack.append(value)
    return d


class CFFFont:
    """Reads the header, INDEXes, encoding and charset of a CFF font."""

    STANDARD_STRINGS = (
      '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
      'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
      'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
      'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
      'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
      'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
      'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
      'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
      'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
      'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
      'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
      'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
      'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
      'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
      'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
      'quotesinglbase', 'quotedblbase', 'quotedblright',
      'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
      'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
      'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
      'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
      'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
      'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
      'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
      'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
      'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
      'multiply', 'threesuperior', 'copyright', 'Aacute',
      'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
      'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
      'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
      'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
      'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
      'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
      'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
      'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
      'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
      'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
      'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
      'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
      'dollarsuperior', 'ampersandsmall', 'Acutesmall',
      'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
      'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
      'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
      'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
      'commasuperior', 'threequartersemdash', 'periodsuperior',
      'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
      'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
      'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
      'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
      'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
      'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
      'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
      'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
      'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
      'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
      'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
      'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
      'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
      'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
      'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
      'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
      'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
      'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
      'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
      'seveninferior', 'eightinferior', 'nineinferior',
      'centinferior', 'dollarinferior', 'periodinferior',
      'commainferior', 'Agravesmall', 'Aacutesmall',
      'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
      'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
      'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
      'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
      'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
      'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
      'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
      'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
      'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
      '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
      'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
    )

    class INDEX:
        """Lazy, sequence-like view over one CFF INDEX structure.

        Construction reads the offset array from *fp* and leaves the
        file positioned just past the INDEX data; items are read from
        *fp* on demand.
        """

        def __init__(self, fp):
            self.fp = fp
            self.offsets = []
            (count,) = struct.unpack('>H', self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the 2-byte count only: there
                # is no offSize byte and no offset array to consume.
                self.offsets.append(1)
                self.base = self.fp.tell()-1
                return
            (offsize,) = struct.unpack('B', self.fp.read(1))
            for i in range(count+1):
                self.offsets.append(nunpack(self.fp.read(offsize)))
            # Offsets are applied relative to the byte preceding the data.
            self.base = self.fp.tell()-1
            self.fp.seek(self.base+self.offsets[-1])

        def __repr__(self):
            return '<INDEX: size=%d>' % len(self)

        def __len__(self):
            return len(self.offsets)-1

        def __getitem__(self, i):
            self.fp.seek(self.base+self.offsets[i])
            return self.fp.read(self.offsets[i+1]-self.offsets[i])

        def __iter__(self):
            return iter( self[i] for i in range(len(self)) )

    def __init__(self, name, fp):
        """Parse the CFF data readable from the seekable binary file *fp*.

        Populates ``code2gid``/``gid2code`` from the encoding table and
        ``name2gid``/``gid2name`` from the charset table.

        Raises ValueError for an encoding or charset format that is not
        handled here.
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4))
        self.fp.read(hdrsize-4)
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(charstring_pos)
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        self.code2gid = {}
        self.gid2code = {}
        self.fp.seek(encoding_pos)
        fmt = self.fp.read(1)
        if fmt == b'\x00':
            # Format 0
            (n,) = struct.unpack('B', self.fp.read(1))
            for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif fmt == b'\x01':
            # Format 1
            (n,) = struct.unpack('B', self.fp.read(1))
            code = 0
            for i in range(n):
                (first, nleft) = struct.unpack('BB', self.fp.read(2))
                for gid in range(first, first+nleft+1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    code += 1
        else:
            raise ValueError('unsupported encoding format: %r' % fmt)
        # Charsets
        self.name2gid = {}
        self.gid2name = {}
        self.fp.seek(charset_pos)
        # fp.read() yields bytes, so the format tag must be compared
        # against bytes literals; a str literal never matches.
        fmt = self.fp.read(1)
        if fmt == b'\x00':
            # Format 0
            n = self.nglyphs-1
            for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
                gid += 1
                name = self.getstr(sid)
                self.name2gid[name] = gid
                self.gid2name[gid] = name
        elif fmt == b'\x01':
            # Format 1
            # NOTE(review): the record layout read here should be checked
            # against the charset format 1 definition in TN #5176.
            (n,) = struct.unpack('B', self.fp.read(1))
            sid = 0
            for i in range(n):
                (first, nleft) = struct.unpack('BB', self.fp.read(2))
                for gid in range(first, first+nleft+1):
                    name = self.getstr(sid)
                    self.name2gid[name] = gid
                    self.gid2name[gid] = name
                    sid += 1
        else:
            # Format 2 and anything unknown are not implemented.
            raise ValueError('unsupported charset format: %r' % fmt)

    def getstr(self, sid):
        """Return the string for *sid*.

        SIDs below ``len(STANDARD_STRINGS)`` come from the built-in
        table; larger ones are read from the font's String INDEX.
        """
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid-len(self.STANDARD_STRINGS)]


class TrueTypeFont:
    """Reads the table directory and 'cmap' table of a TrueType font."""

    class CMapNotFound(Exception): pass

    def __init__(self, name, fp):
        """Read the table directory from the seekable binary file *fp*.

        ``self.tables`` maps each 4-byte table tag (bytes) to an
        ``(offset, length)`` pair.
        """
        self.name = name
        self.fp = fp
        self.tables = {}
        self.fonttype = fp.read(4)
        (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
        for _ in range(ntables):
            (tag, tsum, offset, length) = struct.unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)

    def create_unicode_map(self):
        """Build a FileUnicodeMap from the font's 'cmap' table.

        Subtable formats 0, 2 and 4 are decoded; subtables in any other
        format are skipped.  Every ``char -> gid`` pair found is added
        with ``add_cid2unichr(gid, char)``.

        Raises TrueTypeFont.CMapNotFound when the font has no 'cmap'
        table.
        """
        # Table tags are stored as bytes (struct '4s'), so look up bytes.
        if b'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables[b'cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = struct.unpack('>HH', fp.read(4))
        subtables = []
        for i in range(nsubtables):
            subtables.append(struct.unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(struct.unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = struct.unpack('>256H', fp.read(512))
                firstbytes = [0]*8192
                # Keys are byte offsets into the 8-byte subheader array.
                for (i, k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
                    # offset is relative to its own field (the last two
                    # bytes of the subheader just read).
                    hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
                for (i, firstcode, entcount, delta, pos) in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        (gid,) = struct.unpack('>H', fp.read(2))
                        if gid:
                            # Delta arithmetic is modulo 65536.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)
                scs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                for (seg, (ec, sc, idd, idr)) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # idRangeOffset counts from the location of this
                        # segment's own idRangeOffset entry.
                        fp.seek(pos + 2*seg + idr)
                        for c in range(sc, ec+1):
                            (gid,) = struct.unpack('>H', fp.read(2))
                            if gid:
                                # 0 is the missing glyph and stays 0.
                                gid = (gid + idd) & 0xffff
                            char2gid[c] = gid
                    else:
                        for c in range(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
            else:
                # Unsupported subtable format: ignore it rather than
                # abort, so other subtables can still be used.
                continue
        # create unicode map
        unicode_map = FileUnicodeMap()
        for (char, gid) in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map


## Fonts
##

class PDFFontError(PDFException): pass
class PDFUnicodeNotDefined(PDFFontError): pass

LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
LITERAL_TYPE1C = LIT('Type1C')


class PDFFont:
    """Base class holding descriptor metrics and per-code widths."""

    def __init__(self, descriptor, widths, default_width=None):
        """Initialise metrics from the *descriptor* dict.

        *widths* maps character codes to widths.  When *default_width*
        is falsy, the descriptor's ``MissingWidth`` (default 0) is used.
        """
        self.descriptor = descriptor
        self.widths = widths
        self.fontname = resolve1(descriptor.get('FontName', 'unknown'))
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.flags = int_value(descriptor.get('Flags', 0))
        self.ascent = num_value(descriptor.get('Ascent', 0))
        self.descent = num_value(descriptor.get('Descent', 0))
        self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
        self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
        self.leading = num_value(descriptor.get('Leading', 0))
        self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
        self.hscale = self.vscale = .001

    def __repr__(self):
        return '<PDFFont>'

    def is_vertical(self):
        return False

    def is_multibyte(self):
        return False

    def decode(self, s):
        """Return the character codes of *s* (str -> list of ordinals)."""
        if isinstance(s, str):
            return list(map(ord, s))
        else: # it's already bytes
            return s

    def get_ascent(self):
        return self.ascent * self.vscale
    def get_descent(self):
        return self.descent * self.vscale

    def get_width(self):
        """Return the scaled bbox width, or ``-default_width`` scaled if 0."""
        w = self.bbox[2]-self.bbox[0]
        if w == 0:
            w = -self.default_width
        return w * self.hscale
    def get_height(self):
        """Return the scaled bbox height, or ascent-descent scaled if 0."""
        h = self.bbox[3]-self.bbox[1]
        if h == 0:
            h = self.ascent - self.descent
        return h * self.vscale

    def char_width(self, cid):
        """Return the scaled width of *cid*, using the default if unknown."""
        return self.widths.get(cid, self.default_width) * self.hscale

    def char_disp(self, cid):
        return 0

    def string_width(self, s):
        """Return the sum of the scaled widths of all codes in *s*."""
        return sum( self.char_width(cid) for cid in self.decode(s) )


class PDFSimpleFont(PDFFont):
    """Font whose codes map to Unicode via an encoding or ToUnicode map."""

    def __init__(self, descriptor, widths, spec):
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        else:
            encoding = LITERAL_STANDARD_ENCODING
        if isinstance(encoding, dict):
            name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
            diff = list_value(encoding.get('Differences', None))
            self.cid2unicode = EncodingDB.get_encoding(name, diff)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, io.BytesIO(strm.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid):
        """Return the Unicode value for *cid*.

        The ToUnicode map (if any) is tried first, then the encoding
        table.  Raises PDFUnicodeNotDefined when neither knows *cid*.
        """
        if self.unicode_map:
            try:
                return self.unicode_map.get_unichr(cid)
            except KeyError:
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)

class PDFType1Font(PDFSimpleFont):

    def __init__(self, rsrcmgr, spec):
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            handle_error(PDFFontError, 'BaseFont is missing')
            self.basefont = 'unknown'
        try:
            (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # Not a font with built-in metrics: read them from the spec.
            descriptor = dict_value(spec.get('FontDescriptor', {}))
            firstchar = int_value(spec.get('FirstChar', 0))
            lastchar = int_value(spec.get('LastChar', 255))
            widths = list_value(spec.get('Widths', [0]*256))
            widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        if 'Encoding' not in spec and 'FontFile' in descriptor:
            # try to recover the missing encoding info from the font file.
            self.fontfile = stream_value(descriptor.get('FontFile'))
            length1 = int_value(self.fontfile['Length1'])
            data = self.fontfile.get_data()[:length1]
            parser = Type1FontHeaderParser(io.BytesIO(data))
            self.cid2unicode = parser.get_encoding()

    def __repr__(self):
        return '<PDFType1Font: basefont=%r>' % self.basefont

class PDFTrueTypeFont(PDFType1Font):

    def __repr__(self):
        return '<PDFTrueTypeFont: basefont=%r>' % self.basefont

class PDFType3Font(PDFSimpleFont):

    def __init__(self, rsrcmgr, spec):
        firstchar = int_value(spec.get('FirstChar', 0))
        lastchar = int_value(spec.get('LastChar', 0))
        widths = list_value(spec.get('Widths', [0]*256))
        widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
        if 'FontDescriptor' in spec:
            descriptor = dict_value(spec['FontDescriptor'])
        else:
            descriptor = {'Ascent':0, 'Descent':0,
                          'FontBBox':spec['FontBBox']}
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        self.matrix = tuple(list_value(spec.get('FontMatrix')))
        # Ascent/descent and the scale factors are taken from the bbox
        # and the font matrix, replacing the base-class values.
        (_,self.descent,_,self.ascent) = self.bbox
        (self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))

    def __repr__(self):
        return '<PDFType3Font>'


class PDFCIDFont(PDFFont):
    """Multibyte font decoded through a CMap."""

    def __init__(self, rsrcmgr, spec):
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            handle_error(PDFFontError, 'BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
                                    self.cidsysteminfo.get('Ordering', 'unknown'))
        try:
            name = literal_name(spec['Encoding'])
        except KeyError:
            handle_error(PDFFontError, 'Encoding is unspecified')
            name = 'unknown'
        try:
            self.cmap = CMapDB.get_cmap(name)
        except CMapDB.CMapNotFound as e:
            handle_error(PDFFontError, str(e))
            self.cmap = CMap()
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            handle_error(PDFFontError, 'FontDescriptor is missing')
            descriptor = {}
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               io.BytesIO(self.fontfile.get_data()))
        # Unicode map priority: explicit ToUnicode stream, then the
        # embedded TrueType cmap (Adobe-Identity only), then CMapDB.
        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, io.BytesIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths = get_widths2(list_value(spec.get('W2', [])))
            self.disps = dict( (cid,(vx,vy)) for (cid,(_,(vx,vy))) in widths.items() )
            (vy,w) = spec.get('DW2', [880, -1000])
            self.default_disp = (None,vy)
            widths = dict( (cid,w) for (cid,(w,_)) in widths.items() )
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get('W', [])))
            default_width = spec.get('DW', 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def __repr__(self):
        return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)

    def is_vertical(self):
        return self.vertical

    def is_multibyte(self):
        return True

    def decode(self, bytes):
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        "Returns an integer for horizontal fonts, a tuple for vertical fonts."
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid):
        """Return the Unicode value for *cid*.

        Raises PDFUnicodeNotDefined when there is no unicode map or the
        map has no entry for *cid*.
        """
        try:
            if not self.unicode_map:
                raise KeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)


def main(argv):
    """Parse each file named in ``argv[1:]`` as a CFF font and print it."""
    for fname in argv[1:]:
        # The context manager closes the file even if parsing raises.
        with io.open(fname, 'rb') as fp:
            # Use TrueTypeFont(fname, fp) here to inspect a TrueType file.
            font = CFFFont(fname, fp)
            print(font)

if __name__ == '__main__':
    sys.exit(main(sys.argv))