#Copyright ReportLab Europe Ltd. 2000-2017
#see license.txt for license details
#history https://hg.reportlab.com/hg-public/reportlab/log/tip/src/reportlab/pdfbase/cidfonts.py
#$Header $
__version__='3.3.0'
__doc__="""CID (Asian multi-byte) font support.

This defines classes to represent CID fonts. They know how to calculate
their own width and how to write themselves into PDF files."""

import os
import marshal
import time
try:
    from hashlib import md5
except ImportError:
    from md5 import md5

import reportlab
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase._cidfontdata import allowedTypeFaces, allowedEncodings, CIDFontInfo, \
    defaultUnicodeEncodings, widthsByUnichar
from reportlab.pdfgen.canvas import Canvas
from reportlab.pdfbase import pdfdoc
from reportlab.lib.rl_accel import escapePDF
from reportlab.rl_config import CMapSearchPath
from reportlab.lib.utils import isSeq, isBytes

#quick hackery for 2.0 release.  Now we always do unicode, and have built in
#the CMAP data, any code to load CMap files is not needed.
DISABLE_CMAP = True


def findCMapFile(name):
    """Return the full filename of the CMap file called 'name'.

    Each directory in CMapSearchPath is tried in order and the first
    existing file wins.  Raises IOError when no directory contains it.
    """
    for dirname in CMapSearchPath:
        cmapfile = os.path.join(dirname, name)
        if os.path.isfile(cmapfile):
            return cmapfile
    raise IOError('CMAP file for encodings "%s" not found!' % name)


def structToPDF(structure):
    """Convert a deeply nested structure to PDFdoc dictionary/array objects.

    Dicts become pdfdoc.PDFDictionary, anything isSeq() accepts becomes
    pdfdoc.PDFArray (both converted recursively); every other value is
    returned unchanged.
    """
    if isinstance(structure, dict):
        return pdfdoc.PDFDictionary(
            {key: structToPDF(value) for key, value in structure.items()})
    if isSeq(structure):
        return pdfdoc.PDFArray([structToPDF(elem) for elem in structure])
    return structure


class CIDEncoding(pdfmetrics.Encoding):
    """Multi-byte encoding.  These are loaded from CMAP files.

    A CMAP file is like a mini-codec.  It defines the correspondence
    between code points in the (multi-byte) input data and Character
    IDs. """
    # aims to do similar things to Brian Hooper's CMap class,
    # but I could not get it working and had to rewrite.
    # also, we should really rearrange our current encoding
    # into a SingleByteEncoding since many of its methods
    # should not apply here.

    def __init__(self, name, useCache=1):
        """Create the encoding called 'name'.

        While DISABLE_CMAP is true no file is read and all the mapping
        tables stay empty.  Otherwise the CMap file is parsed; with
        useCache set, a marshalled '.fastmap' copy in the ReportLab temp
        directory is loaded when present and written when absent.
        """
        self.name = name
        self._mapFileHash = None
        self._codeSpaceRanges = []
        self._notDefRanges = []
        self._cmap = {}
        self.source = None
        if not DISABLE_CMAP:
            if useCache:
                from reportlab.lib.utils import get_rl_tempdir
                fontmapdir = get_rl_tempdir('FastCMAPS')
                # same path that fastLoad/fastSave build
                fastmap = os.path.join(fontmapdir, name + '.fastmap')
                if os.path.isfile(fastmap):
                    self.fastLoad(fontmapdir)
                    self.source = fastmap
                else:
                    self.parseCMAPFile(name)
                    self.source = 'CMAP: ' + name
                    self.fastSave(fontmapdir)
            else:
                self.parseCMAPFile(name)

    def _hash(self, text):
        """Return the MD5 digest of 'text'.

        Text that is not already bytes is encoded as UTF-8 first,
        because the hasher only accepts bytes on Python 3.
        """
        if not isBytes(text):
            text = text.encode('utf8')
        hasher = md5()
        hasher.update(text)
        return hasher.digest()

    def parseCMAPFile(self, name):
        """This is a tricky one as CMAP files are Postscript
        ones.  Some refer to others with a 'usecmap'
        command.

        Fills _codeSpaceRanges, _notDefRanges and _cmap from the file
        found by findCMapFile(name), and records the file's hash in
        _mapFileHash.  A CMap referenced through 'usecmap' is parsed
        first so that this file can override its settings.
        """
        cmapfile = findCMapFile(name)
        with open(cmapfile, 'r') as f:
            rawdata = f.read()

        self._mapFileHash = self._hash(rawdata)
        #if it contains the token 'usecmap', parse the other
        #cmap file first....
        usecmap_pos = rawdata.find('usecmap')
        if usecmap_pos > -1:
            #they tell us to look in another file
            #for the code space ranges. The one
            # to use will be the previous word.
            otherCMAPName = rawdata[0:usecmap_pos].split()[-1]
            self.parseCMAPFile(otherCMAPName)
            # now continue parsing this, as it may
            # override some settings

        # Walk the token list with an index; re-slicing the list for
        # every token made this quadratic on the big unicode CMaps.
        words = rawdata.split()
        total = len(words)
        pos = 0
        while pos < total:
            word = words[pos]
            pos += 1
            if word == 'begincodespacerange':
                while words[pos] != 'endcodespacerange':
                    strStart, strEnd = words[pos], words[pos + 1]
                    pos += 2
                    # [1:-1] strips the brackets around each hex token
                    start = int(strStart[1:-1], 16)
                    end = int(strEnd[1:-1], 16)
                    self._codeSpaceRanges.append((start, end))
            elif word == 'beginnotdefrange':
                while words[pos] != 'endnotdefrange':
                    strStart, strEnd, strValue = words[pos:pos + 3]
                    pos += 3
                    start = int(strStart[1:-1], 16)
                    end = int(strEnd[1:-1], 16)
                    value = int(strValue)
                    self._notDefRanges.append((start, end, value))
            elif word == 'begincidrange':
                while words[pos] != 'endcidrange':
                    strStart, strEnd, strValue = words[pos:pos + 3]
                    pos += 3
                    start = int(strStart[1:-1], 16)
                    end = int(strEnd[1:-1], 16)
                    value = int(strValue)
                    # this means that 'start' corresponds to 'value',
                    # start+1 corresponds to value+1 and so on up
                    # to end
                    for offset in range(end - start + 1):
                        self._cmap[start + offset] = value + offset

    def translate(self, text):
        """Convert a string into a list of CIDs.

        Each character code is looked up in the code space ranges.  A
        code found in no range is held back as the high byte of a
        two-byte code formed with the next character.  Codes inside a
        range but missing from the map get the value of the matching
        notdef range, or 0 when there is none.  'text' may be a str or,
        on Python 3, bytes.
        """
        output = []
        cmap = self._cmap
        pending = None      # high byte waiting for its partner, if any
        for char in text:
            # iterating Python 3 bytes yields ints already
            code = char if isinstance(char, int) else ord(char)
            if pending is None:
                num = code
            else:
                num = pending * 256 + code
            found = False
            for low, high in self._codeSpaceRanges:
                # range endpoints are themselves valid codes
                if low <= num <= high:
                    if num in cmap:
                        cid = cmap[num]
                    else:
                        #not defined.  Try to find the appropriate
                        # notdef character, or failing that return
                        # zero
                        cid = 0
                        for low2, high2, notdef in self._notDefRanges:
                            if low2 <= num <= high2:
                                cid = notdef
                                break
                    output.append(cid)
                    found = True
                    break
            pending = None if found else code
        return output

    def fastSave(self, directory):
        """Marshal the parsed tables to '<directory>/<name>.fastmap'."""
        with open(os.path.join(directory, self.name + '.fastmap'), 'wb') as f:
            marshal.dump(self._mapFileHash, f)
            marshal.dump(self._codeSpaceRanges, f)
            marshal.dump(self._notDefRanges, f)
            marshal.dump(self._cmap, f)

    def fastLoad(self, directory):
        """Load the tables written by fastSave from 'directory'.

        NOTE: marshal must only be given trusted data, so the cache
        directory has to be one that only trusted users can write to.
        """
        with open(os.path.join(directory, self.name + '.fastmap'), 'rb') as f:
            self._mapFileHash = marshal.load(f)
            self._codeSpaceRanges = marshal.load(f)
            self._notDefRanges = marshal.load(f)
            self._cmap = marshal.load(f)

    def getData(self):
        """Simple persistence helper.  Return a dict with all that matters."""
        return {
            'mapFileHash': self._mapFileHash,
            'codeSpaceRanges': self._codeSpaceRanges,
            'notDefRanges': self._notDefRanges,
            'cmap': self._cmap,
            }


class CIDTypeFace(pdfmetrics.TypeFace):
    """Multi-byte type face.

    Conceptually similar to a single byte typeface,
    but the glyphs are identified by a numeric Character
    ID (CID) and not a glyph name. """

    def __init__(self, name):
        """Initialised from one of the canned dictionaries in CIDFontInfo.

        Raises KeyError when 'name' has no entry there.
        """
        pdfmetrics.TypeFace.__init__(self, name)
        self._extractDictInfo(name)

    def _extractDictInfo(self, name):
        """Copy ascent, descent and width data out of CIDFontInfo[name]."""
        try:
            fontDict = CIDFontInfo[name]
        except KeyError:
            raise KeyError(
                "Unable to find information on CID typeface '%s'. "
                "Only the following font names work: %r"
                % (name, allowedTypeFaces))
        descFont = fontDict['DescendantFonts'][0]
        self.ascent = descFont['FontDescriptor']['Ascent']
        self.descent = descFont['FontDescriptor']['Descent']
        self._defaultWidth = descFont['DW']
        self._explicitWidths = self._expandWidths(descFont['W'])

        # should really support self.glyphWidths, self.glyphNames
        # but not done yet.

    def _expandWidths(self, compactWidthArray):
        """Expands Adobe nested list structure to get a dictionary of widths.

        Here is an example of such a structure.::

            (
            # starting at character ID 1, next n  characters have the widths given.
            1,  (277,305,500,668,668,906,727,305,445,445,508,668,305,379,305,539),
            # all Characters from ID 17 to 26 are 668 em units wide
            17, 26, 668,
            27, (305, 305, 668, 668, 668, 566, 871, 727, 637, 652, 699, 574, 555,
                 676, 687, 242, 492, 664, 582, 789, 707, 734, 582, 734, 605, 605,
                 641, 668, 727, 945, 609, 609, 574, 445, 668, 445, 668, 668, 590,
                 555, 609, 547, 602, 574, 391, 609, 582, 234, 277, 539, 234, 895,
                 582, 605, 602, 602, 387, 508, 441, 582, 562, 781, 531, 570, 555,
                 449, 246, 449, 668),
            # these must be half width katakana and the like.
            231, 632, 500
            )

        The result maps each character ID to its width.
        """
        data = compactWidthArray[:]
        widths = {}
        while data:
            start, data = data[0], data[1:]
            if isSeq(data[0]):
                # 'start' followed by a list of consecutive widths
                items, data = data[0], data[1:]
                for offset, width in enumerate(items):
                    widths[start + offset] = width
            else:
                # 'start end width': one width for an inclusive ID range
                end, width, data = data[0], data[1], data[2:]
                for idx in range(start, end + 1):
                    widths[idx] = width
        return widths

    def getCharWidth(self, characterId):
        """Return the width for 'characterId', or the default width."""
        return self._explicitWidths.get(characterId, self._defaultWidth)


class CIDFont(pdfmetrics.Font):
    "Represents a built-in multi-byte font"
    _multiByte = 1

    def __init__(self, face, encoding):
        """Build the font from a face name and an encoding name.

        Both names are checked (by assertion) against allowedTypeFaces
        and allowedEncodings.
        """
        assert face in allowedTypeFaces, "TypeFace '%s' not supported! Use any of these instead: %s" % (face, allowedTypeFaces)
        self.faceName = face
        #should cache in registry...
        self.face = CIDTypeFace(face)

        assert encoding in allowedEncodings, "Encoding '%s' not supported! Use any of these instead: %s" % (encoding, allowedEncodings)
        self.encodingName = encoding
        self.encoding = CIDEncoding(encoding)

        #legacy hack doing quick cut and paste.
        self.fontName = self.faceName + '-' + self.encodingName
        self.name = self.fontName

        # need to know if it is vertical or horizontal
        self.isVertical = (self.encodingName[-1] == 'V')

        #no substitutes initially
        self.substitutionFonts = []

    def formatForPdf(self, text):
        """Return 'text' passed through escapePDF."""
        encoded = escapePDF(text)
        return encoded

    def stringWidth(self, text, size, encoding=None):
        """This presumes non-Unicode input.  UnicodeCIDFont wraps it for that context"""
        cidlist = self.encoding.translate(text)
        if self.isVertical:
            #this part is "not checked!" but seems to work.
            #assume each is 1000 ems high
            return len(cidlist) * size
        w = sum(self.face.getCharWidth(cid) for cid in cidlist)
        return 0.001 * w * size

    def addObjects(self, doc):
        """Add this font's PDF objects to 'doc' and register it in the
        document's font mapping.

        The font dictionary is taken from CIDFontInfo, given a Name and
        an Encoding entry and converted with structToPDF.
        """
        internalName = 'F' + repr(len(doc.fontMapping)+1)

        # Work on a shallow copy: the shared CIDFontInfo entry must not
        # pick up the per-document Name and Encoding keys.
        bigDict = dict(CIDFontInfo[self.face.name])
        bigDict['Name'] = '/' + internalName
        bigDict['Encoding'] = '/' + self.encodingName

        #convert to PDF dictionary/array objects
        cidObj = structToPDF(bigDict)

        # link into document, and add to font map
        r = doc.Reference(cidObj, internalName)
        fontDict = doc.idToObject['BasicFonts'].dict
        fontDict[internalName] = r
        doc.fontMapping[self.name] = '/' + internalName


class UnicodeCIDFont(CIDFont):
    """Wraps up CIDFont to hide explicit encoding choice;
    encodes text for output as UTF16.

    lang should be one of 'jpn','chs','cht','kor' for now.
    if vertical is set, it will select a different widths array
    and possibly glyphs for some punctuation marks.

    halfWidth is only for Japanese.


    >>> dodgy = UnicodeCIDFont('nonexistent')
    Traceback (most recent call last):
    ...
    KeyError: "don't know anything about CID font nonexistent"
    >>> heisei = UnicodeCIDFont('HeiseiMin-W3')
    >>> heisei.name
    'HeiseiMin-W3'
    >>> heisei.language
    'jpn'
    >>> heisei.encoding.name
    'UniJIS-UCS2-H'
    >>> #This is how PDF data gets encoded.
    >>> print(heisei.formatForPdf('hello'))
    \\000h\\000e\\000l\\000l\\000o
    >>> tokyo = u'\u6771\u4AEC'
    >>> print(heisei.formatForPdf(tokyo))
    gqJ\\354
    >>> print(heisei.stringWidth(tokyo,10))
    20.0
    >>> print(heisei.stringWidth('hello world',10))
    45.83
    """

    def __init__(self, face, isVertical=False, isHalfWidth=False):
        """Pick the Unicode encoding for 'face' and build the font.

        Raises KeyError when 'face' is not in defaultUnicodeEncodings.
        """
        try:
            lang, defaultEncoding = defaultUnicodeEncodings[face]
        except KeyError:
            raise KeyError("don't know anything about CID font %s" % face)

        #we know the languages now.
        self.language = lang

        #rebuilt encoding string.  They follow rules which work
        #for the 7 fonts provided.
        enc = defaultEncoding[:-1]
        if isHalfWidth:
            enc = enc + 'HW-'
        enc = enc + ('V' if isVertical else 'H')

        #now we can do the more general case
        CIDFont.__init__(self, face, enc)
        #it's simpler for unicode, just use the face name
        self.name = self.fontName = face
        self.vertical = isVertical
        self.isHalfWidth = isHalfWidth

        self.unicodeWidths = widthsByUnichar[self.name]

    def formatForPdf(self, text):
        """Return 'text' encoded as UTF16 (big endian, no BOM) and
        passed through escapePDF.  Bytes input is decoded as UTF-8 first.
        """
        from codecs import utf_16_be_encode
        if isBytes(text):
            text = text.decode('utf8')
        utfText = utf_16_be_encode(text)[0]
        encoded = escapePDF(utfText)
        return encoded

    def stringWidth(self, text, size, encoding=None):
        "Just ensure we do width test on characters, not bytes..."
        if isBytes(text):
            text = text.decode('utf8')

        widths = self.unicodeWidths
        # characters without a recorded width count as 1000 units
        return size * 0.001 * sum(widths.get(uch, 1000) for uch in text)


def precalculate(cmapdir):
    """Crunch through every file in 'cmapdir', making 'fastmap' files.

    Existing '.fastmap' files, and files that already have one, are
    skipped.  A file that cannot be turned into a CIDEncoding is
    reported and skipped.
    """
    for filename in os.listdir(cmapdir):
        if filename.endswith('.fastmap'):
            # a cache file from an earlier run, not a CMap
            continue
        if os.path.isfile(os.path.join(cmapdir, filename + '.fastmap')):
            continue
        try:
            enc = CIDEncoding(filename)
        except Exception:
            print('cannot parse %s, skipping' % filename)
            continue
        enc.fastSave(cmapdir)
        print('saved %s.fastmap' % filename)


def test():
    """Write test_japanese.pdf and print some encoding diagnostics."""
    # only works if you have correct encodings on your box!
    c = Canvas('test_japanese.pdf')
    c.setFont('Helvetica', 30)
    c.drawString(100,700, 'Japanese Font Support')

    pdfmetrics.registerFont(CIDFont('HeiseiMin-W3','90ms-RKSJ-H'))
    pdfmetrics.registerFont(CIDFont('HeiseiKakuGo-W5','90ms-RKSJ-H'))


    # the two typefaces
    c.setFont('HeiseiMin-W3-90ms-RKSJ-H', 16)
    # this says "This is HeiseiMincho" in shift-JIS.  Not all our readers
    # have a Japanese PC, so I escaped it. On a Japanese-capable
    # system, print the string to see Kanji
    message1 = '\202\261\202\352\202\315\225\275\220\254\226\276\222\251\202\305\202\267\201B'
    c.drawString(100, 675, message1)
    c.save()
    print('saved test_japanese.pdf')


##    print 'CMAP_DIR = ', CMAP_DIR
##    tf1 = CIDTypeFace('HeiseiMin-W3')
##    print 'ascent = ',tf1.ascent
##    print 'descent = ',tf1.descent
##    for cid in [1,2,3,4,5,18,19,28,231,1742]:
##        print 'width of cid %d = %d' % (cid, tf1.getCharWidth(cid))

    encName = '90ms-RKSJ-H'
    enc = CIDEncoding(encName)
    print(message1, '->', enc.translate(message1))

    f = CIDFont('HeiseiMin-W3','90ms-RKSJ-H')
    print('width = %0.2f' % f.stringWidth(message1, 10))


    #testing all encodings
##    import time
##    started = time.time()
##    import glob
##    for encName in _cidfontdata.allowedEncodings:
##    #encName = '90ms-RKSJ-H'
##        enc = CIDEncoding(encName)
##        print 'encoding %s:' % encName
##        print '    codeSpaceRanges = %s' % enc._codeSpaceRanges
##        print '    notDefRanges = %s' % enc._notDefRanges
##        print '    mapping size = %d' % len(enc._cmap)
##    finished = time.time()
##    print 'constructed all encodings in %0.2f seconds' % (finished - started)

if __name__=='__main__':
    import doctest
    from reportlab.pdfbase import cidfonts
    doctest.testmod(cidfonts)
    #test()