#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   #
#   General Public License for more details.                            #
#                                                                       #
#                                                                       #
#########################################################################
import io
import os
import sys

from calibre.ebooks.rtf2xml import get_char_map, copy
from calibre.ebooks.rtf2xml.char_set import char_set
from calibre.ptempfile import better_mktemp

from . import open_for_read, open_for_write


class Hex2Utf8:
    """
    Convert Microsoft hexadecimal numbers to utf-8

    The input is a tokenized file in which every line starts with a 16
    character token (for example 'tx<hx<__________') followed by '<' and the
    payload.  Hexadecimal text tokens are looked up in a character map and
    rewritten as either utf text ('tx<ut<') or normal text ('tx<nu<').
    """

    # Font faces that have a dedicated character map.  Caps conversion is
    # never applied while one of these maps is the current one.
    __SPECIAL_FONTS = ('Symbol', 'Wingdings', 'Zapf Dingbats')

    def __init__(self,
            in_file,
            area_to_convert,
            char_file,
            default_char_map,
            bug_handler,
            invalid_rtf_handler,
            copy=None,
            temp_dir=None,
            symbol=None,
            wingdings=None,
            caps=None,
            convert_caps=None,
            dingbats=None,
            run_level=1,
            ):
        """
        Required:
            'in_file'--the tokenized file to convert (rewritten in place)
            'area_to_convert'--the area of file to convert; must be
            'preamble' or 'body'
            'char_file'--the file containing the character mappings
            'default_char_map'--name of default character map
            'bug_handler'--exception class raised for internal errors
            'invalid_rtf_handler'--exception class for invalid RTF (stored,
            not used in this module)
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --accepted for compatibility; not used here
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'dingbats'--whether to load the dingbats character map
            'caps'--stored; the caps map is always loaded
            'convert_caps'--accepted but ignored here: all convert flags
            start at 0 and are only set through update_values()
            'run_level'--above 4, an unknown hex value raises bug_handler
        Returns:
            nothing
        Raises:
            bug_handler, if 'area_to_convert' is not valid
        """
        # The handlers must be stored before anything can be raised with
        # them; the validation just below depends on self.__bug_handler.
        self.__bug_handler = bug_handler
        self.__invalid_rtf_handler = invalid_rtf_handler
        self.__file = in_file
        self.__copy = copy
        if area_to_convert not in ('preamble', 'body'):
            msg = (
                'Developer error! Wrong flag.\n'
                'in module "hex_2_utf8.py\n'
                '"area_to_convert" must be "body" or "preamble"\n'
            )
            raise self.__bug_handler(msg)
        self.__char_file = char_file
        self.__area_to_convert = area_to_convert
        self.__default_char_map = default_char_map
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        self.__convert_caps = 0
        self.__convert_symbol = 0
        self.__convert_wingdings = 0
        self.__convert_zapf = 0
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def update_values(self,
            file,
            area_to_convert,
            char_file,
            convert_caps,
            convert_symbol,
            convert_wingdings,
            convert_zapf,
            copy=None,
            temp_dir=None,
            symbol=None,
            wingdings=None,
            caps=None,
            dingbats=None,
            ):
        """
        Re-target an existing object (typically from preamble to body)
        without building a new one.

        Required:
            'file'--the tokenized file to convert
            'area_to_convert'--the area of file to convert; must be
            'preamble' or 'body'
            'char_file'--accepted for compatibility; not stored here
            'convert_caps'--whether to upper-case text inside caps markers
            'convert_symbol', 'convert_wingdings', 'convert_zapf'--whether
            to switch to the matching special character map when that font
            face is active
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --accepted for compatibility; not used here
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'dingbats'--whether to load the dingbats character map
            'caps'--stored; the caps map is always loaded
        Returns:
            nothing
        Raises:
            bug_handler, if 'area_to_convert' is not valid
        """
        self.__file = file
        self.__copy = copy
        if area_to_convert not in ('preamble', 'body'):
            msg = (
                'in module "hex_2_utf8.py\n'
                '"area_to_convert" must be "body" or "preamble"\n'
            )
            raise self.__bug_handler(msg)
        self.__area_to_convert = area_to_convert
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        self.__convert_caps = convert_caps
        self.__convert_symbol = convert_symbol
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf

    def __initiate_values(self):
        """
        Required:
            Nothing
        Logic:
            Set values, including those for the dictionaries.
            The file that contains the maps is broken down into many
            different sets. For example, for the Symbol font, there is the
            standard part for hexadecimal numbers, and the part for
            Microsoft characters. Read each part in, and then combine them.
        """
        # New on 2004-05-8: the character map no longer lives in a file next
        # to the other modules; it is read from the char_set string.
        self.__char_file = io.StringIO(char_set)
        char_map_obj = get_char_map.GetCharMap(
            char_file=self.__char_file,
            bug_handler=self.__bug_handler,
        )

        def merged(*map_names):
            # Later maps override earlier ones on key collisions.
            combined = {}
            for map_name in map_names:
                combined.update(char_map_obj.get_char_map(map=map_name))
            return combined

        # the default encoding system, the lower map for characters 0 through
        # 128, and the encoding system for Microsoft characters.
        self.__def_dict = merged(
            self.__default_char_map, 'bottom_128', 'ms_standard')
        self.__current_dict = self.__def_dict
        self.__current_dict_name = 'default'
        self.__in_caps = 0
        self.__special_fonts_found = 0
        if self.__symbol:
            self.__symbol_dict = merged('SYMBOL', 'ms_symbol')
        if self.__wingdings:
            self.__wingdings_dict = merged('wingdings', 'ms_wingdings')
        if self.__dingbats:
            self.__dingbats_dict = merged('dingbats', 'ms_dingbats')
        # dictionary mapping a character entity to its capital equivalent
        self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
        # Used while converting the preamble: keyed both by state (outer
        # dispatch) and by token (inner dispatch in __preamble_func).
        self.__preamble_state_dict = {
            'preamble'          : self.__preamble_func,
            'body'              : self.__body_func,
            'mi<mk<body-open_'  : self.__found_body_func,
            'tx<hx<__________'  : self.__hex_text_func,
        }
        # Used while converting the body: keyed by state.
        self.__body_state_dict = {
            'preamble'  : self.__preamble_for_body_func,
            'body'      : self.__body_for_body_func,
        }
        # Used while converting the body: keyed by token.
        self.__in_body_dict = {
            'mi<mk<body-open_'  : self.__found_body_func,
            'tx<ut<__________'  : self.__utf_to_caps_func,
            'tx<hx<__________'  : self.__hex_text_func,
            'tx<mc<__________'  : self.__hex_text_func,
            'tx<nu<__________'  : self.__text_func,
            'mi<mk<font______'  : self.__start_font_func,
            'mi<mk<caps______'  : self.__start_caps_func,
            'mi<mk<font-end__'  : self.__end_font_func,
            'mi<mk<caps-end__'  : self.__end_caps_func,
        }
        # Stacks with a sentinel bottom entry that is never popped.
        self.__caps_list = ['false']
        self.__font_list = ['not-defined']

    def __caps_apply_here(self):
        """
        Returns:
            true when caps conversion is enabled, the innermost caps marker
            is 'true', and the current map is not a special font map.
        """
        return (
            self.__convert_caps
            and self.__caps_list[-1] == 'true'
            and self.__current_dict_name not in self.__SPECIAL_FONTS
        )

    def __hex_text_func(self, line):
        """
        Required:
            'line' -- the line
        Logic:
            get the hex_num and look it up in the current dictionary. If the
            token is in the dictionary, then check if the value starts with a
            "&". If it does, then tag the result as utf text. Otherwise, tag
            it as normal text.
            If the hex_num is not in the dictionary, then a mistake has been
            made: write an 'udef_symbol' tag for values above 10 and, at
            run levels above 4, raise bug_handler.
        """
        hex_num = line[17:-1]
        converted = self.__current_dict.get(hex_num)
        if converted is not None:
            is_entity = converted[0:1] == "&"
            if self.__caps_apply_here():
                if is_entity:
                    converted = self.__utf_token_to_caps_func(converted)
                else:
                    converted = converted.upper()
            if is_entity:
                # tag as utf-8
                self.__write_obj.write('tx<ut<__________<%s\n' % converted)
            else:
                # tag as normal text
                self.__write_obj.write('tx<nu<__________<%s\n' % converted)
            return
        # error: no entry for this hexadecimal number
        token = hex_num.replace("'", '')
        the_num = 0
        if token:
            the_num = int(token, 16)
        if the_num > 10:
            self.__write_obj.write(
                'mi<tg<empty-att_<udef_symbol<num>%s<description>not-in-table\n'
                % hex_num)
        if self.__run_level > 4:
            msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
            raise self.__bug_handler(msg)

    def __found_body_func(self, line):
        """
        The body-open marker has been found: switch state and pass the
        marker line through.
        """
        self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        When parsing preamble: once the body is reached, copy lines
        unchanged.
        """
        self.__write_obj.write(line)

    def __preamble_func(self, line):
        """
        When parsing preamble: dispatch on the token; lines with no
        registered token are copied unchanged.
        """
        action = self.__preamble_state_dict.get(self.__token_info)
        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)

    def __run_conversion(self, start_state, state_dict, debug_copy_name):
        """
        Required:
            start_state -- the state to start in
            state_dict -- maps a state name to the function handling a line
            debug_copy_name -- name for the debugging copy of the result
        Returns:
            nothing
        Logic:
            Read the file line by line, dispatch each line on the current
            state and write the result to a temporary file. Then optionally
            keep a debugging copy, and replace the original with the result.
        Raises:
            bug_handler, if the current state has no handler
        """
        self.__state = start_state
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = state_dict.get(self.__state)
                    if action is None:
                        # Internal invariant broken; stderr.write() takes a
                        # single string, so format the state into it.
                        msg = ('error no state found in hex_2_utf8: %s\n'
                               % self.__state)
                        sys.stderr.write(msg)
                        raise self.__bug_handler(msg)
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, debug_copy_name)
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

    def __convert_preamble(self):
        """
        Convert the hexadecimal tokens found in the preamble.
        """
        self.__run_conversion(
            'preamble', self.__preamble_state_dict,
            "preamble_utf_convert.data")

    def __preamble_for_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Used when parsing the body, while still before the body-open
            marker. The marker switches the state; every line is written
            exactly once.
        """
        if self.__token_info == 'mi<mk<body-open_':
            # __found_body_func already writes the line
            self.__found_body_func(line)
        else:
            self.__write_obj.write(line)

    def __body_for_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Used when parsing the body. Dispatch on the token; lines with no
            registered token are copied unchanged.
        """
        action = self.__in_body_dict.get(self.__token_info)
        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)

    def __select_dict_for_face(self, face):
        """
        Required:
            face -- name of the font face now in effect
        Returns:
            nothing
        Logic:
            Make the character map for 'face' the current one, provided the
            conversion for that special font is turned on. Any other face
            uses the default map.
        """
        if face == 'Symbol' and self.__convert_symbol:
            self.__current_dict_name = 'Symbol'
            self.__current_dict = self.__symbol_dict
        elif face == 'Wingdings' and self.__convert_wingdings:
            self.__current_dict_name = 'Wingdings'
            self.__current_dict = self.__wingdings_dict
        elif face == 'Zapf Dingbats' and self.__convert_zapf:
            self.__current_dict_name = 'Zapf Dingbats'
            self.__current_dict = self.__dingbats_dict
        else:
            self.__current_dict_name = 'default'
            self.__current_dict = self.__def_dict

    def __start_font_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            add font face to font_list and select its character map
        """
        face = line[17:-1]
        self.__font_list.append(face)
        self.__select_dict_for_face(face)

    def __end_font_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            pop font_list and select the character map of the face that is
            now on top. The bottom entry is never popped.
        """
        if len(self.__font_list) > 1:
            self.__font_list.pop()
        else:
            sys.stderr.write('module is hex_2_utf8\n')
            sys.stderr.write('method is end_font_func\n')
            sys.stderr.write('self.__font_list should be greater than one?\n')
        self.__select_dict_for_face(self.__font_list[-1])

    def __start_special_font_func_old(self, line):
        """
        Required:
            line -- line
        Returns:
            nothing
        Logic:
            change the dictionary to use in conversion.
            Legacy method: not registered in any dispatch table.
        """
        # The current dictionary is a single dict, not a stack, so it is
        # replaced rather than appended to.
        if self.__token_info == 'mi<mk<font-symbo':
            self.__current_dict = self.__symbol_dict
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Symbol'
        elif self.__token_info == 'mi<mk<font-wingd':
            self.__current_dict = self.__wingdings_dict
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Wingdings'
        elif self.__token_info == 'mi<mk<font-dingb':
            self.__current_dict = self.__dingbats_dict
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Zapf Dingbats'

    def __end_special_font_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            leave the special font and go back to the default dictionary.
            Legacy method: not registered in any dispatch table.
        """
        if self.__current_dict_name == 'default':
            # nothing to leave; report it, as the original did
            sys.stderr.write('module is hex_2_utf 8\n')
            sys.stderr.write('method is __end_special_font_func\n')
            sys.stderr.write('less than two dictionaries --can\'t pop\n')
        else:
            self.__current_dict = self.__def_dict
            self.__current_dict_name = 'default'
        self.__special_fonts_found -= 1

    def __start_caps_func_old(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the start of caps has been found. Set
            self.__in_caps to 1.
            Legacy method: not registered in any dispatch table.
        """
        self.__in_caps = 1

    def __start_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the start of caps has been found. Set
            self.__in_caps to 1 and push the marker's value on caps_list.
        """
        self.__in_caps = 1
        value = line[17:-1]
        self.__caps_list.append(value)

    def __end_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the end of caps has been found. Pop
            caps_list; the bottom entry is never popped. self.__in_caps is
            left untouched (caps_list is what the conversion consults).
        """
        if len(self.__caps_list) > 1:
            self.__caps_list.pop()
        else:
            sys.stderr.write('Module is hex_2_utf8\n'
            'method is __end_caps_func\n'
            'caps list should be more than one?\n')

    def __text_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In a special font, translate every character through the current
            map (characters without an entry are reported and dropped).
            Otherwise, if in caps, convert to upper case. Then print out.
        """
        text = line[17:-1]
        if self.__current_dict_name in self.__SPECIAL_FONTS:
            pieces = []
            for letter in text:
                # map keys look like 'XX: apostrophe plus upper case hex
                hex_num = "'%X" % ord(letter)
                converted = self.__current_dict.get(hex_num)
                if converted is None:
                    sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n')
                    sys.stderr.write('no hex value for "%s"\n' % hex_num)
                else:
                    pieces.append(converted)
            self.__write_obj.write('tx<nu<__________<%s\n' % ''.join(pieces))
        else:
            if self.__caps_list[-1] == 'true' and self.__convert_caps:
                text = text.upper()
            self.__write_obj.write('tx<nu<__________<%s\n' % text)

    def __utf_to_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Get the text, and use another method to convert
        """
        utf_text = line[17:-1]
        if self.__caps_list[-1] == 'true' and self.__convert_caps:
            utf_text = self.__utf_token_to_caps_func(utf_text)
        self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)

    def __utf_token_to_caps_func(self, char_entity):
        """
        Required:
            char_entity -- such as &#xXXXX;
        Returns:
            token converted to the capital equivalent, or the token itself
            if it has none
        Logic:
            RTF often stores text in the improper case. For example, a
            capital O with umlaut (U+00D6) may be stored as the lower case
            letter (U+00F6). This function swaps the case by looking up the
            value in a dictionary.
        """
        # everything after '&#x', still including the trailing ';'
        hex_num = char_entity[3:]
        length = len(hex_num)
        # left-pad short values with zeros before the lookup
        if length == 3:
            hex_num = '00%s' % hex_num
        elif length == 4:
            hex_num = '0%s' % hex_num
        new_char_entity = '&#x%s' % hex_num
        converted = self.__caps_uni_dict.get(new_char_entity)
        if not converted:
            # bullets and other entities don't have capital equivalents
            return char_entity
        return converted

    def __convert_body(self):
        """
        Convert the hexadecimal tokens found in the body.
        """
        self.__run_conversion(
            'body', self.__body_state_dict, "body_utf_convert.data")

    def convert_hex_2_utf8(self):
        """
        Load the character maps and convert the configured area of the file
        in place.
        """
        self.__initiate_values()
        if self.__area_to_convert == 'preamble':
            self.__convert_preamble()
        else:
            self.__convert_body()


"""
how to swap case for non-capitals
my_string.swapcase()
An example of how to use a hash for the caps function
(but I shouldn't need this, since utf text is separate
 from regular text?)
sub_dict = {
    "а" : "some other value"
    }
def my_sub_func(matchobj):
    info = matchobj.group(0)
    value = sub_dict.get(info)
    return value
    return "f"
line = "а more text"
reg_exp = re.compile(r'(?P<name>а|б)')
line2 = re.sub(reg_exp, my_sub_func, line)
print line2
"""