1#########################################################################
2#                                                                       #
3#                                                                       #
4#   copyright 2002 Paul Henry Tremblay                                  #
5#                                                                       #
6#   This program is distributed in the hope that it will be useful,     #
7#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
8#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
9#   General Public License for more details.                            #
10#                                                                       #
11#                                                                       #
12#########################################################################
13import sys, os, io
14
15from calibre.ebooks.rtf2xml import get_char_map, copy
16from calibre.ebooks.rtf2xml.char_set import char_set
17from calibre.ptempfile import better_mktemp
18
19from . import open_for_read, open_for_write
20
21
22class Hex2Utf8:
23    """
24    Convert Microsoft hexadecimal numbers to utf-8
25    """
26
27    def __init__(self,
28            in_file,
29            area_to_convert,
30            char_file,
31            default_char_map,
32            bug_handler,
33            invalid_rtf_handler,
34            copy=None,
35            temp_dir=None,
36            symbol=None,
37            wingdings=None,
38            caps=None,
39            convert_caps=None,
40            dingbats=None,
41            run_level=1,
42            ):
43        """
44        Required:
45            'file'
46            'area_to_convert'--the area of file to convert
47            'char_file'--the file containing the character mappings
48            'default_char_map'--name of default character map
49        Optional:
50            'copy'-- whether to make a copy of result for debugging
51            'temp_dir' --where to output temporary results (default is
52            directory from which the script is run.)
53            'symbol'--whether to load the symbol character map
54            'winddings'--whether to load the wingdings character map
55            'caps'--whether to load the caps character map
56            'convert_to_caps'--wether to convert caps to utf-8
57        Returns:
58            nothing
59        """
60        self.__file = in_file
61        self.__copy = copy
62        if area_to_convert not in ('preamble', 'body'):
63            msg = (
64            'Developer error! Wrong flag.\n'
65            'in module "hex_2_utf8.py\n'
66            '"area_to_convert" must be "body" or "preamble"\n'
67            )
68            raise self.__bug_handler(msg)
69        self.__char_file = char_file
70        self.__area_to_convert = area_to_convert
71        self.__default_char_map = default_char_map
72        self.__symbol = symbol
73        self.__wingdings = wingdings
74        self.__dingbats = dingbats
75        self.__caps = caps
76        self.__convert_caps = 0
77        self.__convert_symbol = 0
78        self.__convert_wingdings = 0
79        self.__convert_zapf = 0
80        self.__run_level = run_level
81        self.__write_to = better_mktemp()
82        self.__bug_handler = bug_handler
83        self.__invalid_rtf_handler = invalid_rtf_handler
84
85    def update_values(self,
86                        file,
87                        area_to_convert,
88                        char_file,
89                        convert_caps,
90                        convert_symbol,
91                        convert_wingdings,
92                        convert_zapf,
93                        copy=None,
94                        temp_dir=None,
95                        symbol=None,
96                        wingdings=None,
97                        caps=None,
98                        dingbats=None,
99                    ):
100        """
101        Required:
102            'file'
103            'area_to_convert'--the area of file to convert
104            'char_file'--the file containing the character mappings
105        Optional:
106            'copy'-- whether to make a copy of result for debugging
107            'temp_dir' --where to output temporary results (default is
108            directory from which the script is run.)
109            'symbol'--whether to load the symbol character map
110            'winddings'--whether to load the wingdings character map
111            'caps'--whether to load the caps character map
112            'convert_to_caps'--wether to convert caps to utf-8
113        Returns:
114            nothing
115            """
116        self.__file=file
117        self.__copy = copy
118        if area_to_convert not in ('preamble', 'body'):
119            msg = (
120            'in module "hex_2_utf8.py\n'
121            '"area_to_convert" must be "body" or "preamble"\n'
122            )
123            raise self.__bug_handler(msg)
124        self.__area_to_convert = area_to_convert
125        self.__symbol = symbol
126        self.__wingdings = wingdings
127        self.__dingbats = dingbats
128        self.__caps = caps
129        self.__convert_caps = convert_caps
130        self.__convert_symbol = convert_symbol
131        self.__convert_wingdings = convert_wingdings
132        self.__convert_zapf = convert_zapf
133        # new!
134        # no longer try to convert these
135        # self.__convert_symbol = 0
136        # self.__convert_wingdings = 0
137        # self.__convert_zapf = 0
138
139    def __initiate_values(self):
140        """
141        Required:
142            Nothing
143        Set values, including those for the dictionaries.
144        The file that contains the maps is broken down into many different
145        sets. For example, for the Symbol font, there is the standard part for
146        hexadecimal numbers, and the part for Microsoft characters. Read
147        each part in, and then combine them.
148        """
149        # the default encoding system, the lower map for characters 0 through
150        # 128, and the encoding system for Microsoft characters.
151        # New on 2004-05-8: the self.__char_map is not in directory with other
152        # modules
153        self.__char_file = io.StringIO(char_set)
154        char_map_obj =  get_char_map.GetCharMap(
155                char_file=self.__char_file,
156                bug_handler=self.__bug_handler,
157                )
158        up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map)
159        bt_128_dict = char_map_obj.get_char_map(map='bottom_128')
160        ms_standard_dict = char_map_obj.get_char_map(map='ms_standard')
161        self.__def_dict = {}
162        self.__def_dict.update(up_128_dict)
163        self.__def_dict.update(bt_128_dict)
164        self.__def_dict.update(ms_standard_dict)
165        self.__current_dict = self.__def_dict
166        self.__current_dict_name = 'default'
167        self.__in_caps = 0
168        self.__special_fonts_found = 0
169        if self.__symbol:
170            symbol_base_dict = char_map_obj.get_char_map(map='SYMBOL')
171            ms_symbol_dict = char_map_obj.get_char_map(map='ms_symbol')
172            self.__symbol_dict = {}
173            self.__symbol_dict.update(symbol_base_dict)
174            self.__symbol_dict.update(ms_symbol_dict)
175        if self.__wingdings:
176            wingdings_base_dict = char_map_obj.get_char_map(map='wingdings')
177            ms_wingdings_dict = char_map_obj.get_char_map(map='ms_wingdings')
178            self.__wingdings_dict = {}
179            self.__wingdings_dict.update(wingdings_base_dict)
180            self.__wingdings_dict.update(ms_wingdings_dict)
181        if self.__dingbats:
182            dingbats_base_dict = char_map_obj.get_char_map(map='dingbats')
183            ms_dingbats_dict = char_map_obj.get_char_map(map='ms_dingbats')
184            self.__dingbats_dict = {}
185            self.__dingbats_dict.update(dingbats_base_dict)
186            self.__dingbats_dict.update(ms_dingbats_dict)
187        # load dictionary for caps, and make a string for the replacement
188        self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
189        # # print self.__caps_uni_dict
190        # don't think I'll need this
191        # keys = self.__caps_uni_dict.keys()
192        # self.__caps_uni_replace = '|'.join(keys)
193        self.__preamble_state_dict = {
194            'preamble'      :       self.__preamble_func,
195            'body'          :       self.__body_func,
196            'mi<mk<body-open_'  :   self.__found_body_func,
197            'tx<hx<__________'  :   self.__hex_text_func,
198            }
199        self.__body_state_dict = {
200            'preamble'      :       self.__preamble_for_body_func,
201            'body'          :       self.__body_for_body_func,
202            }
203        self.__in_body_dict = {
204            'mi<mk<body-open_'  :   self.__found_body_func,
205            'tx<ut<__________'  :   self.__utf_to_caps_func,
206            'tx<hx<__________'  :   self.__hex_text_func,
207            'tx<mc<__________'  :   self.__hex_text_func,
208            'tx<nu<__________'  :   self.__text_func,
209            'mi<mk<font______'  :   self.__start_font_func,
210            'mi<mk<caps______'  :   self.__start_caps_func,
211            'mi<mk<font-end__'  :   self.__end_font_func,
212            'mi<mk<caps-end__'  :   self.__end_caps_func,
213        }
214        self.__caps_list = ['false']
215        self.__font_list = ['not-defined']
216
217    def __hex_text_func(self, line):
218        """
219        Required:
220            'line' -- the line
221        Logic:
222            get the hex_num and look it up in the default dictionary. If the
223            token is in the dictionary, then check if the value starts with a
224            "&". If it does, then tag the result as utf text. Otherwise, tag it
225            as normal text.
226            If the hex_num is not in the dictionary, then a mistake has been
227            made.
228            """
229        hex_num = line[17:-1]
230        converted = self.__current_dict.get(hex_num)
231        if converted is not None:
232            # tag as utf-8
233            if converted[0:1] == "&":
234                font = self.__current_dict_name
235                if self.__convert_caps\
236                and self.__caps_list[-1] == 'true'\
237                and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
238                    converted = self.__utf_token_to_caps_func(converted)
239                self.__write_obj.write(
240                'tx<ut<__________<%s\n' % converted
241                )
242            # tag as normal text
243            else:
244                font = self.__current_dict_name
245                if self.__convert_caps\
246                and self.__caps_list[-1] == 'true'\
247                and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
248                    converted = converted.upper()
249                self.__write_obj.write(
250                'tx<nu<__________<%s\n' % converted
251                )
252        # error
253        else:
254            token = hex_num.replace("'", '')
255            the_num = 0
256            if token:
257                the_num = int(token, 16)
258            if the_num > 10:
259                self.__write_obj.write('mi<tg<empty-att_<udef_symbol<num>%s<description>not-in-table\n' %
260                    hex_num)
261                if self.__run_level > 4:
262                    # msg = 'no dictionary entry for %s\n'
263                    # msg += 'the hexadecimal num is "%s"\n' % (hex_num)
264                    # msg += 'dictionary is %s\n' % self.__current_dict_name
265                    msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
266                    raise self.__bug_handler(msg)
267
268    def __found_body_func(self, line):
269        self.__state = 'body'
270        self.__write_obj.write(line)
271
272    def __body_func(self, line):
273        """
274        When parsing preamble
275        """
276        self.__write_obj.write(line)
277
278    def __preamble_func(self, line):
279        action = self.__preamble_state_dict.get(self.__token_info)
280        if action is not None:
281            action(line)
282        else:
283            self.__write_obj.write(line)
284
285    def __convert_preamble(self):
286        self.__state = 'preamble'
287        with open_for_write(self.__write_to) as self.__write_obj:
288            with open_for_read(self.__file) as read_obj:
289                for line in read_obj:
290                    self.__token_info = line[:16]
291                    action = self.__preamble_state_dict.get(self.__state)
292                    if action is None:
293                        sys.stderr.write('error no state found in hex_2_utf8',
294                        self.__state
295                        )
296                    action(line)
297        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
298        if self.__copy:
299            copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
300        copy_obj.rename(self.__write_to, self.__file)
301        os.remove(self.__write_to)
302
303    def __preamble_for_body_func(self, line):
304        """
305        Required:
306            line -- line to parse
307        Returns:
308            nothing
309        Logic:
310            Used when parsing the body.
311        """
312        if self.__token_info == 'mi<mk<body-open_':
313            self.__found_body_func(line)
314        self.__write_obj.write(line)
315
316    def __body_for_body_func(self, line):
317        """
318        Required:
319            line -- line to parse
320        Returns:
321            nothing
322        Logic:
323            Used when parsing the body.
324        """
325        action = self.__in_body_dict.get(self.__token_info)
326        if action is not None:
327            action(line)
328        else:
329            self.__write_obj.write(line)
330
331    def __start_font_func(self, line):
332        """
333        Required:
334            line -- line to parse
335        Returns:
336            nothing
337        Logic:
338            add font face to font_list
339        """
340        face = line[17:-1]
341        self.__font_list.append(face)
342        if face == 'Symbol' and self.__convert_symbol:
343            self.__current_dict_name = 'Symbol'
344            self.__current_dict = self.__symbol_dict
345        elif face == 'Wingdings' and self.__convert_wingdings:
346            self.__current_dict_name = 'Wingdings'
347            self.__current_dict = self.__wingdings_dict
348        elif face == 'Zapf Dingbats' and self.__convert_zapf:
349            self.__current_dict_name = 'Zapf Dingbats'
350            self.__current_dict = self.__dingbats_dict
351        else:
352            self.__current_dict_name = 'default'
353            self.__current_dict = self.__def_dict
354
355    def __end_font_func(self, line):
356        """
357        Required:
358            line -- line to parse
359        Returns:
360            nothing
361        Logic:
362            pop font_list
363        """
364        if len(self.__font_list) > 1:
365            self.__font_list.pop()
366        else:
367            sys.stderr.write('module is hex_2_utf8\n')
368            sys.stderr.write('method is end_font_func\n')
369            sys.stderr.write('self.__font_list should be greater than one?\n')
370        face = self.__font_list[-1]
371        if face == 'Symbol' and self.__convert_symbol:
372            self.__current_dict_name = 'Symbol'
373            self.__current_dict = self.__symbol_dict
374        elif face == 'Wingdings' and self.__convert_wingdings:
375            self.__current_dict_name = 'Wingdings'
376            self.__current_dict = self.__wingdings_dict
377        elif face == 'Zapf Dingbats' and self.__convert_zapf:
378            self.__current_dict_name = 'Zapf Dingbats'
379            self.__current_dict = self.__dingbats_dict
380        else:
381            self.__current_dict_name = 'default'
382            self.__current_dict = self.__def_dict
383
384    def __start_special_font_func_old(self, line):
385        """
386        Required:
387            line -- line
388        Returns;
389            nothing
390        Logic:
391            change the dictionary to use in conversion
392        """
393        # for error checking
394        if self.__token_info == 'mi<mk<font-symbo':
395            self.__current_dict.append(self.__symbol_dict)
396            self.__special_fonts_found += 1
397            self.__current_dict_name = 'Symbol'
398        elif self.__token_info == 'mi<mk<font-wingd':
399            self.__special_fonts_found += 1
400            self.__current_dict.append(self.__wingdings_dict)
401            self.__current_dict_name = 'Wingdings'
402        elif self.__token_info == 'mi<mk<font-dingb':
403            self.__current_dict.append(self.__dingbats_dict)
404            self.__special_fonts_found += 1
405            self.__current_dict_name = 'Zapf Dingbats'
406
407    def __end_special_font_func(self, line):
408        """
409        Required:
410            line --line to parse
411        Returns:
412            nothing
413        Logic:
414            pop the last dictionary, which should be a special font
415        """
416        if len(self.__current_dict) < 2:
417            sys.stderr.write('module is hex_2_utf 8\n')
418            sys.stderr.write('method is __end_special_font_func\n')
419            sys.stderr.write('less than two dictionaries --can\'t pop\n')
420            self.__special_fonts_found -= 1
421        else:
422            self.__current_dict.pop()
423            self.__special_fonts_found -= 1
424            self.__dict_name = 'default'
425
426    def __start_caps_func_old(self, line):
427        """
428        Required:
429            line -- line to parse
430        Returns:
431            nothing
432        Logic:
433            A marker that marks the start of caps has been found. Set
434            self.__in_caps to 1
435        """
436        self.__in_caps = 1
437
438    def __start_caps_func(self, line):
439        """
440        Required:
441            line -- line to parse
442        Returns:
443            nothing
444        Logic:
445            A marker that marks the start of caps has been found. Set
446            self.__in_caps to 1
447        """
448        self.__in_caps = 1
449        value = line[17:-1]
450        self.__caps_list.append(value)
451
452    def __end_caps_func(self, line):
453        """
454        Required:
455            line -- line to parse
456        Returns:
457            nothing
458        Logic:
459            A marker that marks the end of caps has been found.
460            set self.__in_caps to 0
461        """
462        if len(self.__caps_list) > 1:
463            self.__caps_list.pop()
464        else:
465            sys.stderr.write('Module is hex_2_utf8\n'
466            'method is __end_caps_func\n'
467            'caps list should be more than one?\n')  # self.__in_caps not set
468
469    def __text_func(self, line):
470        """
471        Required:
472            line -- line to parse
473        Returns:
474            nothing
475        Logic:
476            if in caps, convert. Otherwise, print out.
477        """
478        text = line[17:-1]
479        # print line
480        if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
481            the_string = ''
482            for letter in text:
483                hex_num = hex(ord(letter))
484                hex_num = str(hex_num)
485                hex_num = hex_num.upper()
486                hex_num = hex_num[2:]
487                hex_num = '\'%s' % hex_num
488                converted = self.__current_dict.get(hex_num)
489                if converted is None:
490                    sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n')
491                    sys.stderr.write('no hex value for "%s"\n' % hex_num)
492                else:
493                    the_string += converted
494            self.__write_obj.write('tx<nu<__________<%s\n' % the_string)
495            # print the_string
496        else:
497            if self.__caps_list[-1] == 'true' \
498                and self.__convert_caps\
499                and self.__current_dict_name not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
500                text = text.upper()
501            self.__write_obj.write('tx<nu<__________<%s\n' % text)
502
503    def __utf_to_caps_func(self, line):
504        """
505        Required:
506            line -- line to parse
507        returns
508            nothing
509        Logic
510            Get the text, and use another method to convert
511        """
512        utf_text = line[17:-1]
513        if self.__caps_list[-1] == 'true' and self.__convert_caps:
514            # utf_text = utf_text.upper()
515            utf_text = self.__utf_token_to_caps_func(utf_text)
516        self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)
517
518    def __utf_token_to_caps_func(self, char_entity):
519        """
520        Required:
521            utf_text -- such as &xxx;
522        Returns:
523            token converted to the capital equivalent
524        Logic:
525            RTF often stores text in the improper values. For example, a
526            capital umlaut o (?), is stores as ?. This function swaps the
527            case by looking up the value in a dictionary.
528        """
529        hex_num = char_entity[3:]
530        length = len(hex_num)
531        if length == 3:
532            hex_num = '00%s' % hex_num
533        elif length == 4:
534            hex_num = '0%s' % hex_num
535        new_char_entity = '&#x%s' % hex_num
536        converted = self.__caps_uni_dict.get(new_char_entity)
537        if not converted:
538            # bullets and other entities don't have capital equivalents
539            return char_entity
540        else:
541            return converted
542
543    def __convert_body(self):
544        self.__state = 'body'
545        with open_for_read(self.__file) as read_obj:
546            with open_for_write(self.__write_to) as self.__write_obj:
547                for line in read_obj:
548                    self.__token_info = line[:16]
549                    action = self.__body_state_dict.get(self.__state)
550                    if action is None:
551                        sys.stderr.write('error no state found in hex_2_utf8',
552                        self.__state
553                        )
554                    action(line)
555        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
556        if self.__copy:
557            copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
558        copy_obj.rename(self.__write_to, self.__file)
559        os.remove(self.__write_to)
560
561    def convert_hex_2_utf8(self):
562        self.__initiate_values()
563        if self.__area_to_convert == 'preamble':
564            self.__convert_preamble()
565        else:
566            self.__convert_body()
567
568
569"""
570how to swap case for non-capitals
571my_string.swapcase()
572An example of how to use a hash for the caps function
573(but I shouldn't need this, since utf text is separate
574 from regular text?)
575sub_dict = {
576    "&#x0430;"   : "some other value"
577    }
578def my_sub_func(matchobj):
579    info =  matchobj.group(0)
580    value = sub_dict.get(info)
581    return value
582    return "f"
583line = "&#x0430; more text"
584reg_exp = re.compile(r'(?P<name>&#x0430;|&#x0431;)')
585line2 = re.sub(reg_exp, my_sub_func, line)
586print line2
587"""
588