#!/usr/bin/python3
# vim:fileencoding=utf-8:sw=4:et

# generate-chinese-variants
#
# Copyright (c) 2013-2018 Mike FABIAN <mfabian@redhat.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3.0 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from typing import Any
import re
import logging
import sys

# Unihan_Variants.txt contains the following 2 lines:
#
# U+50DE  kSimplifiedVariant      U+4F2A
# U+50DE  kTraditionalVariant     U+507D U+50DE
#
# This seems to be currently the only case in Unihan_Variants.txt where
# a character which has a kTraditionalVariant entry listing the character
# itself again among the traditional variants is *not* simplified
# Chinese.
#
# U+50DE 僞 is traditional Chinese.
# U+507D 偽 is also traditional Chinese.
# U+4F2A 伪 is simplified Chinese.
#
# This does not cause a problem with the current parsing code
# of Unihan_Variants.txt because the line
#
# U+50DE  kSimplifiedVariant      U+4F2A
#
# is read first and thus the character is already inserted into the
# “VARIANTS_TABLE_ORIG” dictionary as traditional Chinese, which is correct.
# If a character is already in the dictionary and more lines for the
# same character are read from Unihan_Variants.txt, these extra lines
# are ignored.
#
# But maybe for some corner cases more tweaking of the code is
# necessary. One can also add overrides manually to the
# initial content of “VARIANTS_TABLE_ORIG”.

VARIANTS_TABLE_ORIG = {
    # Meaning of the bits in the values:
    # 1 = 1 << 0       simplified Chinese
    # 2 = 1 << 1       traditional Chinese
    # 3 = (1 | 1 << 1) used both in simplified *and* traditional Chinese
    # 4 = 1 << 2       mixture of simplified and traditional Chinese
    #
    # Overrides can be added manually here. For example, the following
    # line marks the 〇 character as used in both
    # simplified *and* traditional Chinese:
    u'〇': 3 # simplified *and* traditional Chinese
    }

# For debugging: remember for each character the line from
# Unihan_Variants.txt which was used to classify it.
VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED = {}

def read_unihan_variants(unihan_variants_file) -> None:
    '''
    Read the Unihan_Variants.txt file downloaded from Unicode.org.
    '''
    for line in unihan_variants_file:
        line = line.strip()
        if not re.match('^#', line):
            if re.search('(kTraditionalVariant|kSimplifiedVariant)', line):
                match = re.match(r'^U\+([0-9A-F]{4,5})', line)
                if match:
                    char = chr(int(match.group(1), 16))
                    category = 0 # should never stay at this value
                    if re.match(re.escape(match.group(0))
                                + r'.*'
                                + re.escape(match.group(0)), line):
                        # The character is listed again among its own
                        # variants, i.e. it is used in both simplified
                        # *and* traditional Chinese:
                        category = 1 | 1 << 1
                    elif re.search('kTraditionalVariant', line):
                        category = 1 # simplified only
                    elif re.search('kSimplifiedVariant', line):
                        category = 1 << 1 # traditional only
                    logging.debug(
                        'char=%s category=%d line=%s',
                        char, category, line)
                    if char not in VARIANTS_TABLE_ORIG:
                        VARIANTS_TABLE_ORIG[char] = category
                    if (char not in
                            VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED):
                        VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED[
                            char] = line
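
# For example, the two lines quoted at the top of this file:
#
#     U+50DE  kSimplifiedVariant      U+4F2A
#     U+50DE  kTraditionalVariant     U+507D U+50DE
#
# are handled as follows: the kSimplifiedVariant line is read first, so
# 僞 (U+50DE) is stored with category 2 (traditional Chinese); the second
# line is then ignored because the character is already in
# “VARIANTS_TABLE_ORIG”.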

def detect_chinese_category_old(phrase: str) -> int:
    '''
    Old function using encoding conversion to guess whether
    a text is simplified Chinese, traditional Chinese, both,
    or unknown. It does not work well and is included here only for
    reference and for comparing its results with those of the new,
    improved function which uses the data from the Unihan database.
    '''
    # This is the bitmask we will use:
    # from low to high, the 1st bit means simplified Chinese,
    # the 2nd bit means traditional Chinese,
    # the 3rd bit means not covered by the legacy encodings checked below.
    category = 0
    # keep only the Chinese characters (CJK ideographs) of the phrase:
    tmp_phrase = ''.join(re.findall(u'['
                                    + u'\u4E00-\u9FCB'
                                    + u'\u3400-\u4DB5'
                                    + u'\uF900-\uFAFF'
                                    + u'\U00020000-\U0002A6D6'
                                    + u'\U0002A700-\U0002B734'
                                    + u'\U0002B740-\U0002B81D'
                                    + u'\U0002F800-\U0002FA1D'
                                    + u']+',
                                    phrase))
    # first check whether the phrase is encodable in gb2312:
    try:
        tmp_phrase.encode('gb2312')
        category |= 1
    except UnicodeEncodeError:
        if u'〇' in tmp_phrase:
            # we add '〇' to simplified Chinese as well
            category |= 1
    # second check whether the phrase is encodable in big5-hkscs:
    try:
        tmp_phrase.encode('big5hkscs')
        category |= 1 << 1
    except UnicodeEncodeError:
        # then check whether it is encodable in gbk:
        if category & 1:
            # already known to be simplified Chinese
            pass
        else:
            # need to check
            try:
                tmp_phrase.encode('gbk')
                category |= 1
            except UnicodeEncodeError:
                # not in gbk
                pass
    # finally set the 3rd bit if the phrase is neither simplified nor
    # traditional Chinese:
    if not category & (1 | 1 << 1):
        category |= (1 << 2)
    return category
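
# Illustrative note (not used by the script): the old detection above is
# purely encoding based, e.g.
#
#     detect_chinese_category_old('乌')  # bit 0 set if '乌' is encodable in gb2312/gbk
#     detect_chinese_category_old('烏')  # bit 1 set if '烏' is encodable in big5hkscs
#
# i.e. the result only reflects code page coverage, not how a character is
# actually used, which is why the Unihan based detection generated below
# is preferred.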

def write_variants_script(script_file) -> None:
    '''
    Write the generated Python script.
    '''
    script_file.write('''#!/usr/bin/python
# vim:fileencoding=utf-8:sw=4:et

# auto-generated by “generate-chinese-variants.py”, do not edit here!
#
# Copyright (c) 2013 Mike FABIAN <mfabian@redhat.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3.0 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
''')

    script_file.write('''
import sys
''')

    script_file.write('''
VARIANTS_TABLE = {
    # Meaning of the bits in the values:
    # 1 = 1 << 0       simplified Chinese
    # 2 = 1 << 1       traditional Chinese
    # 3 = (1 | 1 << 1) used both in simplified *and* traditional Chinese
    # 4 = 1 << 2       mixture of simplified and traditional Chinese
''')

    for phrase in sorted(VARIANTS_TABLE_ORIG):
        script_file.write(
            "    u'%s': %s,\n" % (phrase, VARIANTS_TABLE_ORIG[phrase]))

    script_file.write('''    }
''')

    script_file.write('''
def detect_chinese_category(phrase):
    \'\'\'
    New function using Unihan data to guess whether a text is
    simplified Chinese, traditional Chinese, both, or something rare
    like a mixture of exclusively simplified with exclusively traditional
    characters.

    Meaning of the bits in the category value returned by this function:
    1 = 1 << 0       simplified Chinese
    2 = 1 << 1       traditional Chinese
    3 = (1 | 1 << 1) used both in simplified *and* traditional Chinese
    4 = 1 << 2       mixture of simplified and traditional Chinese
    \'\'\'
    if phrase in VARIANTS_TABLE:
        # the complete phrase is in VARIANTS_TABLE, just return the
        # value found:
        return VARIANTS_TABLE[phrase]
    category = 0xFF
    for char in phrase:
        if char in VARIANTS_TABLE:
            category &= VARIANTS_TABLE[char]
        else:
            # If it is not listed in VARIANTS_TABLE, assume it is
            # both simplified and traditional Chinese.
            # It could be something non-Chinese as well then, but
            # if it is non-Chinese, it should also be allowed to
            # occur in any Chinese text and thus be classified as
            # both simplified *and* traditional Chinese (the emoji
            # table for example uses many non-Chinese characters).
            category &= (1 | 1 << 1)
    if category == 0:
        # If category is 0 after the binary & of the categories of all
        # the characters in the phrase, it means that the phrase
        # contained exclusively simplified *and* exclusively traditional
        # characters at the same time.  For example, if the phrase is
        # “乌烏” then “乌” gets category 1 (simplified Chinese)
        # and “烏” gets category 2 (traditional Chinese), the result
        # of the binary & is thus 0. In that case, classify it as
        # category 4 which is for weird, eccentric, rare stuff. If the
        # user selects one of the modes “all characters but
        # simplified Chinese first” or “all characters but
        # traditional Chinese first”, phrases with category 4 will be
        # shown but filtered to be shown only at the end of the
        # candidate list.
        category = 1 << 2
    return category
''')
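
# Example (illustrative): after running this script, the generated module
# (by default ./chinese_variants.py) can be used like this; the expected
# return values follow TEST_DATA below:
#
#     import chinese_variants
#     chinese_variants.detect_chinese_category(u'乌')   # -> 1 (simplified)
#     chinese_variants.detect_chinese_category(u'烏')   # -> 2 (traditional)
#     chinese_variants.detect_chinese_category(u'晞')   # -> 3 (both)
#     chinese_variants.detect_chinese_category(u'乌烏') # -> 4 (mixture)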

TEST_DATA = {
    # Meaning of the bits in the values:
    # 1 = 1 << 0       simplified Chinese
    # 2 = 1 << 1       traditional Chinese
    # 3 = (1 | 1 << 1) used both in simplified *and* traditional Chinese
    # 4 = 1 << 2       mixture of simplified and traditional Chinese
    u'乌': 1,
    u'烏': 2,
    u'晞': 3,
    u'䖷': 3,
    u'乌烏': 4,
    u'a☺α乌': 1,
    u'a☺α烏': 2,
    u'台': 3,
    u'同': 3,
    u'表': 3, # U+8868
    u'面': 3, # U+9762
    # Characters below this comment probably have buggy entries
    # in Unihan_Variants.txt:
    u'覆': 3, # U+8986
    u'杰': 3, # U+6770
    u'系': 3, # U+7CFB
    u'乾': 3, # U+4E7E
    u'著': 3, # U+8457 Patch by Heiher <r@hev.cc>
    u'只': 3, # U+53EA, see: https://github.com/kaio/ibus-table/issues/74
    # Problems reported in https://github.com/ibus/ibus/issues/2323
    u'着': 3, # U+7740, used in HK
    u'枱': 3, # U+67B1, used in HK (correct already, no SC variant entry in Unihan_Variants.txt)
    u'云': 3, # U+4E91, used in HK and TW
    u'裡': 3, # U+88E1, (untypable in SC) used everywhere, same meaning as 裏
    u'復': 3, # U+5FA9, (untypable in SC) used everywhere, same meaning in SC, different in TC
    u'采': 3, # U+91C7, (untypable in TC) used in Hong Kong, not sure about TW
    # (http://dict.revised.moe.edu.tw/cgi-bin/cbdic/gsweb.cgi has 采, i.e. probably
    # it is used in TW)
    u'吓': 3, # U+5413, (untypable in TC) used in Cantonese.
    u'尸': 3, # U+5C38, (untypable in TC) not sure where it is used, but Cangjie has it as a radical.
    u'揾': 3, # U+63FE, used in HK
    # (TW seems to use only 搵, see http://dict.revised.moe.edu.tw/cgi-bin/cbdic/gsweb.cgi)
    }

def test_detection(generated_script) -> int:
    '''
    Test whether the generated script does the detection correctly.

    Returns the number of errors found.
    '''
    logging.info('Testing detection ...')
    error_count = 0
    for phrase in TEST_DATA:
        detected = generated_script.detect_chinese_category(phrase)
        if detected != TEST_DATA[phrase]:
            print('phrase', phrase, repr(phrase),
                  'detected as', detected,
                  'should have been', TEST_DATA[phrase],
                  'FAIL.')
            error_count += 1
        else:
            logging.info('phrase=%s %s detected as %d PASS.',
                         phrase,
                         repr(phrase),
                         TEST_DATA[phrase])
    return error_count

def compare_old_new_detection(phrase, generated_script) -> None:
    '''
    Only for debugging.

    Compares the results of the Chinese category detection using the
    old and the new function.
    '''
    category_old = detect_chinese_category_old(phrase)
    category_new = generated_script.detect_chinese_category(phrase)
    if category_old != category_new:
        logging.debug(
            '%s %s old=%d new=%d',
            phrase.encode('utf-8'),
            repr(phrase),
            category_old,
            category_new)
        if phrase in VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED:
            logging.debug(
                '%s',
                VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED[phrase])

def parse_args() -> Any:
    '''Parse the command line arguments'''
    import argparse
    parser = argparse.ArgumentParser(
        description=(
            'Generate a script containing a table and a function '
            + 'to check whether a string of Chinese characters '
            + 'is simplified or traditional'))
    parser.add_argument('-i', '--inputfilename',
                        nargs='?',
                        type=str,
                        default='./Unihan_Variants.txt',
                        help='input file, default is ./Unihan_Variants.txt')
    parser.add_argument('-o', '--outputfilename',
                        nargs='?',
                        type=str,
                        default='./chinese_variants.py',
                        help='output file, default is ./chinese_variants.py')
    parser.add_argument('-d', '--debug',
                        action='store_true',
                        help='print debugging output')
    return parser.parse_args()
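
# Typical invocation (using the option defaults defined in parse_args()):
#
#     ./generate-chinese-variants.py -i ./Unihan_Variants.txt -o ./chinese_variants.py
#
# Add --debug to also see the debug output comparing the old and the new
# detection for every phrase in the generated table.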

def main() -> None:
    '''Main program'''
    args = parse_args()
    log_level = logging.INFO
    if args.debug:
        log_level = logging.DEBUG
    logging.basicConfig(format="%(levelname)s: %(message)s", level=log_level)
    with open(args.inputfilename, 'r', encoding='utf-8') as inputfile:
        logging.info("input file=%s", args.inputfilename)
        read_unihan_variants(inputfile)
    with open(args.outputfilename, 'w', encoding='utf-8') as outputfile:
        logging.info("output file=%s", args.outputfilename)
        write_variants_script(outputfile)

    # “imp” is deprecated and removed in recent Python versions, use
    # importlib to load the generated script:
    import importlib.util
    spec = importlib.util.spec_from_file_location('dummy', args.outputfilename)
    generated_script = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(generated_script)

    error_count = test_detection(generated_script)
    if error_count:
        logging.info('FAIL: %s tests failed, exiting ...', error_count)
        sys.exit(1)
    else:
        logging.info('PASS: All tests passed.')

    for phrase in generated_script.VARIANTS_TABLE: # type: ignore
        compare_old_new_detection(phrase, generated_script)

if __name__ == '__main__':
    main()