1#!/usr/bin/python3 2# vim:fileencoding=utf-8:sw=4:et 3 4# generate-chinese-variants 5# 6# Copyright (c) 2013-2018 Mike FABIAN <mfabian@redhat.com> 7# 8# This library is free software; you can redistribute it and/or 9# modify it under the terms of the GNU Lesser General Public 10# License as published by the Free Software Foundation; either 11# version 3.0 of the License, or (at your option) any later version. 12# 13# This library is distributed in the hope that it will be useful, 14# but WITHOUT ANY WARRANTY; without even the implied warranty of 15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16# Lesser General Public License for more details. 17# 18# You should have received a copy of the GNU General Public License 19# along with this program. If not, see <http://www.gnu.org/licenses/>. 20 21from typing import Any 22import re 23import logging 24import sys 25 26# Unihan_Variants.txt contains the following 2 lines: 27# 28# U+50DE kSimplifiedVariant U+4F2A 29# U+50DE kTraditionalVariant U+507D U+50DE 30# 31# This seems to be currently the only case in Unihan_Variants.txt where 32# a character which has entries for kTraditionalVariant and 33# the same character is listed again among the traditional variants 34# is *not* simplified Chinese. 35# 36# U+50DE 僞 is traditional Chinese. 37# U+507D 偽 is also traditional Chinese. 38# U+4F2A 伪 is simplified Chinese 39# 40# This does not cause a problem with the current parsing code 41# of Unihan_Variants.txt because the line 42# 43# U+50DE kSimplifiedVariant U+4F2A 44# 45# is read first and thus the character is already inserted in the 46# “VARIANTS_TABLE_ORIG” dictionary as traditional Chinese, which is correct. 47# If a character is already in the dictionary and more lines for the 48# same character are read from Unihan_Variants.txt, these extra lines 49# are ignored. 50# 51# But maybe for some corner cases more tweaking of the code is 52# necessary. One can also add overrides manually to the 53# initial content of “VARIANTS_TABLE_ORIG”. 54 55VARIANTS_TABLE_ORIG = { 56 # Meaning of the bits in the values: 57 # 1 = 1 << 0 simplified Chinese 58 # 2 = 1 << 1 traditional Chinese 59 # 3 = (1 | 1 << 1) used both in simplified *and* traditional Chinese 60 # 4 = 1 << 2 mixture of simplified and traditional Chinese 61 # 62 # overrides can be added manually here. For example the following 63 # line marks the 〇 character as used in both 64 # simplified and traditional Chinese: 65 u'〇': 3 # simplified *and* traditional Chinese 66 } 67 68# keep the lines from Unihan_Variants.txt which were used for debugging 69VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED = {} 70 71def read_unihan_variants(unihan_variants_file) -> None: 72 ''' 73 Read the Unihan_Variants.txt file downloaded from Unicode.org. 74 ''' 75 for line in unihan_variants_file: 76 line = line.strip() 77 if not re.match('^#', line): 78 if re.search('(kTraditionalVariant|kSimplifiedVariant)', line): 79 match = re.match(r'^U\+([0-9A-F]{4,5})', line) 80 if match: 81 char = chr(int(match.group(1), 16)) 82 category = 0 # should never stay at this value 83 if re.match(re.escape(match.group(0)) 84 + r'.*' 85 + re.escape(match.group(0)), line): 86 # is both simplified and traditional 87 category = 1 | 1 << 1 88 elif re.search('kTraditionalVariant', line): 89 category = 1 # simplified only 90 elif re.search('kSimplifiedVariant', line): 91 category = 1 << 1 # traditional only 92 logging.debug( 93 'char=%s category=%d line=%s', 94 char, category, line) 95 if not char in VARIANTS_TABLE_ORIG: 96 VARIANTS_TABLE_ORIG[char] = category 97 if (not char 98 in VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED): 99 VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED[ 100 char] = line 101 102def detect_chinese_category_old(phrase: str) -> int: 103 ''' 104 Old function using encoding conversion to guess whether 105 a text is simplified Chinese, traditional Chinese, both, 106 or unknown. Does not work well, is included here for reference 107 and for comparing with the results of the new, improved function 108 using the data from the Unihan database. 109 ''' 110 # this is the bitmask we will use, 111 # from low to high, 1st bit is simplified Chinese, 112 # 2nd bit is traditional Chinese, 113 # 3rd bit means out of gbk 114 category = 0 115 # make sure that we got a unicode string 116 tmp_phrase = ''.join(re.findall(u'[' 117 + u'\u4E00-\u9FCB' 118 + u'\u3400-\u4DB5' 119 + u'\uF900-\uFaFF' 120 + u'\U00020000-\U0002A6D6' 121 + u'\U0002A700-\U0002B734' 122 + u'\U0002B740-\U0002B81D' 123 + u'\U0002F800-\U0002FA1D' 124 + u']+', 125 phrase)) 126 # first whether in gb2312 127 try: 128 tmp_phrase.encode('gb2312') 129 category |= 1 130 except: 131 if u'〇' in tmp_phrase: 132 # we add '〇' into SC as well 133 category |= 1 134 # second check big5-hkscs 135 try: 136 tmp_phrase.encode('big5hkscs') 137 category |= 1 << 1 138 except: 139 # then check whether in gbk, 140 if category & 1: 141 # already know in SC 142 pass 143 else: 144 # need to check 145 try: 146 tmp_phrase.encode('gbk') 147 category |= 1 148 except: 149 # not in gbk 150 pass 151 # then set for 3rd bit, if not in SC and TC 152 if not category & (1 | 1 << 1): 153 category |= (1 << 2) 154 return category 155 156def write_variants_script(script_file) -> None: 157 ''' 158 Write the generated Python script. 159 ''' 160 script_file.write('''#!/usr/bin/python 161# vim:fileencoding=utf-8:sw=4:et 162 163# auto-generated by “generate-chinese-variants.py”, do not edit here! 164# 165# Copyright (c) 2013 Mike FABIAN <mfabian@redhat.com> 166# 167# This library is free software; you can redistribute it and/or 168# modify it under the terms of the GNU Lesser General Public 169# License as published by the Free Software Foundation; either 170# version 3.0 of the License, or (at your option) any later version. 171# 172# This library is distributed in the hope that it will be useful, 173# but WITHOUT ANY WARRANTY; without even the implied warranty of 174# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 175# Lesser General Public License for more details. 176# 177# You should have received a copy of the GNU General Public License 178# along with this program. If not, see <http://www.gnu.org/licenses/>. 179''') 180 181 script_file.write(''' 182import sys 183''') 184 185 script_file.write(''' 186VARIANTS_TABLE = { 187 # Meaning of the bits in the values: 188 # 1 = 1 << 0 simplified Chinese 189 # 2 = 1 << 1 traditional Chinese 190 # 3 = (1 | 1 << 1) used both in simplified *and* traditional Chinese 191 # 4 = 1 << 2 mixture of simplified and traditional Chinese 192''') 193 194 for phrase in sorted(VARIANTS_TABLE_ORIG): 195 script_file.write( 196 " u'" + phrase + "': " 197 + "%s" %VARIANTS_TABLE_ORIG[phrase] + ",\n") 198 199 script_file.write(''' } 200''') 201 202 script_file.write(''' 203def detect_chinese_category(phrase): 204 \'\'\' 205 New function using Unihan data to guess whether a text is 206 simplified Chinese, traditional Chinese, both, or something rare 207 like a mixture of exclusively simplified with exclusively traditional 208 characters. 209 210 Meaning of the bits in the category value returned by this function: 211 1 = 1 << 0 simplified Chinese 212 2 = 1 << 1 traditional Chinese 213 3 = (1 | 1 << 1) used both in simplified *and* traditional Chinese 214 4 = 1 << 2 mixture of simplified and traditional Chinese 215 \'\'\' 216 # make sure that we got a unicode string 217 if phrase in VARIANTS_TABLE: 218 # the complete phrase is in VARIANTS_TABLE, just return the 219 # value found: 220 return VARIANTS_TABLE[phrase] 221 category = 0xFF 222 for char in phrase: 223 if char in VARIANTS_TABLE: 224 category &= VARIANTS_TABLE[char] 225 else: 226 # If it is not listed in VARIANTS_TABLE, assume it is 227 # both simplified and traditional Chinese. 228 # It could be something non-Chinese as well then, but 229 # if it is non-Chinese, it should also be allowed to 230 # occur in any Chinese text and thus classified as 231 # both simplified *and* traditional Chinese (the emoji 232 # table for example uses many non-Chinese characters) 233 category &= (1 | 1 << 1) 234 if category == 0: 235 # If category is 0 after binary & of the categories of all the 236 # characters in the phrase, it means that the phrase contained 237 # exclusively simplified *and* exclusively traditional 238 # characters at the same time. For example if the phrase is 239 # “乌烏” then “乌” gets category 1 (simplified Chinese) 240 # and “烏” gets category 2 (traditional Chinese), the result 241 # of the binary & is thus 0. In that case, classify it as 242 # category 4 which is for weird, excentric, rare stuff. If the 243 # user selects one of the modes “all characters but 244 # simplified Chinese first” or “all characters but 245 # traditional Chinese first”, phrases with category 4 will be 246 # shown but filtered to be shown only at the end of the 247 # candidate list. 248 category = 1 << 2 249 return category 250''') 251 252TEST_DATA = { 253 # Meaning of the bits in the values: 254 # 1 = 1 << 0 simplified Chinese 255 # 2 = 1 << 1 traditional Chinese 256 # 3 = (1 | 1 << 1) used both in simplified *and* traditional Chinese 257 # 4 = 1 << 2 mixture of simplified and traditional Chinese 258 u'乌': 1, 259 u'烏': 2, 260 u'晞': 3, 261 u'䖷': 3, 262 u'乌烏': 4, 263 u'a☺α乌': 1, 264 u'a☺α烏': 2, 265 u'台': 3, 266 u'同': 3, 267 u'表': 3, # U+8868 268 u'面': 3, # U+9762 269 # Characters below this comments probably have buggy entries 270 # in Unihan_Variants.txt: 271 u'覆': 3, # U+8986 272 u'杰': 3, # U+6770 273 u'系': 3, # U+7CFB 274 u'乾': 3, # U+4E7E 275 u'著': 3, # U+8457 Patch by Heiher <r@hev.cc> 276 u'只': 3, # U+53EA, see: https://github.com/kaio/ibus-table/issues/74 277 # Problems reported in https://github.com/ibus/ibus/issues/2323 278 u'着': 3, # U+7740, used in HK 279 u'枱': 3, # U+67B1, used in HK (correct already, no SC variant entry in Unihan_Variants.txt) 280 u'云': 3, # U+4E91, used in HK and TW 281 u'裡': 3, # U+88E1, (Untypable in S) used in all places same meaning as 裏 282 u'復': 3, # U+5FA9, (Untypable in S) used in all places same meaning in S, diff in T 283 u'采': 3, # U+91C7, (Untypable in T) used in Hong Kong, not sure about TW 284 # http://dict.revised.moe.edu.tw/cgi-bin/cbdic/gsweb.cgi has 采, i.e. probably 285 # it is used in TW 286 u'吓': 3, # U+5413, (Untypable in T) used in Cantonese. 287 u'尸': 3, # U+5C38, (Untypable in T) idk where it is used, but Cangjie has that as a radical. 288 u'揾': 3, # U+63FE, used in HK 289 # (TW seems to use only 搵, see http://dict.revised.moe.edu.tw/cgi-bin/cbdic/gsweb.cgi) 290 } 291 292def test_detection(generated_script) -> int: 293 ''' 294 Test whether the generated script does the detection correctly. 295 296 Returns the number of errors found. 297 ''' 298 logging.info('Testing detection ...') 299 error_count = 0 300 for phrase in TEST_DATA: 301 if (generated_script.detect_chinese_category(phrase) 302 != TEST_DATA[phrase]): 303 print('phrase', phrase, repr(phrase), 304 'detected as', 305 generated_script.detect_chinese_category(phrase), 306 'should have been', TEST_DATA[phrase], 307 'FAIL.') 308 error_count += 1 309 else: 310 logging.info('phrase=%s %s detected as %d PASS.', 311 phrase, 312 repr(phrase), 313 TEST_DATA[phrase]) 314 return error_count 315 316def compare_old_new_detection(phrase, generated_script) -> None: 317 ''' 318 Only for debugging. 319 320 Compares results of the Chinese category detection using the 321 old and the new function. 322 ''' 323 if (detect_chinese_category_old(phrase) 324 != generated_script.detect_chinese_category(phrase)): 325 logging.debug( 326 '%s %s old=%d new=%d', 327 phrase.encode('utf-8'), 328 repr(phrase), 329 detect_chinese_category_old(phrase), 330 generated_script.detect_chinese_category(phrase)) 331 if phrase in VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED: 332 logging.debug( 333 VARIANTS_TABLE_ORIG_UNIHAN_VARIANTS_ENTRY_USED[phrase]) 334 335def parse_args() -> Any: 336 '''Parse the command line arguments''' 337 import argparse 338 parser = argparse.ArgumentParser( 339 description=( 340 'Generate a script containing a table and a function ' 341 + 'to check whether a string of Chinese characters ' 342 + 'is simplified or traditional')) 343 parser.add_argument('-i', '--inputfilename', 344 nargs='?', 345 type=str, 346 default='./Unihan_Variants.txt', 347 help='input file, default is ./Unihan_Variants.txt') 348 parser.add_argument('-o', '--outputfilename', 349 nargs='?', 350 type=str, 351 default='./chinese_variants.py', 352 help='output file, default is ./chinese_variants.py') 353 parser.add_argument('-d', '--debug', 354 action='store_true', 355 help='print debugging output') 356 return parser.parse_args() 357 358def main() -> None: 359 '''Main program''' 360 args = parse_args() 361 log_level = logging.INFO 362 if args.debug: 363 log_level = logging.DEBUG 364 logging.basicConfig(format="%(levelname)s: %(message)s", level=log_level) 365 with open(args.inputfilename, 'r') as inputfile: 366 logging.info("input file=%s", inputfile) 367 read_unihan_variants(inputfile) 368 with open(args.outputfilename, 'w') as outputfile: 369 logging.info("output file=%s", outputfile) 370 write_variants_script(outputfile) 371 372 import imp 373 generated_script = imp.load_source('dummy', args.outputfilename) 374 375 logging.info('Testing detection ...') 376 error_count = test_detection(generated_script) 377 if error_count: 378 logging.info('FAIL: %s tests failed, exiting ...', error_count) 379 exit(1) 380 else: 381 logging.info('PASS: All tests passed.') 382 383 for phrase in generated_script.VARIANTS_TABLE: # type: ignore 384 compare_old_new_detection(phrase, generated_script) 385 386if __name__ == '__main__': 387 main() 388