#!/usr/bin/env python
"""Generate C hanja lookup tables from the Unicode Unihan database.

Reads the ``kKorean`` entries of Unihan.txt, converts each romanized
reading to a precomposed Hangul syllable, and writes to standard output
one C array per syllable (the syllable, its hanja, a terminating 0)
followed by a ``hanja_table`` array that indexes all of them.

Usage: hanjatable.py [Unihan database file]

The code avoids constructs that exist only in Python 2 or only in
Python 3, so it runs under either interpreter.
"""

import string  # no longer used here; kept so the module namespace is unchanged
import sys

char_type = 'static const gunichar'
# Filled in by main() from the leading comment lines of the database file.
copyright = ''

# Romanized initial consonant -> choseong jamo.  'B' is an alias of 'P'.
_CHOSEONG_TABLE = {
    'K': 0x1100,
    'KK': 0x1101,
    'N': 0x1102,
    'T': 0x1103,
    'TT': 0x1104,
    'L': 0x1105,
    'M': 0x1106,
    'P': 0x1107,
    'B': 0x1107,
    'PP': 0x1108,
    'S': 0x1109,
    'SS': 0x110A,
    'C': 0x110C,
    'CC': 0x110D,
    'CH': 0x110E,
    'KH': 0x110F,
    'TH': 0x1110,
    'PH': 0x1111,
    'H': 0x1112,
}

# Initial used when the reading does not start with a known consonant.
_CHOSEONG_DEFAULT = 0x110B

# Romanized vowel -> jungseong jamo.  'WOY' is an alias of 'OY'.
_JUNGSEONG_TABLE = {
    'A': 0x1161,
    'AY': 0x1162,
    'YA': 0x1163,
    'YAY': 0x1164,
    'E': 0x1165,
    'EY': 0x1166,
    'YE': 0x1167,
    'YEY': 0x1168,
    'O': 0x1169,
    'WA': 0x116A,
    'WAY': 0x116B,
    'OY': 0x116C,
    'WOY': 0x116C,
    'YO': 0x116D,
    'WU': 0x116E,
    'WE': 0x116F,
    'WEY': 0x1170,
    'WI': 0x1171,
    'YU': 0x1172,
    'U': 0x1173,
    'UY': 0x1174,
    'I': 0x1175,
}

# Romanized final consonant -> jongseong jamo.
_JONGSEONG_TABLE = {
    'K': 0x11A8,
    'KK': 0x11A9,
    'KS': 0x11AA,
    'N': 0x11AB,
    'NC': 0x11AC,
    'NH': 0x11AD,
    'T': 0x11AE,
    'L': 0x11AF,
    'LK': 0x11B0,
    'LM': 0x11B1,
    'LP': 0x11B2,
    'LS': 0x11B3,
    'LTH': 0x11B4,
    'LPH': 0x11B5,
    'LH': 0x11B6,
    'M': 0x11B7,
    'P': 0x11B8,
    'PS': 0x11B9,
    'S': 0x11BA,
    'SS': 0x11BB,
    'NG': 0x11BC,
    'C': 0x11BD,
    'CH': 0x11BE,
    'KH': 0x11BF,
    'TH': 0x11C0,
    'PH': 0x11C1,
    'H': 0x11C2,
}

# Value passed to jamotosyllable() when the reading has no final consonant.
_JONGSEONG_NONE = 0x11A7


def print_help():
    """Write the usage line to stderr and exit with status 1."""
    sys.stderr.write("Usage: hanjatable.py [Unihan database file]\n")
    sys.exit(1)


def print_description():
    """Write the "generated file" C comment to stdout."""
    desc = """/*
 * this file is generated from Unihan.txt database file by gen.py (part of
 * GTK+-2.0 input module package, imhangul)
 * You can get this database file from ftp://www.unicode.org/Public/UNIDATA/
 * or http://www.unicode.org/Public/UNIDATA/
 */
"""
    # The extra newline reproduces what the `print` statement used to append.
    sys.stdout.write(desc + "\n")


def print_copyright():
    """Write the module-level ``copyright`` text to stdout as a C comment."""
    sys.stdout.write('/*\n' + copyright + ' */\n' + "\n")


def unicodetohexnum(str):
    """Return the integer value of a ``U+XXXX`` code point string.

    The first two characters are skipped without being checked; the rest
    is parsed as hexadecimal.  Raises ValueError if that part is not hex.
    """
    return int(str[2:], 16)


def jamotosyllable(cho, jung, jong):
    """Compose three conjoining jamo into one precomposed Hangul syllable.

    cho  -- initial consonant, 0x1100..0x1112
    jung -- vowel, 0x1161..0x1175
    jong -- final consonant, 0x11A8..0x11C2, or 0x11A7 for "no final"

    Returns the syllable code point, or 0 if any argument is out of range.
    """
    syllable_base = 0xAC00
    choseong_base = 0x1100
    jungseong_base = 0x1161
    jongseong_base = 0x11A7
    njungseong = 21
    njongseong = 28

    # These guards were written with `and`, which can never be true, so bad
    # input produced a wrong (aliased or unassigned) code point instead of 0.
    if cho < 0x1100 or cho > 0x1112:
        return 0
    if jung < 0x1161 or jung > 0x1175:
        return 0
    # The lower bound is the base itself: 0x11A7 encodes an absent final.
    if jong < jongseong_base or jong > 0x11C2:
        return 0

    cho_index = cho - choseong_base
    jung_index = jung - jungseong_base
    jong_index = jong - jongseong_base

    # With validated indices the result always lies in 0xAC00..0xD7A3.
    return ((cho_index * njungseong) + jung_index) * njongseong \
        + jong_index + syllable_base


def _match_prefix(table, text, max_len):
    """Return (code, remainder) for the longest prefix of text found in table.

    Prefixes are tried from max_len characters down to one.  Returns
    (None, text) when no prefix matches.
    """
    for length in range(max_len, 0, -1):
        prefix = text[:length]
        if prefix in table:
            return table[prefix], text[length:]
    return None, text


def phonetocode(phone):
    """Convert one romanized reading (e.g. ``HAN``) to a syllable code point.

    The reading is consumed greedily as initial consonant (optional), vowel
    (required) and final consonant (optional).  Anything left after the
    final consonant is ignored.  Returns 0, after reporting on stderr, when
    no vowel can be matched.
    """
    choseong, rest = _match_prefix(_CHOSEONG_TABLE, phone, 2)
    if choseong is None:
        choseong = _CHOSEONG_DEFAULT

    jungseong, rest = _match_prefix(_JUNGSEONG_TABLE, rest, 3)
    if jungseong is None:
        # Report the whole token; the stripped remainder alone is hard to
        # trace back to the database line it came from.
        sys.stderr.write("%s: phonetic data error\n" % phone)
        return 0

    jongseong, rest = _match_prefix(_JONGSEONG_TABLE, rest, 3)
    if jongseong is None:
        jongseong = _JONGSEONG_NONE

    return jamotosyllable(choseong, jungseong, jongseong)


def _parse_unihan(lines):
    """Return (copyright_text, table) parsed from an iterable of lines.

    copyright_text is built from the '#' comment lines that precede the
    first one containing "Format information:".  table maps a syllable code
    point to the set of hanja code points that have that reading.
    """
    header = []
    in_header = True
    table = {}
    for line in lines:
        if line.startswith('#'):
            if in_header:
                if "Format information:" in line:
                    in_header = False
                else:
                    header.append(' * ' + line[1:].strip() + '\n')
            continue

        # Cheap substring test first; most lines carry other fields.
        if "kKorean" not in line:
            continue
        tokens = line.split()
        # Require the field column to be exactly kKorean so that fields
        # whose names merely start with it are not parsed as readings.
        if len(tokens) < 3 or tokens[1] != "kKorean":
            continue

        hanjacode = unicodetohexnum(tokens[0])
        for hangulphone in tokens[2:]:
            hangulcode = phonetocode(hangulphone)
            if hangulcode == 0:
                continue
            # A set keeps a hanja from being listed twice under one
            # syllable when two aliases give the same reading.
            table.setdefault(hangulcode, set()).add(hanjacode)
    return ''.join(header), table


def _write_hanja_arrays(out, table, keys):
    """Write one zero-terminated C array per syllable, ten hanja per row."""
    for key in keys:
        out.write(char_type + " hangul_%X[] = {\n" % key)
        out.write("\t0x%X,\n" % key)
        hanjas = sorted(table[key])
        for start in range(0, len(hanjas), 10):
            row = hanjas[start:start + 10]
            out.write("\t" + "".join("0x%X," % hanja for hanja in row) + "\n")
        out.write("\t0\n")
        out.write("};\n")


def _write_index(out, keys):
    """Write the hanja_table array naming every per-syllable array."""
    out.write(char_type + " *hanja_table[] = {\n")
    names = ["hangul_%X" % key for key in keys]
    for start in range(0, len(names), 5):
        text = "\t" + ", ".join(names[start:start + 5])
        if start + 5 < len(names):
            # Every row except the last ends with a separator.
            text += ", "
        out.write(text + "\n")
    out.write("};\n")


def main(argv=None):
    """Run the generator; argv defaults to sys.argv."""
    global copyright

    if argv is None:
        argv = sys.argv

    data_file_name = "Unihan.txt"
    if len(argv) == 2:
        data_file_name = argv[1]

    try:
        data_file = open(data_file_name, 'r')
    except IOError:
        sys.stderr.write("Cant open file: %s\n" % data_file_name)
        print_help()  # exits with status 1

    with data_file:
        copyright, table = _parse_unihan(data_file)

    if not table:
        # Without this the index writer would have nothing to emit and the
        # generated C would be useless; fail before printing anything.
        sys.stderr.write("%s: no kKorean data found\n" % data_file_name)
        sys.exit(1)

    print_description()
    print_copyright()

    keys = sorted(table)
    _write_hanja_arrays(sys.stdout, table, keys)
    sys.stdout.write("\n\n")
    _write_index(sys.stdout, keys)


if __name__ == "__main__":
    main()