1#!/usr/bin/env python
2
3import sys, string
4
# C element type (including storage qualifiers) placed in front of every
# array declaration written by the output section of this script.
char_type = 'static const gunichar'
# Copyright text for the generated file.  It starts empty; the main procedure
# appends the leading comment lines of the data file to it, and
# print_copyright() wraps it in a C comment.
copyright = ''
7
def print_help():
	"""Write the one-line usage message to stderr and exit with status 1."""
	usage = "Usage: hanjatable.py [Unihan database file]\n"
	sys.stderr.write(usage)
	sys.exit(1)
11
def print_description():
	"""Write the "generated file" banner (a C block comment) to stdout.

	The banner is followed by one empty line, exactly as the former
	``print desc`` statement produced.
	"""
	# Built line by line so the text does not depend on how this function is
	# indented.  The body previously mixed a space+tab indent with a tab
	# indent, which ``python -tt`` and Python 3 reject.
	desc = (
		"/*\n"
		" * this file is generated from Unihan.txt database file by gen.py (part of\n"
		" * GTK+-2.0 input module package, imhangul)\n"
		" * You can get this database file from ftp://www.unicode.org/Public/UNIDATA/\n"
		" * or http://www.unicode.org/Public/UNIDATA/\n"
		" */\n"
	)
	# The trailing "\n" reproduces the newline the print statement appended.
	sys.stdout.write(desc + "\n")
21
def print_copyright():
	"""Write the module-level copyright text to stdout as a C block comment.

	The comment is followed by one empty line.
	"""
	comment = '/*\n' + copyright + ' */\n'
	# the extra newline is the one a print statement would have appended
	sys.stdout.write(comment + '\n')
24
def unicodetohexnum(str):
	"""Return the integer value of a code point token such as "U+4E00".

	The first two characters of ``str`` are skipped (presumably the "U+"
	prefix used in the data file) and the remainder is parsed as a
	hexadecimal number.  Raises ValueError if the remainder is not valid
	hexadecimal.
	"""
	# int() with an explicit base replaces string.atoi(), which is deprecated
	# and no longer exists in Python 3.  The parameter keeps its original
	# name so keyword callers are unaffected.
	return int(str[2:], 16)
27
def jamotosyllable(cho, jung, jong):
	"""Compose a precomposed Hangul syllable from three jamo code points.

	cho  -- initial consonant, 0x1100..0x1112
	jung -- vowel, 0x1161..0x1175
	jong -- final consonant, 0x11A8..0x11C2, or 0x11A7 for "no final"

	Returns the syllable code point (0xAC00..0xD7A3), or 0 when any argument
	lies outside its range.
	"""
	syllable_base = 0xAC00
	choseong_base = 0x1100
	jungseong_base = 0x1161
	jongseong_base = 0x11A7
	njungseong = 21
	njongseong = 28

	# A value is invalid when it is below the range OR above it.  The former
	# "and" could never be true, so out-of-range jamo silently composed into
	# an unrelated (or unassigned) syllable.
	if cho < choseong_base or cho > 0x1112:
		return 0
	if jung < jungseong_base or jung > 0x1175:
		return 0
	# The lower bound is jongseong_base (0x11A7) rather than 0x11A8:
	# phonetocode() passes 0x11A7 for a syllable without a final consonant.
	if jong < jongseong_base or jong > 0x11C2:
		return 0

	cho -= choseong_base
	jung -= jungseong_base
	jong -= jongseong_base

	# With all three indices validated the result always falls inside
	# 0xAC00..0xD7A3, so no further range check is needed.
	return ((cho * njungseong) + jung) * njongseong + jong + syllable_base
52
# Romanized initial consonant -> choseong jamo code point.
# These tables are built once at import time instead of on every call.
_CHOSEONG_TABLE = {
	'K':	0x1100,
	'KK':	0x1101,
	'N':	0x1102,
	'T':	0x1103,
	'TT':	0x1104,
	'L':	0x1105,
	'M':	0x1106,
	'P':	0x1107,
	'B':	0x1107,
	'PP':	0x1108,
	'S':	0x1109,
	'SS':	0x110A,
	#'':	0x110B,
	'C':	0x110C,
	'CC':	0x110D,
	'CH':	0x110E,
	'KH':	0x110F,
	'TH':	0x1110,
	'PH':	0x1111,
	'H':	0x1112
}

# Romanized vowel -> jungseong jamo code point.
_JUNGSEONG_TABLE = {
	'A':	0x1161,
	'AY':	0x1162,
	'YA':	0x1163,
	'YAY':	0x1164,
	'E':	0x1165,
	'EY':	0x1166,
	'YE':	0x1167,
	'YEY':	0x1168,
	'O':	0x1169,
	'WA':	0x116A,
	'WAY':	0x116B,
	'OY':	0x116C,
	'WOY':	0x116C,
	'YO':	0x116D,
	'WU':	0x116E,
	'WE':	0x116F,
	'WEY':	0x1170,
	'WI':	0x1171,
	'YU':	0x1172,
	'U':	0x1173,
	'UY':	0x1174,
	'I':	0x1175
}

# Romanized final consonant -> jongseong jamo code point.
_JONGSEONG_TABLE = {
	'K':	0x11A8,
	'KK':	0x11A9,
	'KS':	0x11AA,
	'N':	0x11AB,
	'NC':	0x11AC,
	'NH':	0x11AD,
	'T':	0x11AE,
	'L':	0x11AF,
	'LK':	0x11B0,
	'LM':	0x11B1,
	'LP':	0x11B2,
	'LS':	0x11B3,
	'LTH':	0x11B4,
	'LPH':	0x11B5,
	'LH':	0x11B6,
	'M':	0x11B7,
	'P':	0x11B8,
	'PS':	0x11B9,
	'S':	0x11BA,
	'SS':	0x11BB,
	'NG':	0x11BC,
	'C':	0x11BD,
	'CH':	0x11BE,
	'KH':	0x11BF,
	'TH':	0x11C0,
	'PH':	0x11C1,
	'H':	0x11C2
}

# Initial used when the reading starts with a vowel (the entry commented out
# in the choseong table above).
_CHOSEONG_DEFAULT = 0x110B
# Value handed to jamotosyllable() when the reading has no final consonant.
_JONGSEONG_NONE = 0x11A7


def _take_longest_prefix(table, text, max_len):
	"""Return (code, rest) for the longest prefix of text found in table.

	Prefixes of max_len characters down to one character are tried in that
	order.  Returns (None, text) when no prefix is a key of table.
	"""
	for length in range(max_len, 0, -1):
		prefix = text[:length]
		if prefix in table:
			return table[prefix], text[length:]
	return None, text


def phonetocode(phone):
	"""Convert a romanized reading such as "HAN" to a Hangul syllable code.

	The reading is consumed left to right as an optional initial consonant,
	a mandatory vowel and an optional final consonant, always taking the
	longest matching spelling.  Characters left over after the final
	consonant are ignored.

	Returns the code point computed by jamotosyllable(), or 0 (after writing
	a diagnostic to stderr) when no vowel can be recognised.
	"""
	choseong, rest = _take_longest_prefix(_CHOSEONG_TABLE, phone, 2)
	if choseong is None:
		choseong = _CHOSEONG_DEFAULT

	jungseong, rest = _take_longest_prefix(_JUNGSEONG_TABLE, rest, 3)
	if jungseong is None:
		# Report the whole token, not only what is left after the initial
		# consonant, so the offending entry can be found in the data file.
		sys.stderr.write("%s: phonetic data error\n" % phone)
		return 0

	jongseong, rest = _take_longest_prefix(_JONGSEONG_TABLE, rest, 3)
	if jongseong is None:
		jongseong = _JONGSEONG_NONE

	return jamotosyllable(choseong, jungseong, jongseong)
167
168
# start main procedure
data_file_name = "Unihan.txt"

# A single command line argument overrides the default database path.
if len(sys.argv) == 2:
	data_file_name = sys.argv[1]

try:
	data_file = open(data_file_name, 'r')
except IOError:
	# sys.stderr is a file object, not a callable; calling it raised a
	# TypeError that hid the real error.
	sys.stderr.write("Cant open file: %s\n" % data_file_name)
	# print_help() writes the usage line and exits with status 1.  The
	# builtin help() called here before starts the interactive help system.
	print_help()
	sys.exit(1)
182
# gather_copyright stays 0 while the leading comment lines are still being
# collected into "copyright"; it becomes 1 at the "Format information:" line.
gather_copyright = 0
# maps a Hangul syllable code to the list of Hanja codes that have that reading
table = { }
try:
	# Iterate over the file object instead of readlines() so the whole
	# database is never held in memory at once.
	for line in data_file:
		# check for comment, jump over comments
		if line.startswith('#'):
			if gather_copyright == 0:
				if "Format information:" in line:
					gather_copyright = 1
				else:
					copyright += ' * ' + line[1:].strip() + '\n'
			continue

		# check for korean phonetic data: the field name must be exactly
		# "kKorean".  A substring search would also accept any other field
		# whose name or value merely contains that text.
		tokens = line.split()
		if len(tokens) < 3 or tokens[1] != "kKorean":
			continue

		hanjacode = unicodetohexnum(tokens[0])
		for hangulphone in tokens[2:]:
			hangulcode = phonetocode(hangulphone)
			if hangulcode == 0:
				# unparsable reading; phonetocode() already reported it
				continue
			table.setdefault(hangulcode, []).append(hanjacode)
finally:
	# close the input even if a malformed line raises
	data_file.close()
214
# Sorted syllable codes.  Named "keys" so the builtin "list" is not shadowed.
keys = list(table.keys())
keys.sort()

if not keys:
	# Without any entry the final table cannot be written (indexing the last
	# key used to raise IndexError after partial output), so stop with a
	# clear message instead.
	sys.stderr.write("No kKorean data found in %s\n" % data_file_name)
	sys.exit(1)

print_description()
print_copyright()

# One array per syllable: the syllable itself, its Hanja (ten per row) and a
# terminating 0.
for key in keys:
	sys.stdout.write(char_type + " hangul_%X[] = {\n" % key)
	sys.stdout.write("\t0x%X,\n" % key)

	hanjas = table[key]
	hanjas.sort()
	for start in range(0, len(hanjas), 10):
		row = ["0x%X," % hanja for hanja in hanjas[start:start + 10]]
		sys.stdout.write("\t" + "".join(row) + "\n")

	sys.stdout.write("\t0\n")
	sys.stdout.write("};\n")


# Index table listing every per-syllable array, five names per row.  Rows are
# joined with ", \n" so every name except the last is followed by ", ".
sys.stdout.write(char_type + " *hanja_table[] = {\n")
names = ["hangul_%X" % key for key in keys]
rows = []
for start in range(0, len(names), 5):
	rows.append("\t" + ", ".join(names[start:start + 5]))
sys.stdout.write(", \n".join(rows) + "\n};\n")
266
267
268