1#!/usr/bin/python
2# WinVNKey Hannom Database to Stardict dictionary source Conversion Tool
3# coded by wesnoth@ustc on 070804
4# http://winvnkey.sourceforge.net
5import sys, os, string, types, pprint
# Encoding used to decode the input database file named on the command line.
infileencoding = 'utf-16-le'
# Intended encoding of the generated dictionary files (hanviet.txt, viethan.txt).
outfileencoding = 'utf-8'
8
def showhelp():
	"""Print a one-line usage message to standard output.

	The script name shown is taken from sys.argv[0].
	"""
	# Parenthesised single-argument form: prints the same text under the
	# Python 2 print statement and the Python 3 print function.
	print("Usage: %s filename" % sys.argv[0])
11
def ishantu(str):
	"""Tell whether *str* starts with a character above U+2E80.

	Returns False for an empty string; otherwise True exactly when the
	code point of the first character is greater than 0x2e80.
	"""
	if len(str) == 0:
		return False
	first_codepoint = ord(str[0])
	return first_codepoint > 0x2e80
17
def mysplit(line):
	"""Split *line* in two at the first whitespace that is outside quotes.

	Leading whitespace is ignored.  A double quote toggles "inside quotes"
	mode, in which whitespace does not end the first field; the quote
	characters themselves are kept in the result.

	Returns:
		[]            if the line is empty or only whitespace,
		[head]        if nothing but whitespace follows the first field,
		[head, tail]  otherwise, with *tail* stripped on both sides.
	"""
	text = line.lstrip()
	in_quote = False
	cut = len(text)  # no unquoted whitespace found: the whole text is the head
	for pos, ch in enumerate(text):
		if not in_quote and ch.isspace():
			cut = pos
			break
		if ch == u'"':
			in_quote = not in_quote
	if cut == 0:
		return []
	head = text[:cut]
	tail = text[cut:].strip()
	if tail == u'':
		return [head]
	return [head, tail]
38
import io  # local to the script section: encoding-aware open() on Python 2.6+ and 3


def _to_unicode_char(codepoint):
	"""Return the text for the integer *codepoint*.

	Uses unichr() on Python 2 and chr() on Python 3.  Narrow Python 2 builds
	reject code points above U+FFFF, so those are returned as a UTF-16
	surrogate pair instead of failing.

	Raises ValueError (or OverflowError for huge values) when *codepoint*
	is not a valid Unicode code point.
	"""
	try:
		make_char = unichr
	except NameError:
		make_char = chr
	try:
		return make_char(codepoint)
	except ValueError:
		if not 0x10000 <= codepoint <= 0x10ffff:
			raise
		offset = codepoint - 0x10000
		return make_char(0xd800 + (offset >> 10)) + make_char(0xdc00 + (offset & 0x3ff))


def _unquote(field):
	"""Strip one pair of enclosing double quotes from *field*, if present."""
	if field.startswith(u'"') and field.endswith(u'"'):
		return field[1:-1]
	return field


def _parse_entry(line, lineno):
	"""Parse one database line into a ``(hantu, readings)`` pair.

	Returns None when the line holds no usable entry: blank or comment-only
	lines, a key without a value, a key that ishantu() rejects, or a
	malformed ``U+XXXX`` key (reported on stderr together with *lineno*).
	"""
	if line.endswith(u'\r'):
		line = line[:-1]
	if line.startswith(u'\ufeff'):
		line = line[1:]
	# Everything from the first '#' on is dropped as a comment.
	# NOTE(review): this also cuts a '#' that appears inside a quoted value.
	comment_start = line.find(u'#')
	if comment_start >= 0:
		line = line[:comment_start]
	fields = mysplit(line)
	if len(fields) < 2:
		return None  # empty or incomplete line
	key = _unquote(fields[0])
	if key.startswith((u'U+', u'u+')):
		try:
			key = _to_unicode_char(int(key[2:], 16))
		except (ValueError, OverflowError):
			# One bad key should not abort the whole conversion.
			sys.stderr.write('Skipping line %d: bad code point %r\n' % (lineno, key))
			return None
	if not ishantu(key):
		return None  # invalid Han character
	readings = [part.strip() for part in _unquote(fields[1]).split(u',')]
	return key, [reading for reading in readings if reading]


def _add_unique(table, key, value):
	"""Append *value* to the list table[key] unless it is already there."""
	bucket = table.setdefault(key, [])
	if value not in bucket:
		bucket.append(value)


def _write_dict(filename, table, separator):
	"""Write *table* as ``key<TAB>values`` lines, values joined by *separator*."""
	with io.open(filename, 'w', encoding=outfileencoding) as gp:
		for headword, values in table.items():
			gp.write(headword + u'\t' + separator.join(values) + u'\n')


def _main(argv):
	"""Convert the database named by argv[1] into hanviet.txt and viethan.txt.

	Prints the usage message and does nothing else unless exactly one
	file name was given.
	"""
	if len(argv) != 2:
		showhelp()
		return
	# newline='' turns off newline translation so the text is split exactly
	# on u'\n'; decoding is done by io.open, so no byte-level text-mode
	# mangling of the UTF-16 data can happen.
	with io.open(argv[1], 'r', encoding=infileencoding, newline='') as fp:
		print('Reading file...')
		lines = fp.read().split(u'\n')
	print('Generating Han-Viet dict...')
	hugedict = {}
	for lineno, line in enumerate(lines, 1):
		entry = _parse_entry(line, lineno)
		if entry is None:
			continue
		hantu, readings = entry
		for viettu in readings:
			_add_unique(hugedict, hantu, viettu)
	print('Generating Viet-Han dict...')
	dicthuge = {}
	for hantu, quocngu in hugedict.items():
		for viettu in quocngu:
			_add_unique(dicthuge, viettu, hantu)
	print('Writing Han-Viet dict...')
	_write_dict('hanviet.txt', hugedict, u', ')
	print('Writing Viet-Han dict...')
	_write_dict('viethan.txt', dicthuge, u' ')


if __name__ == '__main__':
	_main(sys.argv)
110