#!/usr/bin/python
# WinVNKey Hannom Database to Stardict dictionary source Conversion Tool
# coded by wesnoth@ustc on 070804
# http://winvnkey.sourceforge.net
"""Convert a WinVNKey Han-Nom table into two tab-separated word lists.

The input file (decoded as ``infileencoding``) is read line by line.  Each
useful line has the shape ``<han> <readings>`` where

* ``<han>`` is a literal character, optionally wrapped in double quotes, or
  a ``U+XXXX`` / ``u+XXXX`` hexadecimal code point;
* ``<readings>`` is a comma-separated list, optionally wrapped in double
  quotes;
* everything from the first ``#`` to the end of the line is discarded.

Two files are written to the current directory using ``outfileencoding``:
``hanviet.txt`` (character -> readings joined by ", ") and ``viethan.txt``
(reading -> characters joined by " ").

The code avoids Python-2-only syntax so that it runs unchanged on
Python 2.6+ and Python 3.
"""
import io
import os
import pprint
import string
import sys
import types

infileencoding = 'utf-16-le'
outfileencoding = 'utf-8'

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already covers all of Unicode


def showhelp():
    """Print a one-line usage message naming the running script."""
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Usage: %s filename" % sys.argv[0])


def ishantu(str):
    """Return True if ``str`` is non-empty and its first code unit is > U+2E80.

    This is the only validity test applied to dictionary keys; the empty
    string yields False.  The parameter keeps its historical name even
    though it shadows the builtin.
    """
    return len(str) > 0 and ord(str[0]) > 0x2e80


def mysplit(line):
    """Split ``line`` into a key and the remainder at the first bare space.

    Leading whitespace is ignored.  Whitespace that appears between a pair
    of double quotes does not end the key.  The remainder is stripped.

    Returns ``[]`` for a blank line, ``[key]`` when nothing follows the key,
    and ``[key, rest]`` otherwise.  Quotes are left in place.
    """
    line = line.lstrip()
    in_quotes = False
    cut = len(line)
    for pos, ch in enumerate(line):
        if not in_quotes and ch.isspace():
            cut = pos
            break
        if ch == u'"':
            in_quotes = not in_quotes
    if cut == 0:
        # Only possible when the stripped line is empty.
        return []
    head, rest = line[:cut], line[cut:].strip()
    if rest:
        return [head, rest]
    return [head]


def _unquote(text):
    """Drop one pair of enclosing double quotes from ``text`` if present."""
    if text.startswith(u'"') and text.endswith(u'"'):
        return text[1:-1]
    return text


def _char_from_hex(hexdigits):
    """Return the character for a hexadecimal code point, or None if invalid.

    None is returned instead of raising so that one malformed ``U+`` entry
    cannot abort the whole conversion.
    """
    try:
        codepoint = int(hexdigits, 16)
    except ValueError:
        return None
    try:
        return _unichr(codepoint)
    except (ValueError, OverflowError):
        pass
    if 0x10000 <= codepoint <= 0x10FFFF:
        # Reached only where unichr() rejects non-BMP values (narrow
        # Python 2 builds): spell the character as a UTF-16 surrogate pair.
        offset = codepoint - 0x10000
        return _unichr(0xD800 + (offset >> 10)) + _unichr(0xDC00 + (offset & 0x3FF))
    return None


def _add_unique(table, key, values):
    """Append each of ``values`` to ``table[key]`` unless already listed.

    The key is created lazily, so an empty ``values`` leaves ``table``
    untouched.
    """
    for value in values:
        bucket = table.setdefault(key, [])
        if value not in bucket:
            bucket.append(value)


def parse_entry(line):
    """Parse one input line into ``(hantu, readings)``.

    Returns None when the line is blank, comment-only, has no readings
    column, or its key fails ``ishantu`` (including an unparsable ``U+``
    code point).  ``readings`` is a list of stripped, non-empty strings and
    may itself be empty.
    """
    if line.endswith(u'\r'):
        line = line[:-1]
    if line.startswith(u'\ufeff'):
        # Byte-order mark left over from decoding the first line.
        line = line[1:]
    hash_at = line.find(u'#')
    if hash_at >= 0:
        line = line[:hash_at]
    fields = mysplit(line)
    if len(fields) < 2:
        return None  # blank or incomplete line
    hantu = _unquote(fields[0])
    if hantu.startswith((u'U+', u'u+')):
        hantu = _char_from_hex(hantu[2:])
        if hantu is None:
            return None
    if not ishantu(hantu):
        return None  # invalid Han character
    pieces = [piece.strip() for piece in _unquote(fields[1]).split(u',')]
    return hantu, [piece for piece in pieces if piece]


def build_hanviet(lines):
    """Build ``{hantu: [reading, ...]}`` from an iterable of text lines.

    Readings keep first-seen order and are de-duplicated per character.
    """
    hanviet = {}
    for line in lines:
        entry = parse_entry(line)
        if entry is None:
            continue
        _add_unique(hanviet, entry[0], entry[1])
    return hanviet


def invert_dict(hanviet):
    """Build ``{reading: [hantu, ...]}`` from the mapping of ``build_hanviet``."""
    viethan = {}
    for hantu, readings in hanviet.items():
        for viettu in readings:
            _add_unique(viethan, viettu, [hantu])
    return viethan


def write_dict(path, table, separator):
    """Write ``table`` to ``path`` as ``key<TAB>values`` lines.

    Values are joined with ``separator``; text is encoded with
    ``outfileencoding``.
    """
    with io.open(path, 'w', encoding=outfileencoding) as out:
        for key, values in table.items():
            out.write(u'%s\t%s\n' % (key, separator.join(values)))


def main(argv=None):
    """Run the conversion for ``argv`` (defaults to ``sys.argv``).

    Shows the usage message unless exactly one filename argument is given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        showhelp()
        return
    # Binary mode: the payload is UTF-16, so no text-mode translation may
    # touch the bytes before decoding.
    with open(argv[1], 'rb') as fp:
        print('Reading file...')
        lines = fp.read().decode(infileencoding).split(u'\n')
    print('Generating Han-Viet dict...')
    hugedict = build_hanviet(lines)
    print('Generating Viet-Han dict...')
    dicthuge = invert_dict(hugedict)
    print('Writing Han-Viet dict...')
    write_dict('hanviet.txt', hugedict, u', ')
    print('Writing Viet-Han dict...')
    write_dict('viethan.txt', dicthuge, u' ')


if __name__ == '__main__':
    main()