#!/usr/bin/python
# vim:set fileencoding=utf-8 et sts=4 sw=4:
#
# ibus - Intelligent Input Bus for Linux / Unix OS
#
# Copyright © 2016 Takao Fujiwara <takao.fujiwara1@gmail.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see <http://www.gnu.org/licenses/>.


# This script converts ISO 639-2 of three characters to ISO 639-1 of two
# characters in simple.xml.
# E.g. "eng" to "en"


from xml.sax import make_parser as sax_make_parser
from xml.sax.handler import feature_namespaces as sax_feature_namespaces
from xml.sax.saxutils import XMLFilterBase, XMLGenerator
from xml.sax._exceptions import SAXParseException

import codecs
import getopt
import io
import os
import sys

INSTALLED_SIMPLE_XML = '/usr/share/ibus/component/simple.xml'
INSTALLED_ISO639_XML = '/usr/share/xml/iso-codes/iso_639.xml'
PY3K = sys.version_info >= (3, 0)

if PY3K:
    from io import StringIO
else:
    # io.StringIO does not work with XMLGenerator
    from cStringIO import StringIO
    # iso_639.xml includes UTF-8
    reload(sys)
    sys.setdefaultencoding('utf-8')


def usage(prgname):
    """Print the command line help for the program named *prgname*."""
    print('''\
Usage:
  %s [OPTION...]

Options:
  -h, --help                         Show this message
  -i, --input=SIMPLE_XML             Load SIMPLE_XML file (default is:
                                     %s)
  -o, --output=FILE                  Output FILE (default is stdout)
''' % (prgname, INSTALLED_SIMPLE_XML))


class ISO639XML(XMLFilterBase):
    """SAX handler that collects the code tables from iso_639.xml.

    Every 'iso_639_entry' element contributes a mapping from its
    ISO 639-2 codes (both the B and the T variant) to its ISO 639-1 code.
    """

    def __init__(self, parser=None):
        # ISO 639-2 code -> ISO 639-1 code (None when the entry has none)
        self.__code2to1 = {}
        # any ISO 639 code -> language name
        self.__codetoname = {}
        XMLFilterBase.__init__(self, parser)

    def startElement(self, name, attrs):
        """Record the codes of an 'iso_639_entry'; ignore other elements."""
        if name != 'iso_639_entry':
            return
        n = attrs.get('name')
        iso639_1 = attrs.get('iso_639_1_code')
        iso639_2b = attrs.get('iso_639_2B_code')
        iso639_2t = attrs.get('iso_639_2T_code')
        if iso639_1 is not None:
            self.__codetoname[iso639_1] = n
        if iso639_2b is not None:
            self.__code2to1[iso639_2b] = iso639_1
            self.__codetoname[iso639_2b] = n
        # The T code only needs its own entry when it differs from the B code.
        if iso639_2t is not None and iso639_2b != iso639_2t:
            self.__code2to1[iso639_2t] = iso639_1
            self.__codetoname[iso639_2t] = n

    def code2to1(self, iso639_2):
        """Return the ISO 639-1 code for *iso639_2*, or None if unknown."""
        return self.__code2to1.get(iso639_2)


class IBusComponentXML(XMLFilterBase):
    """SAX handler that copies a component XML to *downstream*.

    The text of every 'language' element is replaced with its ISO 639-1
    code when *iso639* knows one; everything else is forwarded unchanged.
    """

    def __init__(self, parser=None, downstream=None, iso639=None):
        XMLFilterBase.__init__(self, parser)
        self.__downstream = downstream
        self.__iso639 = iso639
        self.__is_language = False
        # Text chunks seen inside the current 'language' element.
        self.__language_chunks = []

    def __flush_language_text(self):
        """Convert and forward the buffered 'language' text, if any."""
        if not self.__language_chunks:
            return
        text = ''.join(self.__language_chunks)
        self.__language_chunks = []
        if self.__iso639 is not None:
            iso639_1 = self.__iso639.code2to1(text)
            if iso639_1 is not None:
                text = iso639_1
        if self.__downstream:
            self.__downstream.characters(text)

    def startDocument(self):
        if self.__downstream:
            self.__downstream.startDocument()

    def endDocument(self):
        if self.__downstream:
            self.__downstream.endDocument()

    def startElement(self, name, attrs):
        # Emit pending text first so the output keeps the document order.
        self.__flush_language_text()
        if name == 'language':
            self.__is_language = True
        if self.__downstream:
            self.__downstream.startElement(name, attrs)

    def endElement(self, name):
        self.__flush_language_text()
        if name == 'language':
            self.__is_language = False
        if self.__downstream:
            self.__downstream.endElement(name)

    def characters(self, text):
        if self.__is_language:
            # A SAX parser may deliver one text node in several chunks, so
            # the code is looked up only once the whole text is collected.
            self.__language_chunks.append(text)
            return
        if self.__downstream:
            self.__downstream.characters(text)


class ConvertEngineXML(object):
    """Load the component XML at *path* and keep the converted document.

    The file is parsed in the constructor; call write() to emit the result.
    """

    def __init__(self, path, iso639=None):
        self.__path = path
        self.__iso639 = iso639

        self.__result = StringIO()
        downstream = XMLGenerator(self.__result, 'utf-8')
        self.__load(downstream)

    def __load(self, downstream=None):
        """Parse the input file, streaming the converted XML to *downstream*.

        A malformed file is reported on stdout; whatever was generated
        before the parse error stays in the result buffer.
        """
        parser = sax_make_parser()
        parser.setFeature(sax_feature_namespaces, 0)
        self.__handler = IBusComponentXML(parser, downstream, self.__iso639)
        parser.setContentHandler(self.__handler)
        with codecs.open(self.__path, 'r', encoding='utf-8') as f:
            try:
                parser.parse(f)
            except SAXParseException:
                print('Error: Invalid file format: %s' % self.__path)

    def write(self, output=None):
        """Write the converted XML to the file *output*, or stdout if None."""
        contents = self.__result.getvalue()
        if output is not None:
            with codecs.open(output, 'w', encoding='utf-8') as od:
                od.write(contents)
            return
        if PY3K:
            # Keep anything already printed ahead of the raw buffer write.
            sys.stdout.flush()
            od = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
            od.write(contents)
            od.flush()
            # Detach so that discarding the wrapper does not close stdout.
            od.detach()
        else:
            od = codecs.getwriter('utf-8')(sys.stdout)
            od.write(contents)


def parse_iso639(path):
    """Parse the iso_639.xml file at *path* and return its ISO639XML handler.

    A malformed file is reported on stdout; the handler then holds only
    the entries read before the parse error.
    """
    parser = sax_make_parser()
    parser.setFeature(sax_feature_namespaces, 0)
    handler = ISO639XML(parser)
    parser.setContentHandler(handler)
    with codecs.open(path, 'r', encoding='utf-8') as f:
        try:
            parser.parse(f)
        except SAXParseException:
            print('Error: Invalid file format: %s' % path)
    return handler


def main(argv=None):
    """Run the converter with the command line *argv* (default: sys.argv)."""
    if argv is None:
        argv = sys.argv
    prgname = os.path.basename(argv[0])
    try:
        opts, args = getopt.getopt(argv[1:],
                                   'hi:o:',
                                   ['help', 'input=', 'output='])
    except getopt.GetoptError as err:
        print(err)
        usage(prgname)
        sys.exit(2)
    if args:
        usage(prgname)
        sys.exit(2)
    input_path = INSTALLED_SIMPLE_XML
    output = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(prgname)
            sys.exit()
        elif opt in ('-i', '--input'):
            input_path = arg
        elif opt in ('-o', '--output'):
            output = arg

    iso639 = parse_iso639(INSTALLED_ISO639_XML)
    xml = ConvertEngineXML(input_path, iso639)
    xml.write(output)


if __name__ == '__main__':
    main()