#!/usr/bin/python
# Copyright (c) 2014 Wladimir J. van der Laan
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
'''
Run this script from the root of the repository to update all translations from
transifex.
It will do the following automatically:

- fetch all translations using the tx tool
- post-process them into valid and committable format
  - remove invalid control characters
  - remove location tags (makes diffs less noisy)

TODO:
- auto-add new translations to the build system according to the translation process
'''
from __future__ import division, print_function
import subprocess
import re
import sys
import os
import io
import xml.etree.ElementTree as ET

# Name of transifex tool
TX = 'tx'
# Name of source language file
SOURCE_LANG = 'bitcoin_en.ts'
# Directory with locale files
LOCALE_DIR = 'src/qt/locale'
# Minimum number of messages for translation to be considered at all
MIN_NUM_MESSAGES = 10
# Characters that count as numeric (Qt-style) format specifiers after a '%'
_QT_SPECIFIERS = frozenset('123456789')

def check_at_repository_root():
    '''Exit with status 1 unless a '.git' entry exists in the current directory.'''
    if not os.path.exists('.git'):
        print('No .git directory found', file=sys.stderr)
        print('Execute this script at the root of the repository', file=sys.stderr)
        # sys.exit instead of the builtin exit(): the latter is injected by the
        # site module and is not guaranteed to exist.
        sys.exit(1)

def fetch_all_translations():
    '''Run the transifex tool's pull command; exit with status 1 if it reports failure.'''
    if subprocess.call([TX, 'pull', '-f', '-a']):
        print('Error while fetching translations', file=sys.stderr)
        sys.exit(1)

def find_format_specifiers(s):
    '''Find all format specifiers in a string.

    Returns a list holding the single character that follows each '%' in s, in
    order of appearance. The character after a '%' is consumed, so '%%' yields
    one entry ('%').

    Raises IndexError if s ends with a '%' that has no following character;
    check_format_specifiers relies on this to detect malformed translations.
    '''
    pos = 0
    specifiers = []
    while True:
        percent = s.find('%', pos)
        if percent < 0:
            break
        specifiers.append(s[percent+1])
        # skip the '%' and the specifier character itself
        pos = percent+2
    return specifiers

def split_format_specifiers(specifiers):
    '''Split format specifiers between numeric (Qt) and others (strprintf).

    Returns a tuple (numeric, other): numeric is a set of the digit specifiers
    '1'..'9', other is a list of the remaining specifiers in their original order.
    '''
    numeric = []
    other = []
    for s in specifiers:
        if s in _QT_SPECIFIERS:
            numeric.append(s)
        else:
            other.append(s)

    # numeric (Qt) can be present in any order, others (strprintf) must be in specified order
    return set(numeric), other

def sanitize_string(s):
    '''Sanitize string for printing: replace newlines with spaces.'''
    return s.replace('\n', ' ')

def check_format_specifiers(source, translation, errors, numerus):
    '''Check that a translation uses the same format specifiers as its source.

    source      -- source message text
    translation -- translated message text
    errors      -- list; a description is appended to it when the check fails
    numerus     -- True if the message is a numerus (plural) form

    Returns True if the translation is acceptable, False otherwise.
    Raises AssertionError if the source mixes Qt and strprintf specifiers, and
    IndexError if the source itself ends with a dangling '%'.
    '''
    source_f = split_format_specifiers(find_format_specifiers(source))
    # No source message may contain both Qt and strprintf format specifiers.
    # If this fails, go change the source as this is hacky and confusing!
    # Raised explicitly (not via the assert statement) so it survives python -O.
    if source_f[0] and source_f[1]:
        raise AssertionError("Source message mixes Qt and strprintf format specifiers: '%s'" % sanitize_string(source))
    try:
        translation_f = split_format_specifiers(find_format_specifiers(translation))
    except IndexError:
        # translation ends with a '%' that has nothing after it
        errors.append("Parse error in translation for '%s': '%s'" % (sanitize_string(source), sanitize_string(translation)))
        return False

    if source_f == translation_f:
        return True
    if numerus and source_f == (set(), ['n']) and translation_f == (set(), []) and translation.find('%') == -1:
        # Allow numerus translations to omit %n specifier (usually when it only has one possible value)
        return True
    errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
    return False

def all_ts_files(suffix=''):
    '''Yield (filename, filepath) for every translation file in LOCALE_DIR.

    Only names ending in '.ts' + suffix are considered and the source language
    file is skipped. The suffix is stripped from the yielded filename and
    filepath, so callers must re-append it to reach the file on disk.
    '''
    for filename in os.listdir(LOCALE_DIR):
        # process only language files, and do not process source language
        if not filename.endswith('.ts'+suffix) or filename == SOURCE_LANG+suffix:
            continue
        if suffix: # remove provided suffix
            filename = filename[0:-len(suffix)]
        filepath = os.path.join(LOCALE_DIR, filename)
        yield filename, filepath

FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')
def remove_invalid_characters(s):
    '''Remove invalid characters from translation string.

    s is a bytes object; every byte matched by FIX_RE is dropped.
    '''
    return FIX_RE.sub(b'', s)

# Override cdata escape function to make our output match Qt's (optional, just for cleaner diffs for
# comparison, disable by default)
_orig_escape_cdata = None
def escape_cdata(text):
    '''Escape text like ElementTree does, then also escape single and double quotes.

    Uses the escape function saved by postprocess_translations when the
    override is installed, and ElementTree's own function otherwise.
    '''
    escape = _orig_escape_cdata if _orig_escape_cdata is not None else ET._escape_cdata
    text = escape(text)
    text = text.replace("'", '&apos;')
    text = text.replace('"', '&quot;')
    return text

def postprocess_translations(reduce_diff_hacks=False):
    '''Validate and clean up every fetched translation file in LOCALE_DIR.

    Each file is renamed to '<name>.orig', stripped of control characters,
    parsed, and rewritten under its original name with location tags and
    unfinished messages removed. Translations whose format specifiers do not
    match the source are cleared and marked unfinished. Files left with fewer
    than MIN_NUM_MESSAGES messages are not written back.

    reduce_diff_hacks -- if True, tweak the XML output (quote escaping and
                         self-closing tag spacing) to resemble Qt's formatting.

    Returns True if any translation failed the format specifier check.
    '''
    global _orig_escape_cdata
    print('Checking and postprocessing...')

    # Install the override only once: saving an already-patched function as the
    # "original" would make escape_cdata call itself forever.
    if reduce_diff_hacks and ET._escape_cdata is not escape_cdata:
        _orig_escape_cdata = ET._escape_cdata
        ET._escape_cdata = escape_cdata

    for (filename, filepath) in all_ts_files():
        os.rename(filepath, filepath+'.orig')

    have_errors = False
    for (filename, filepath) in all_ts_files('.orig'):
        # pre-fixups to cope with transifex output
        parser = ET.XMLParser(encoding='utf-8') # need to override encoding because 'utf8' is not understood only 'utf-8'
        # all_ts_files strips the suffix, so add it back to open the renamed file
        with open(filepath + '.orig', 'rb') as f:
            data = f.read()
        # remove control characters; this must be done over the entire file otherwise the XML parser will fail
        data = remove_invalid_characters(data)
        tree = ET.parse(io.BytesIO(data), parser=parser)

        # iterate over all messages in file
        root = tree.getroot()
        for context in root.findall('context'):
            # findall returns a list, so removing messages below is safe
            for message in context.findall('message'):
                numerus = message.get('numerus') == 'yes'
                source = message.find('source').text
                translation_node = message.find('translation')
                # pick all numerusforms
                if numerus:
                    translations = [i.text for i in translation_node.findall('numerusform')]
                else:
                    translations = [translation_node.text]

                for translation in translations:
                    if translation is None:
                        continue
                    errors = []
                    valid = check_format_specifiers(source, translation, errors, numerus)

                    for error in errors:
                        print('%s: %s' % (filename, error))

                    if not valid: # set type to unfinished and clear string if invalid
                        translation_node.clear()
                        translation_node.set('type', 'unfinished')
                        have_errors = True

                # Remove location tags
                for location in message.findall('location'):
                    message.remove(location)

                # Remove entire message if it is an unfinished translation
                if translation_node.get('type') == 'unfinished':
                    context.remove(message)

        # check if document is (virtually) empty, and remove it if so
        num_messages = sum(len(context.findall('message')) for context in root.findall('context'))
        if num_messages < MIN_NUM_MESSAGES:
            # nothing is written back, so only the '.orig' copy remains on disk
            print('Removing %s, as it contains only %i messages' % (filepath, num_messages))
            continue

        # write fixed-up tree
        # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
        if reduce_diff_hacks:
            out = io.BytesIO()
            tree.write(out, encoding='utf-8')
            out = out.getvalue()
            out = out.replace(b' />', b'/>')
            with open(filepath, 'wb') as f:
                f.write(out)
        else:
            tree.write(filepath, encoding='utf-8')
    return have_errors

if __name__ == '__main__':
    check_at_repository_root()
    fetch_all_translations()
    postprocess_translations()