1#!/usr/bin/python
2# Copyright (c) 2014 Wladimir J. van der Laan
3# Distributed under the MIT software license, see the accompanying
4# file COPYING or http://www.opensource.org/licenses/mit-license.php.
5'''
6Run this script from the root of the repository to update all translations from
7transifex.
8It will do the following automatically:
9
10- fetch all translations using the tx tool
11- post-process them into valid and committable format
12  - remove invalid control characters
13  - remove location tags (makes diffs less noisy)
14
15TODO:
16- auto-add new translations to the build system according to the translation process
17'''
18from __future__ import division, print_function
19import subprocess
20import re
21import sys
22import os
23import io
24import xml.etree.ElementTree as ET
25
# Name (or path) of the transifex command-line client; used as the executable
# when pulling translations
TX = 'tx'
# File name of the source language file; it is skipped when iterating over
# translation files
SOURCE_LANG = 'bitcoin_en.ts'
# Directory with locale (.ts) files, relative to the repository root
LOCALE_DIR = 'src/qt/locale'
# Minimum number of messages that must remain after post-processing for a
# translation to be written back at all
MIN_NUM_MESSAGES = 10
34
def check_at_repository_root():
    '''Abort unless the current working directory is the repository root.

    The root is recognised by the presence of a '.git' entry in the current
    directory. When it is missing, a two-line diagnostic is written to stderr
    and the process exits with status 1. Returns None otherwise.
    '''
    if not os.path.exists('.git'):
        # Both diagnostic lines go to stderr, so stdout never carries errors.
        print('No .git directory found', file=sys.stderr)
        print('Execute this script at the root of the repository', file=sys.stderr)
        # sys.exit rather than the bare exit(): the latter is injected by the
        # site module for interactive sessions and may be absent (python -S).
        sys.exit(1)
40
def fetch_all_translations():
    '''Fetch all translations using the transifex client.

    Invokes the tool named by TX with the arguments 'pull -f -a'. If the tool
    cannot be started, or it returns a non-zero status, an error is printed
    to stderr and the process exits with status 1.
    '''
    try:
        status = subprocess.call([TX, 'pull', '-f', '-a'])
    except OSError as err:
        # Raised when the executable is missing or cannot be run; report it
        # the same way as a failed pull instead of dumping a traceback.
        print('Error while fetching translations: cannot run %s (%s)' % (TX, err), file=sys.stderr)
        sys.exit(1)
    if status:
        print('Error while fetching translations', file=sys.stderr)
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive use only.
        sys.exit(1)
45
def find_format_specifiers(s):
    '''Return the character that follows each '%' in s, in order.

    Scanning resumes two characters after every '%', so '%%' contributes a
    single '%' entry. A '%' that is the last character of s has nothing after
    it and raises IndexError; callers rely on that to detect malformed input.
    '''
    found = []
    marker = s.find('%')
    while marker >= 0:
        # Indexing (not slicing) on purpose: a trailing '%' must raise.
        found.append(s[marker + 1])
        marker = s.find('%', marker + 2)
    return found
57
def split_format_specifiers(specifiers):
    '''Partition specifiers into numeric (Qt) ones and all others (strprintf).

    Returns a tuple (numeric, other): numeric is a set, because Qt
    placeholders may appear in any order; other is a list, because strprintf
    specifiers must keep the order in which they were given.
    '''
    qt_digits = frozenset('123456789')
    qt_placeholders = set()
    printf_style = []
    for spec in specifiers:
        if spec in qt_digits:
            qt_placeholders.add(spec)
        else:
            printf_style.append(spec)
    return qt_placeholders, printf_style
70
def sanitize_string(s):
    '''Return s with every newline turned into a space, for one-line printing'''
    pieces = s.split('\n')
    return ' '.join(pieces)
74
def check_format_specifiers(source, translation, errors, numerus):
    '''Check that translation uses the same format specifiers as source.

    Returns True when the specifiers match, or when a numerus translation
    merely leaves out the source's sole %n. Otherwise a human-readable message
    is appended to the errors list and False is returned.
    '''
    expected = split_format_specifiers(find_format_specifiers(source))
    qt_specs, printf_specs = expected
    # A source message must never mix Qt and strprintf specifiers; if this
    # trips, change the source string, as mixing is hacky and confusing.
    assert not (qt_specs and printf_specs)

    try:
        actual = split_format_specifiers(find_format_specifiers(translation))
    except IndexError:
        # A dangling '%' at the end of the translation cannot be parsed.
        errors.append("Parse error in translation for '%s': '%s'" % (sanitize_string(source), sanitize_string(translation)))
        return False

    if expected == actual:
        return True

    # Numerus translations may omit the %n specifier (usually when the form
    # only covers one possible value).
    omits_only_n = (
        numerus
        and expected == (set(), ['n'])
        and actual == (set(), [])
        and '%' not in translation
    )
    if omits_only_n:
        return True

    errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
    return False
93
def all_ts_files(suffix=''):
    '''Yield (filename, filepath) for each translation file in LOCALE_DIR.

    Only entries ending in '.ts' + suffix are considered, and the source
    language file is skipped. The suffix, if given, is stripped from the
    yielded filename, and filepath is built from that stripped name.
    '''
    wanted_ending = '.ts' + suffix
    source_entry = SOURCE_LANG + suffix
    for entry in os.listdir(LOCALE_DIR):
        # only language files qualify, and never the source language itself
        if entry == source_entry or not entry.endswith(wanted_ending):
            continue
        name = entry[:-len(suffix)] if suffix else entry
        yield (name, os.path.join(LOCALE_DIR, name))
103
# Control bytes stripped from raw translation data: everything below 0x20
# except line feed (0x0a) and carriage return (0x0d).
# NOTE(review): the class also matches tab (0x09) -- confirm that is intended.
FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')
def remove_invalid_characters(s):
    '''Return the bytes s with every byte matched by FIX_RE deleted'''
    cleaned = re.sub(FIX_RE, b'', s)
    return cleaned
108
# Replacement for ElementTree's cdata escape function that makes our output
# match Qt's. Optional, only for cleaner diffs when comparing; disabled by
# default. _orig_escape_cdata holds the function being wrapped and must be
# set before escape_cdata is used.
_orig_escape_cdata = None
def escape_cdata(text):
    '''Escape text with the wrapped function, then also escape both quote kinds'''
    escaped = _orig_escape_cdata(text)
    for raw, entity in (("'", '&apos;'), ('"', '&quot;')):
        escaped = escaped.replace(raw, entity)
    return escaped
117
def _install_qt_escape_cdata():
    '''Make ElementTree escape text like Qt does; safe to call repeatedly.'''
    global _orig_escape_cdata
    # Installing twice would record escape_cdata itself as the "original" and
    # make it call itself forever, so only patch while not yet patched.
    if ET._escape_cdata is not escape_cdata:
        _orig_escape_cdata = ET._escape_cdata
        ET._escape_cdata = escape_cdata


def _load_ts_tree(orig_path):
    '''Parse the file at orig_path after stripping control characters from it.'''
    # need to override encoding because 'utf8' is not understood only 'utf-8'
    parser = ET.XMLParser(encoding='utf-8')
    with open(orig_path, 'rb') as f:
        data = f.read()
    # remove control characters; this must be done over the entire file
    # otherwise the XML parser will fail
    data = remove_invalid_characters(data)
    return ET.parse(io.BytesIO(data), parser=parser)


def _postprocess_message(filename, context, message):
    '''Check one <message>, strip its location tags, drop it if unfinished.

    Returns True if any of its translations failed the specifier check.
    '''
    numerus = message.get('numerus') == 'yes'
    source = message.find('source').text
    translation_node = message.find('translation')
    # pick all numerusforms
    if numerus:
        translations = [form.text for form in translation_node.findall('numerusform')]
    else:
        translations = [translation_node.text]

    found_invalid = False
    for translation in translations:
        if translation is None:
            continue
        errors = []
        valid = check_format_specifiers(source, translation, errors, numerus)

        for error in errors:
            print('%s: %s' % (filename, error))

        if not valid:  # set type to unfinished and clear string if invalid
            translation_node.clear()
            translation_node.set('type', 'unfinished')
            found_invalid = True

    # Remove location tags (makes diffs less noisy)
    for location in message.findall('location'):
        message.remove(location)

    # Remove entire message if it is an unfinished translation
    if translation_node.get('type') == 'unfinished':
        context.remove(message)
    return found_invalid


def _write_ts_tree(tree, filepath, reduce_diff_hacks):
    '''Serialize tree to filepath, optionally mimicking Qt's tag formatting.'''
    if reduce_diff_hacks:
        # replace some XML to 'sanitize' it to Qt formatting
        out = io.BytesIO()
        tree.write(out, encoding='utf-8')
        out = out.getvalue().replace(b' />', b'/>')
        with open(filepath, 'wb') as f:
            f.write(out)
    else:
        tree.write(filepath, encoding='utf-8')


def postprocess_translations(reduce_diff_hacks=False):
    '''Clean up every translation file in the locale directory.

    Each translation file is renamed to '<name>.orig', then re-read from
    there: control characters are stripped, translations whose format
    specifiers do not match the source are reported, cleared and marked
    unfinished, location tags are removed, and unfinished messages are
    dropped. The result is written back under the original name unless
    fewer than MIN_NUM_MESSAGES messages remain. The '.orig' files are left
    in place.

    reduce_diff_hacks -- when true, patch ElementTree's text escaping and
        rewrite ' />' as '/>' so the output is closer to what Qt writes.
        The patch stays installed after this function returns.

    Returns True if at least one translation failed the format check.
    '''
    print('Checking and postprocessing...')

    if reduce_diff_hacks:
        _install_qt_escape_cdata()

    # Move every translation aside; cleaned versions are written back under
    # the original names below.
    for (_, filepath) in all_ts_files():
        os.rename(filepath, filepath+'.orig')

    have_errors = False
    for (filename, filepath) in all_ts_files('.orig'):
        # pre-fixups to cope with transifex output
        tree = _load_ts_tree(filepath + '.orig')

        # iterate over all messages in file
        root = tree.getroot()
        for context in root.findall('context'):
            for message in context.findall('message'):
                if _postprocess_message(filename, context, message):
                    have_errors = True

        # check if document is (virtually) empty, and skip writing it if so;
        # since the file was renamed above, this effectively removes it
        num_messages = sum(len(context.findall('message')) for context in root.findall('context'))
        if num_messages < MIN_NUM_MESSAGES:
            print('Removing %s, as it contains only %i messages' % (filepath, num_messages))
            continue

        # write fixed-up tree
        _write_ts_tree(tree, filepath, reduce_diff_hacks)
    return have_errors
195
if __name__ == '__main__':
    # Pipeline stages, run strictly in this order: verify the working
    # directory, pull from transifex, then clean up what was pulled.
    pipeline = (
        check_at_repository_root,
        fetch_all_translations,
        postprocess_translations,
    )
    for stage in pipeline:
        stage()
200
201