1#! /usr/bin/env python
2# -*- coding: utf-8 -*-
3# Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
4
5"""Generate binary message catalog from textual translation description.
6
7This program converts a textual Uniforum-style message catalog (.po file) into
8a binary GNU catalog (.mo file).  This is essentially the same function as the
9GNU msgfmt program, however, it is a simpler implementation.
10
11Usage: msgfmt.py [OPTIONS] filename.po
12
13Options:
14    -o file
15    --output-file=file
16        Specify the output file to write to.  If omitted, output will go to a
17        file named filename.mo (based off the input file name).
18
19    -h
20    --help
21        Print this message and exit.
22
23    -V
24    --version
25        Display version information and exit.
26"""
27
28from __future__ import print_function
29import os
30import sys
31import getopt
32import struct
33import array
34import re
35import codecs
36from email.parser import HeaderParser
37
# Version string printed by the -V / --version command-line option.
__version__ = "1.2"

# Catalog under construction: maps each msgid to its translation.  Entries
# are stored by add() and serialised by generate().
MESSAGES = {}
41
42
43
def usage(code, msg=''):
    """Print the module help text to stderr, then exit with status *code*.

    The module docstring is always written first.  When *msg* is truthy it is
    written on a line of its own after the help text.  The function never
    returns: it ends by calling ``sys.exit(code)``.
    """
    stream = sys.stderr
    texts = [__doc__, msg] if msg else [__doc__]
    for text in texts:
        print(text, file=stream)
    sys.exit(code)
49
50
51
def add(id, str, fuzzy):
    """Store a translation in MESSAGES unless it is fuzzy or empty.

    *id* is the message key and *str* its translation.  Nothing is stored
    when *fuzzy* is truthy or when *str* is empty; otherwise any previous
    entry for *id* is overwritten.
    """
    if fuzzy or not str:
        return
    MESSAGES[id] = str
57
def dequote(s):
    """Return *s* without one pair of matching surrounding quotes.

    The quotes are removed only when *s* is at least two characters long and
    begins and ends with the same quote character (``'`` or ``"``).  Any
    other input, including the empty string and a lone quote character, is
    returned unchanged.
    """
    # The length check keeps an empty string from raising IndexError and
    # keeps a single quote character from being treated as a quoted pair.
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
        return s[1:-1]
    return s
62
# decode_escapes from http://stackoverflow.com/a/24519338
# The hex alternatives only accept hexadecimal digits, so a malformed sequence
# such as "\xzz" is not matched and is left in the text as-is instead of
# making the unicode-escape codec raise UnicodeDecodeError.
ESCAPE_SEQUENCE_RE = re.compile(r'''
    ( \\U[0-9A-Fa-f]{8}   # 8-digit hex escapes
    | \\u[0-9A-Fa-f]{4}   # 4-digit hex escapes
    | \\x[0-9A-Fa-f]{2}   # 2-digit hex escapes
    | \\[0-7]{1,3}        # Octal escapes
    | \\N\{[^}]+\}        # Unicode characters by name
    | \\[\\'"abfnrtv]     # Single-character escapes
    )''', re.UNICODE | re.VERBOSE)

def decode_escapes(s):
    """Return *s* with backslash escape sequences replaced by their characters.

    Only the sequences matched by ESCAPE_SEQUENCE_RE are decoded, each one
    individually through the ``unicode-escape`` codec; all other text
    (including non-ASCII characters) is passed through untouched.
    """
    def decode_match(match):
        return codecs.decode(match.group(0), 'unicode-escape')

    return ESCAPE_SEQUENCE_RE.sub(decode_match, s)
78
79
def generate():
    """Return the binary .mo catalog for the entries currently in MESSAGES.

    The result consists of, in order: a header of seven integers (magic
    number, version, entry count, offset of the key index, offset of the
    value index, and a zero size/offset pair because no hash table is
    written), the key index, the value index, the NUL-terminated keys and
    the NUL-terminated values.  Integers are packed in native byte order
    with ``struct``/``array`` type code "i"; the offsets computed here
    assume that type is 4 bytes wide.
    """
    # the keys are sorted in the .mo file
    keys = sorted(MESSAGES)
    offsets = []
    id_parts = []
    str_parts = []
    id_pos = str_pos = 0
    for key in keys:
        value = MESSAGES[key]
        # For each string, we need size and file offset.  Each string is NUL
        # terminated; the NUL does not count into the size.
        offsets.append((id_pos, len(key), str_pos, len(value)))
        id_parts.append(key + b'\0')
        str_parts.append(value + b'\0')
        id_pos += len(key) + 1
        str_pos += len(value) + 1
    # Join once instead of growing a bytes object inside the loop.
    ids = b''.join(id_parts)
    strs = b''.join(str_parts)
    # The header is 7 32-bit unsigned integers.  We don't use hash tables, so
    # the keys start right after the index tables (16 bytes per entry: one
    # length/offset pair for the key and one for the value).
    keystart = 7*4+16*len(keys)
    # and the values start after the keys
    valuestart = keystart + len(ids)
    koffsets = []
    voffsets = []
    # The string table first has the list of keys, then the list of values.
    # Each entry has first the size of the string, then the file offset.
    for o1, l1, o2, l2 in offsets:
        koffsets += [l1, o1+keystart]
        voffsets += [l2, o2+valuestart]
    offsets = koffsets + voffsets
    output = struct.pack("Iiiiiii",
                         0x950412de,       # Magic
                         0,                 # Version
                         len(keys),         # # of entries
                         7*4,               # start of key index
                         7*4+len(keys)*8,   # start of value index
                         0, 0)              # size and offset of hash table
    offsdata = array.array("i", offsets)
    # array.tobytes() is not available on older interpreters; fall back to
    # tostring() there.
    output += offsdata.tobytes() if hasattr(offsdata, "tobytes") else offsdata.tostring()
    output += ids
    output += strs
    return output
120
121
122
def make(filename, outfile):
    """Compile the .po catalog *filename* into a binary .mo file.

    *filename* gets a ``.po`` suffix appended when it does not already end
    with one.  When *outfile* is None the output path is the input path with
    its extension replaced by ``.mo``.

    Each parsed entry is handed to add() (which stores it in MESSAGES), and
    the bytes returned by generate() are written to the output file.

    The input is decoded as Latin-1 until a charset is found in the catalog
    header (the entry with an empty msgid); from then on that charset is used
    for decoding lines and encoding the stored msgids/msgstrs.

    Exits through ``sys.exit(1)`` when the input cannot be read or a syntax
    error is found.  A failure to write the output is only reported on
    stderr.  Line numbers in diagnostics are 1-based.
    """
    ID = 1
    STR = 2

    # Compute .mo name from .po name and arguments
    if filename.endswith('.po'):
        infile = filename
    else:
        infile = filename + '.po'
    if outfile is None:
        outfile = os.path.splitext(infile)[0] + '.mo'

    try:
        # Context manager so the handle is closed even if reading fails.
        with open(infile, 'rb') as fp:
            lines = fp.readlines()
    except IOError as msg:
        print(msg, file=sys.stderr)
        sys.exit(1)

    section = None
    fuzzy = 0
    empty = 0
    header_attempted = False

    # Start off assuming Latin-1, so everything decodes without failure,
    # until we know the exact encoding
    encoding = 'latin-1'

    # Parse the catalog
    for lno, l in enumerate(lines):
        l = l.decode(encoding)
        # If we get a comment line after a msgstr, this is a new entry
        if l[0] == '#' and section == STR:
            add(msgid, msgstr, fuzzy)
            section = None
            fuzzy = 0
        # Record a fuzzy mark
        if l[:2] == '#,' and 'fuzzy' in l:
            fuzzy = 1
        # Skip comments
        if l[0] == '#':
            continue
        # Now we are in a msgid section, output previous section
        if l.startswith('msgid') and not l.startswith('msgid_plural'):
            if section == STR:
                add(msgid, msgstr, fuzzy)
                if not msgid:
                    # See whether there is an encoding declaration
                    p = HeaderParser()
                    charset = p.parsestr(msgstr.decode(encoding)).get_content_charset()
                    if charset:
                        encoding = charset
            section = ID
            l = l[5:]
            msgid = msgstr = b''
            is_plural = False
            if l.strip() == '""':
                # An empty msgid directly followed by msgstr is a truly empty
                # msgid (not the first line of a multi-line one).  The bounds
                # check covers a file whose last line is 'msgid ""'.
                if (lno + 1 < len(lines)
                        and lines[lno+1].decode(encoding).startswith('msgstr')):
                    # The first such entry is the header, which may contain
                    # the encoding declaration.  Any further one makes the
                    # file invalid, so warn from the second occurrence on.
                    if empty >= 1:
                        print("Found multiple empty msgids on line " + str(lno + 1) + ", not valid!",
                              file=sys.stderr)
                    empty += 1
        # This is a message with plural forms
        elif l.startswith('msgid_plural'):
            if section != ID:
                print('msgid_plural not preceded by msgid on %s:%d' % (infile, lno + 1),
                      file=sys.stderr)
                sys.exit(1)
            l = l[12:]
            msgid += b'\0' # separator of singular and plural
            is_plural = True
        # Now we are in a msgstr section
        elif l.startswith('msgstr'):
            section = STR
            if l.startswith('msgstr['):
                if not is_plural:
                    print('plural without msgid_plural on %s:%d' % (infile, lno + 1),
                          file=sys.stderr)
                    sys.exit(1)
                l = l.split(']', 1)[1]
                if msgstr:
                    msgstr += b'\0' # Separator of the various plural forms
            else:
                if (l[6:].strip() == '""') and (empty == 1) and (not header_attempted):
                    header = ""
                    # parse up until next empty line = end of header
                    hdrno = lno
                    while(hdrno < len(lines)-1):
                        # This is a roundabout way to strip non-ASCII unicode characters from the header.
                        # As we are only parsing out the encoding, we don't need any unicode chars in it.
                        # A separate name is used so this look-ahead does not
                        # overwrite `l`, the msgstr line still being processed.
                        hdr_line = lines[hdrno+1].decode('unicode_escape').encode('ascii','ignore').decode(encoding)
                        if hdr_line.strip():
                            header += decode_escapes(dequote(hdr_line.strip()))
                        else:
                            break
                        hdrno += 1
                    # See whether there is an encoding declaration
                    if(hdrno > lno):
                        p = HeaderParser()
                        charset = p.parsestr(str(header)).get_content_charset()
                        header_attempted = True
                        if charset:
                            encoding = charset
                if is_plural:
                    print('indexed msgstr required for plural on  %s:%d' % (infile, lno + 1),
                          file=sys.stderr)
                    sys.exit(1)
                l = l[6:]
        # Skip empty lines
        l = l.strip()
        if not l:
            continue
        l = decode_escapes(dequote(l)) # strip quotes and replace newlines if present
        if section == ID:
            msgid += l.encode(encoding)
        elif section == STR:
            msgstr += l.encode(encoding)
        else:
            print('Syntax error on %s:%d' % (infile, lno + 1), \
                  'before:', file=sys.stderr)
            print(l, file=sys.stderr)
            sys.exit(1)
    # Add last entry
    if section == STR:
        add(msgid, msgstr, fuzzy)

    # Compute output
    output = generate()

    try:
        # Context manager so the data is flushed and the handle closed.
        with open(outfile, "wb") as fp:
            fp.write(output)
    except IOError as msg:
        print(msg, file=sys.stderr)
261
262
263
def main():
    """Command-line entry point.

    Parses ``sys.argv[1:]`` with getopt.  ``-h``/``--help`` prints the usage
    text and exits with status 0, ``-V``/``--version`` prints the version and
    exits with status 0, and ``-o``/``--output-file`` sets the output path.
    An unparsable command line is reported through usage() with status 1.
    make() is then called for every remaining argument with the selected
    output path; with no arguments a hint is printed to stderr and the
    function returns.
    """
    short_flags = 'hVo:'
    long_flags = ['help', 'version', 'output-file=']
    try:
        options, inputs = getopt.getopt(sys.argv[1:], short_flags, long_flags)
    except getopt.error as msg:
        usage(1, msg)

    target = None
    # parse options
    for flag, value in options:
        if flag == '-h' or flag == '--help':
            usage(0)
        elif flag == '-V' or flag == '--version':
            print("msgfmt.py", __version__)
            sys.exit(0)
        elif flag == '-o' or flag == '--output-file':
            target = value

    # do it
    if len(inputs) == 0:
        print('No input file given', file=sys.stderr)
        print("Try `msgfmt --help' for more information.", file=sys.stderr)
        return

    for po_name in inputs:
        make(po_name, target)
289
290
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
293