1#! /usr/bin/env python3
2# Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
3
4"""Generate binary message catalog from textual translation description.
5
6This program converts a textual Uniforum-style message catalog (.po file) into
7a binary GNU catalog (.mo file).  This is essentially the same function as the
GNU msgfmt program, however, it is a simpler implementation.  It handles
both plural forms and message contexts.
10
11Usage: msgfmt.py [OPTIONS] filename.po
12
13Options:
14    -o file
15    --output-file=file
16        Specify the output file to write to.  If omitted, output will go to a
17        file named filename.mo (based off the input file name).
18
19    -h
20    --help
21        Print this message and exit.
22
23    -V
24    --version
25        Display version information and exit.
26"""
27
28import os
29import sys
30import ast
31import getopt
32import struct
33import array
34from email.parser import HeaderParser
35
# Version string printed by the -V / --version option.
__version__ = '1.2'

# Catalog being built: maps each message key (bytes) to its translation
# (bytes).  Filled by add() and serialized by generate().
MESSAGES = dict()
39
40
def usage(code, msg=''):
    """Print the module help text to stderr and exit.

    code -- process exit status handed to sys.exit().
    msg  -- optional extra message printed after the help text; skipped
            when it is falsy.
    """
    texts = [__doc__]
    if msg:
        texts.append(msg)
    for text in texts:
        print(text, file=sys.stderr)
    sys.exit(code)
46
47
def add(ctxt, id, str, fuzzy):
    """Store one translation in MESSAGES unless it is fuzzy or empty.

    ctxt  -- message context as bytes, or None when the entry has none.
    id    -- message id as bytes.
    str   -- translation as bytes; an empty translation is not stored.
    fuzzy -- truthy when the entry is marked fuzzy; such entries are not
             stored.

    Without a context the key is the id itself; with one, the key is the
    context and the id joined by the byte 0x04.
    """
    if fuzzy or not str:
        return
    key = id if ctxt is None else b"%b\x04%b" % (ctxt, id)
    MESSAGES[key] = str
56
57
def generate():
    """Return the binary .mo catalog for the current MESSAGES table as bytes.

    Layout of the result:
      * a header of seven 32-bit unsigned integers (magic, version, number
        of entries, offset of the key index, offset of the value index,
        size and offset of the hash table -- both 0, no hash table is
        written);
      * the key index, then the value index: one (length, file offset)
        pair of 32-bit unsigned integers per entry;
      * all keys, then all values, each terminated by a NUL byte that is
        not counted in the recorded length.

    Integers are written in the machine's native byte order, which readers
    detect from the magic number.
    """
    # The keys are sorted in the .mo file.
    keys = sorted(MESSAGES)
    values = [MESSAGES[key] for key in keys]

    # Each string is NUL terminated.  Joining once avoids the quadratic
    # cost of growing a bytes object inside a loop.
    ids = b''.join(key + b'\0' for key in keys)
    strs = b''.join(value + b'\0' for value in values)

    # The header is 7 32-bit unsigned integers.  We don't use hash tables, so
    # the keys start right after the two index tables (2 * 8 bytes per entry).
    header_size = 7 * 4
    keystart = header_size + 16 * len(keys)
    # ... and the values start after the keys.
    valuestart = keystart + len(ids)

    # Each index entry has first the size of the string, then its file offset.
    koffsets = []
    voffsets = []
    kpos = keystart
    vpos = valuestart
    for key, value in zip(keys, values):
        koffsets += [len(key), kpos]
        voffsets += [len(value), vpos]
        kpos += len(key) + 1      # + 1 skips the terminating NUL
        vpos += len(value) + 1
    index = koffsets + voffsets

    # "=" selects native byte order with *standard* sizes, so every field is
    # exactly 4 bytes regardless of the platform's C int size.
    output = struct.pack("=7I",
                         0x950412de,                  # Magic
                         0,                           # Version
                         len(keys),                   # # of entries
                         header_size,                 # start of key index
                         header_size + len(keys) * 8, # start of value index
                         0, 0)                        # size and offset of hash table
    output += struct.pack("=%dI" % len(index), *index)
    output += ids
    output += strs
    return output
97
98
def make(filename, outfile):
    """Compile one .po catalog into a binary .mo file.

    filename -- path of the input catalog; '.po' is appended when the name
                does not already end with it.
    outfile  -- path of the .mo file to write, or None to derive it from
                the input name by replacing the extension with '.mo'.

    Parsed entries are collected through add() into the module-level
    MESSAGES table and serialized with generate().  The process exits with
    status 1 (sys.exit) when the input cannot be read or is structurally
    invalid; a failure to write the output is only reported on stderr.
    """
    ID = 1
    STR = 2
    CTXT = 3

    # A per-input output file must contain only that input's messages, so
    # drop whatever a previous call left in the table.  With an explicit
    # outfile the table is kept, so several inputs compiled to the same
    # output file still accumulate into one catalog.
    if outfile is None:
        MESSAGES.clear()

    # Compute .mo name from .po name and arguments
    infile = filename if filename.endswith('.po') else filename + '.po'
    if outfile is None:
        outfile = os.path.splitext(infile)[0] + '.mo'

    try:
        with open(infile, 'rb') as f:
            lines = f.readlines()
    except IOError as msg:
        print(msg, file=sys.stderr)
        sys.exit(1)

    section = msgctxt = None
    fuzzy = 0
    # None means "no msgid seen yet"; lets a stray msgstr be reported
    # cleanly instead of failing on an unbound local.
    msgid = msgstr = None
    is_plural = False

    # Start off assuming Latin-1, so everything decodes without failure,
    # until we know the exact encoding
    encoding = 'latin-1'

    # Parse the catalog
    for lno, raw in enumerate(lines, 1):
        l = raw.decode(encoding)
        if l.startswith('#'):
            # If we get a comment line after a msgstr, this is a new entry
            if section == STR:
                add(msgctxt, msgid, msgstr, fuzzy)
                section = msgctxt = None
                fuzzy = 0
            # Record a fuzzy mark
            if l.startswith('#,') and 'fuzzy' in l:
                fuzzy = 1
            # Comments carry no message text
            continue
        # Now we are in a msgid or msgctxt section, output previous section
        if l.startswith('msgctxt'):
            if section == STR:
                add(msgctxt, msgid, msgstr, fuzzy)
                # No comment separated the entries, so the fuzzy mark seen
                # so far belonged to the entry just stored.
                fuzzy = 0
            section = CTXT
            l = l[7:]
            msgctxt = b''
        elif l.startswith('msgid') and not l.startswith('msgid_plural'):
            if section == STR:
                add(msgctxt, msgid, msgstr, fuzzy)
                if not msgid:
                    # See whether there is an encoding declaration
                    p = HeaderParser()
                    charset = p.parsestr(msgstr.decode(encoding)).get_content_charset()
                    if charset:
                        encoding = charset
                # The entry starting here has no msgctxt line and no
                # comment of its own: do not let the previous entry's
                # context or fuzzy mark leak into it.
                msgctxt = None
                fuzzy = 0
            section = ID
            l = l[5:]
            msgid = msgstr = b''
            is_plural = False
        # This is a message with plural forms
        elif l.startswith('msgid_plural'):
            if section != ID:
                print('msgid_plural not preceded by msgid on %s:%d' % (infile, lno),
                      file=sys.stderr)
                sys.exit(1)
            l = l[12:]
            msgid += b'\0' # separator of singular and plural
            is_plural = True
        # Now we are in a msgstr section
        elif l.startswith('msgstr'):
            if msgid is None:
                print('msgstr not preceded by msgid on %s:%d' % (infile, lno),
                      file=sys.stderr)
                sys.exit(1)
            section = STR
            if l.startswith('msgstr['):
                if not is_plural:
                    print('plural without msgid_plural on %s:%d' % (infile, lno),
                          file=sys.stderr)
                    sys.exit(1)
                l = l.split(']', 1)[1]
                if msgstr:
                    msgstr += b'\0' # Separator of the various plural forms
            else:
                if is_plural:
                    print('indexed msgstr required for plural on  %s:%d' % (infile, lno),
                          file=sys.stderr)
                    sys.exit(1)
                l = l[6:]
        # Skip empty lines
        l = l.strip()
        if not l:
            continue
        # What is left is a quoted string; literal_eval unquotes it and
        # resolves its escape sequences without executing any code.
        l = ast.literal_eval(l)
        if section == CTXT:
            msgctxt += l.encode(encoding)
        elif section == ID:
            msgid += l.encode(encoding)
        elif section == STR:
            msgstr += l.encode(encoding)
        else:
            print('Syntax error on %s:%d' % (infile, lno), \
                  'before:', file=sys.stderr)
            print(l, file=sys.stderr)
            sys.exit(1)
    # Add last entry
    if section == STR:
        add(msgctxt, msgid, msgstr, fuzzy)

    # Compute output
    output = generate()

    try:
        with open(outfile, "wb") as f:
            f.write(output)
    except IOError as msg:
        # Reported but not fatal, so remaining inputs can still be processed.
        print(msg, file=sys.stderr)
216
217
def main():
    """Command-line entry point: parse options, then compile each input.

    Recognized options are -h/--help, -V/--version and -o/--output-file
    (see the module docstring).  Every remaining argument is passed to
    make() together with the chosen output file (None when -o is absent).

    Exits with status 1 on an invalid option or when no input file is
    given, and with status 0 after printing the help text or the version.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
                                   ['help', 'version', 'output-file='])
    except getopt.error as msg:
        usage(1, msg)

    outfile = None
    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-V', '--version'):
            print("msgfmt.py", __version__)
            sys.exit(0)
        elif opt in ('-o', '--output-file'):
            outfile = arg
    # do it
    if not args:
        print('No input file given', file=sys.stderr)
        print("Try `msgfmt --help' for more information.", file=sys.stderr)
        # A missing input is a usage error: signal it through the exit
        # status instead of returning as if everything had succeeded.
        sys.exit(1)

    for filename in args:
        make(filename, outfile)
244
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if '__main__' == __name__:
    main()
247