1#
2# Copyright 2002-2007 Zuza Software Foundation
3# Copyright 2016 F Wolff
4#
5# This file is part of translate.
6#
7# translate is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# translate is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, see <http://www.gnu.org/licenses/>.
19
20import re
21
22
23"""
24From the GNU gettext manual:
25     WHITE-SPACE
26     #  TRANSLATOR-COMMENTS
27     #. AUTOMATIC-COMMENTS
     #| PREVIOUS MSGID                 (Gettext 0.16 - gettext itself writes these after the flags; this parser accepts comment lines in any order)
29     #: REFERENCE...
30     #, FLAG...
31     msgctxt CONTEXT                   (Gettext 0.15)
32     msgid UNTRANSLATED-STRING
33     msgstr TRANSLATED-STRING
34"""
35
# Encoding used to decode input lines until the header has been parsed; the
# header is later re-encoded with it so that it can be decoded again with the
# charset the header declares (see decode_list / parse_header).
SINGLE_BYTE_ENCODING = "iso-8859-1"
# Unbound str/list/bytes methods exposed as module-level functions. The parsing
# functions call them in function form, e.g. startswith(line, "#~") or
# append(some_list, item).
isspace = str.isspace
find = str.find
rfind = str.rfind
startswith = str.startswith
append = list.append
decode = bytes.decode
43
44
class ParseState:
    """Cursor over an iterator of PO lines with one line of look-ahead.

    ``next_line`` always holds the line the next :meth:`read_line` call will
    hand out, ``lineno`` counts the lines pulled from the iterator so far and
    ``eof`` turns True once the iterator is exhausted. Lines made up only of
    whitespace are skipped while reading ahead.
    """

    def __init__(self, input_iterator, UnitClass, encoding=SINGLE_BYTE_ENCODING):
        # A single-byte encoding is used at first so that the header can be
        # read without risking UnicodeDecodeErrors. Once the header has been
        # parsed, the encoding it declares is used to re-decode the header and
        # to decode every following line.
        self._input_iterator = input_iterator
        self.UnitClass = UnitClass
        self.encoding = encoding
        self.lineno = 0
        self.eof = False
        self.next_line = ""
        # Prime the look-ahead with the first meaningful line.
        self.read_line()

    def decode(self, string):
        """Decode ``string`` with the current encoding, if one is set."""
        if self.encoding is None:
            return string
        return decode(string, self.encoding)

    def read_line(self):
        """Return the current look-ahead line and advance to the next one.

        After the end of input has been reached, the look-ahead is the empty
        string and it is returned unchanged on every further call.
        """
        previous = self.next_line
        if self.eof:
            return previous
        source = self._input_iterator
        try:
            while True:
                self.next_line = next(source)
                self.lineno += 1
                # Keep pulling lines as long as they are whitespace only.
                if self.eof or not self.next_line.isspace():
                    break
        except StopIteration:
            self.eof = True
            self.next_line = ""
            return previous
        upcoming = self.next_line
        if isinstance(upcoming, bytes) and self.encoding is not None:
            self.next_line = decode(upcoming, self.encoding)
        return previous

    def new_input(self, _input):
        """Create a fresh state over ``_input`` sharing unit class and encoding."""
        unit_class = self.UnitClass
        current_encoding = self.encoding
        return ParseState(_input, unit_class, current_encoding)
84
85
def read_prevmsgid_lines(parse_state):
    """Read the consecutive run of lines starting with ``#|`` (or ``|``).

    These lines carry the previous msgctxt/msgid information. Each line is
    consumed from ``parse_state`` and stored without its leading marker:
    everything up to and including the first ``|`` is dropped, together with
    any spaces that directly follow it. Reading stops at the first line that
    does not start with ``#|`` or ``|``.

    A line that ends right after the marker (for example a final ``#|`` with
    no trailing newline) yields an empty string instead of failing.

    :param parse_state: object exposing ``next_line`` and ``read_line()``.
    :return: list of the collected lines, markers removed.
    """
    prevmsgid_lines = []
    while parse_state.next_line.startswith(("#|", "|")):
        content = parse_state.read_line()
        # lstrip(" ") removes only spaces (the newline is kept) and is safe
        # when nothing follows the bar.
        remainder = content[content.index("|") + 1 :].lstrip(" ")
        prevmsgid_lines.append(remainder)
    return prevmsgid_lines
102
103
def parse_prev_msgctxt(parse_state, unit):
    """Parse a ``msgctxt`` entry into ``unit.prev_msgctxt``.

    Returns True when ``unit.prev_msgctxt`` is non-empty afterwards.
    """
    keyword = "msgctxt"
    parse_message(parse_state, keyword, len(keyword), unit.prev_msgctxt)
    return len(unit.prev_msgctxt) != 0
107
108
def parse_prev_msgid(parse_state, unit):
    """Parse a ``msgid`` entry into ``unit.prev_msgid``.

    Returns True when ``unit.prev_msgid`` is non-empty afterwards.
    """
    keyword = "msgid"
    parse_message(parse_state, keyword, len(keyword), unit.prev_msgid)
    return len(unit.prev_msgid) != 0
112
113
def parse_prev_msgid_plural(parse_state, unit):
    """Parse a ``msgid_plural`` entry into ``unit.prev_msgid_plural``.

    Returns True when ``unit.prev_msgid_plural`` is non-empty afterwards.
    """
    keyword = "msgid_plural"
    parse_message(parse_state, keyword, len(keyword), unit.prev_msgid_plural)
    return len(unit.prev_msgid_plural) != 0
117
118
def parse_comment(parse_state, unit):
    """Consume one comment line (or one whole ``#|`` group) into ``unit``.

    The look-ahead line, stripped of leading whitespace, is classified by the
    character that follows its first ``#`` or ``|``:

    * ``.`` -- appended to ``unit.automaticcomments``
    * ``|`` (or a line that itself starts with ``|``) -- the whole run of
      previous-message lines is read and parsed into ``unit.prev_msgctxt``,
      ``unit.prev_msgid`` and ``unit.prev_msgid_plural``
    * ``:`` -- appended to ``unit.sourcecomments``
    * ``,`` -- appended to ``unit.typecomments``
    * ``~`` -- left unconsumed; obsolete entries are parsed elsewhere
    * anything else, including nothing at all -- ``unit.othercomments``

    :return: the consumed line for ordinary comments, the new look-ahead line
        after a ``#|`` group (an empty string at end of input), or None when
        the look-ahead is not a comment or is an obsolete (``#~``) line.
    :raises ValueError: when a ``#|`` group contains nothing parseable.
    """
    next_line = parse_state.next_line.lstrip()
    if not next_line or next_line[0] not in ("#", "|"):
        return None
    # Slice instead of indexing: a bare "#" or "|" (e.g. the last line of a
    # file without a trailing newline) has no second character.
    next_char = next_line[1:2]
    if next_char == ".":
        unit.automaticcomments.append(next_line)
    elif next_line[0] == "|" or next_char == "|":
        # Read all the lines starting with #|
        prevmsgid_lines = read_prevmsgid_lines(parse_state)
        # Create a parse state object that holds these lines
        ps = parse_state.new_input(iter(prevmsgid_lines))
        # Parse the msgctxt, msgid and msgid_plural, if any; all three parsers
        # must run, in this order, regardless of the earlier results.
        parsed = parse_prev_msgctxt(ps, unit)
        parsed |= parse_prev_msgid(ps, unit)
        parsed |= parse_prev_msgid_plural(ps, unit)
        # Fail with an error in case nothing was parsed
        if not parsed:
            raise ValueError(f"Syntax error on line {parse_state.lineno}")
        return parse_state.next_line
    elif next_char == ":":
        unit.sourcecomments.append(next_line)
    elif next_char == ",":
        unit.typecomments.append(next_line)
    elif next_char == "~":
        # Special case: we refuse to parse obsoletes: they are done
        # elsewhere to ensure we reuse the normal unit parsing code
        return None
    else:
        unit.othercomments.append(next_line)
    return parse_state.read_line()
154
155
def parse_comments(parse_state, unit):
    """Consume every leading comment line into ``unit``.

    Returns True when at least one call to ``parse_comment`` succeeded and
    None otherwise.
    """
    found_any = False
    while parse_comment(parse_state, unit):
        found_any = True
    return True if found_any else None
162
163
def read_obsolete_lines(parse_state):
    """Collect the ``#~`` lines that make up the current obsolete unit.

    Each collected line has its ``#~`` marker and the whitespace after it
    removed. Once a ``msgstr`` line has been seen, only its quoted
    continuation lines and further ``#~ msgstr`` lines are accepted.
    """
    collected = []
    while startswith(parse_state.next_line, "#~"):
        stripped = parse_state.read_line()[2:].lstrip()
        append(collected, stripped)
        if not startswith(stripped, "msgstr"):
            continue
        # A msgstr has been seen, so from here on be conservative to avoid
        # reading into the unit that follows.
        while startswith(parse_state.next_line, ('#~ "', "#~ msgstr")):
            append(collected, parse_state.read_line()[3:])
        break
    return collected
181
182
def parse_obsolete(parse_state, unit):
    """Parse an obsolete (``#~``) unit, if one starts at the look-ahead line.

    The obsolete lines are read with their markers removed and then run
    through the regular unit parser on a nested parse state. Returns the
    parsed unit after calling ``makeobsolete()`` on it, or None when there
    are no obsolete lines or nothing could be parsed from them.
    """
    obsolete_lines = read_obsolete_lines(parse_state)
    if not obsolete_lines:
        return None
    nested_state = parse_state.new_input(iter(obsolete_lines))
    parsed = parse_unit(nested_state, unit)
    if parsed is None:
        return None
    parsed.makeobsolete()
    return parsed
191
192
def parse_quoted(parse_state, start_pos=0):
    """Extract the double-quoted string found on the look-ahead line.

    The first quote at or after ``start_pos`` must sit exactly at
    ``start_pos`` or be separated from it by whitespace only. On success the
    line is consumed and the text from that quote through the last quote on
    the line (both included) is returned. Returns None, leaving the line
    unconsumed, when the quote test fails.

    :raises ValueError: when the opening quote has no distinct closing quote.
    """
    line = parse_state.next_line
    opening = find(line, '"', start_pos)
    if opening != start_pos and not isspace(line[start_pos:opening]):
        return None
    closing = rfind(line, '"')
    if opening == closing:
        raise ValueError("end-of-line within string")
    return parse_state.read_line()[opening : closing + 1]
202
203
def parse_msg_comment(parse_state, msg_comment_list, string):
    """Append the quoted strings of a message comment to ``msg_comment_list``.

    Starting with ``string``, each quoted string is appended until one that
    contains a literal backslash-n sequence has been stored. Returns the
    quoted string that follows the comment, or None when the quoted strings
    run out.
    """
    fragment = string
    while fragment is not None:
        append(msg_comment_list, fragment)
        ends_comment = find(fragment, "\\n") > -1
        fragment = parse_quoted(parse_state)
        if ends_comment:
            return fragment
    return None
211
212
def parse_multiple_quoted(parse_state, msg_list, msg_comment_list, first_start_pos=0):
    """Read consecutive quoted strings into ``msg_list``.

    The first string is searched for from ``first_start_pos``; the following
    ones from the start of their line. When ``msg_comment_list`` is given,
    strings starting with ``"_:`` are routed to it via ``parse_msg_comment``
    instead of being stored in ``msg_list``.
    """
    collect_comments = msg_comment_list is not None
    fragment = parse_quoted(parse_state, first_start_pos)
    while fragment is not None:
        if collect_comments and startswith(fragment, '"_:'):
            fragment = parse_msg_comment(parse_state, msg_comment_list, fragment)
            continue
        append(msg_list, fragment)
        fragment = parse_quoted(parse_state)
221
222
def parse_message(
    parse_state, start_of_string, start_of_string_len, msg_list, msg_comment_list=None
):
    """Parse one keyword entry if the look-ahead line starts with the keyword.

    ``start_of_string`` is the keyword to look for and
    ``start_of_string_len`` the offset from which the first quoted string is
    searched. Parsed strings go to ``msg_list`` (and, when given,
    ``msg_comment_list``). Nothing happens when the keyword does not match.
    """
    if not startswith(parse_state.next_line, start_of_string):
        return None
    return parse_multiple_quoted(
        parse_state, msg_list, msg_comment_list, start_of_string_len
    )
230
231
def parse_msgctxt(parse_state, unit):
    """Parse a ``msgctxt`` entry into ``unit.msgctxt``.

    Returns True when ``unit.msgctxt`` is non-empty afterwards.
    """
    keyword = "msgctxt"
    parse_message(parse_state, keyword, len(keyword), unit.msgctxt)
    return len(unit.msgctxt) != 0
235
236
def parse_msgid(parse_state, unit):
    """Parse a ``msgid`` entry into ``unit.msgid`` and ``unit.msgidcomments``.

    Returns True when either of the two lists is non-empty afterwards.
    """
    keyword = "msgid"
    parse_message(parse_state, keyword, len(keyword), unit.msgid, unit.msgidcomments)
    if len(unit.msgid) > 0:
        return True
    return len(unit.msgidcomments) > 0
240
241
def parse_msgstr(parse_state, unit):
    """Parse a ``msgstr`` entry into ``unit.msgstr``.

    Returns True when ``unit.msgstr`` is non-empty afterwards.
    """
    keyword = "msgstr"
    parse_message(parse_state, keyword, len(keyword), unit.msgstr)
    return len(unit.msgstr) != 0
245
246
def parse_msgid_plural(parse_state, unit):
    """Parse a ``msgid_plural`` entry into the unit's plural lists.

    Strings go to ``unit.msgid_plural`` and ``unit.msgid_pluralcomments``.
    Returns True when either of the two lists is non-empty afterwards.
    """
    keyword = "msgid_plural"
    parse_message(
        parse_state,
        keyword,
        len(keyword),
        unit.msgid_plural,
        unit.msgid_pluralcomments,
    )
    if len(unit.msgid_plural) > 0:
        return True
    return len(unit.msgid_pluralcomments) > 0
252
253
# Length of the "msgstr[" prefix, i.e. the offset at which the plural index
# starts in a "msgstr[N]" line.
MSGSTR_ARRAY_ENTRY_LEN = len("msgstr[")
255
256
def add_to_dict(msgstr_dict, line, right_bracket_pos, entry):
    """Store ``entry`` under the plural index found in ``line``.

    The index is the integer between ``msgstr[`` and the closing bracket at
    ``right_bracket_pos``. Strings are appended to any already stored for
    that index.
    """
    plural_index = int(line[MSGSTR_ARRAY_ENTRY_LEN:right_bracket_pos])
    msgstr_dict.setdefault(plural_index, []).extend(entry)
262
263
def get_entry(parse_state, right_bracket_pos):
    """Return the quoted strings of the ``msgstr[N]`` entry at the look-ahead.

    The first string is searched for just past the closing bracket. The
    result is an empty list when the line is not a ``msgstr[`` entry.
    """
    strings = []
    value_start = right_bracket_pos + 1
    parse_message(parse_state, "msgstr[", value_start, strings)
    return strings
268
269
def parse_msgstr_array_entry(parse_state, msgstr_dict):
    """Parse one ``msgstr[N]`` entry into ``msgstr_dict``.

    Returns True when an entry with at least one string was read and stored,
    False otherwise.
    """
    line = parse_state.next_line
    closing = find(line, "]", MSGSTR_ARRAY_ENTRY_LEN)
    if closing < 0:
        return False
    strings = get_entry(parse_state, closing)
    if not strings:
        return False
    add_to_dict(msgstr_dict, line, closing, strings)
    return True
280
281
def parse_msgstr_array(parse_state, unit):
    """Parse consecutive ``msgstr[N]`` entries into ``unit.msgstr``.

    At least one entry is required. On success ``unit.msgstr`` is replaced
    by a dict mapping each plural index to its list of strings and True is
    returned; otherwise the unit is left untouched and False is returned.
    """
    plural_forms = {}
    entries_read = 0
    while parse_msgstr_array_entry(parse_state, plural_forms):
        entries_read += 1
    if entries_read == 0:
        return False
    unit.msgstr = plural_forms
    return True
291
292
def parse_plural(parse_state, unit):
    """Parse ``msgid_plural`` followed by its ``msgstr[N]`` entries.

    Returns True only when both parts were parsed.
    """
    if not parse_msgid_plural(parse_state, unit):
        return False
    return bool(parse_msgstr_array(parse_state, unit))
297
298
def parse_msg_entries(parse_state, unit):
    """Parse the message part of a unit.

    An optional ``msgctxt`` is read first. A ``msgid`` must follow, and
    after it either a plain ``msgstr`` or a plural block. Returns True when
    the msgid and one of the two translation forms were parsed.
    """
    parse_msgctxt(parse_state, unit)
    if not parse_msgid(parse_state, unit):
        return False
    if parse_msgstr(parse_state, unit):
        return True
    return bool(parse_plural(parse_state, unit))
305
306
def parse_unit(parse_state, unit=None):
    """Parse the next unit from ``parse_state``.

    A new ``parse_state.UnitClass()`` instance is created unless a truthy
    ``unit`` is passed in. Leading comments are parsed first; if an obsolete
    unit follows, it is returned directly. Otherwise the message entries are
    parsed. Returns the unit when comments or message entries were found and
    None when nothing was parsed.
    """
    if not unit:
        unit = parse_state.UnitClass()
    has_comments = parse_comments(parse_state, unit)
    obsolete = parse_obsolete(parse_state, unit)
    if obsolete is not None:
        return obsolete
    has_entries = parse_msg_entries(parse_state, unit)
    return unit if (has_comments or has_entries) else None
317
318
def set_encoding(parse_state, store, unit):
    """Set the store and parser encoding from the header unit's charset.

    The ``charset=...`` value is looked up in the joined ``unit.msgstr``
    strings, provided ``unit.msgstr`` is a non-empty list whose first item
    is a ``str``. The result is written to ``store._encoding`` and then
    copied to ``parse_state.encoding``. ``"utf-8"`` is used when no charset
    is found or when it is still the ``CHARSET`` template placeholder.

    :param parse_state: parse state whose ``encoding`` is updated.
    :param store: store whose ``_encoding`` is updated.
    :param unit: header unit that provides ``msgstr``.
    """
    encoding = "utf-8"
    msgstr = unit.msgstr
    if isinstance(msgstr, list) and msgstr and isinstance(msgstr[0], str):
        # The value ends at whitespace, at the backslash of the literal "\n"
        # escape, or at the closing quote of the header line. A plain "n" must
        # NOT end it, otherwise names like "windows-1252" are cut short.
        match = re.search(r'charset=([^\s\\"]+)', "".join(msgstr))
        if match and match.group(1) != "CHARSET":
            encoding = match.group(1)
    store._encoding = encoding
    parse_state.encoding = store._encoding
336
337
def decode_list(lst, decode):
    """Re-encode each string with the single-byte encoding and decode it.

    ``decode`` is called with the bytes of every item; the results are
    returned as a new list in the same order.
    """
    decoded = []
    for item in lst:
        raw = item.encode(SINGLE_BYTE_ENCODING)
        decoded.append(decode(raw))
    return decoded
340
341
def decode_header(unit, decode):
    """
    Re-decode the string attributes of the header unit.

    The header was first decoded with a single-byte encoding. Each listed
    attribute is re-encoded and passed through ``decode`` (see decode_list).
    Attributes holding a list are replaced by a decoded list; any other
    attribute is treated as a mapping of lists and replaced by a dict with
    decoded values.
    """
    header_attrs = (
        "msgctxt",
        "msgid",
        "msgid_pluralcomments",
        "msgid_plural",
        "msgstr",
        "othercomments",
        "automaticcomments",
        "sourcecomments",
        "typecomments",
        "msgidcomments",
    )
    for name in header_attrs:
        value = getattr(unit, name)
        if isinstance(value, list):
            decoded = decode_list(value, decode)
        else:
            decoded = {
                key: decode_list(strings, decode) for key, strings in value.items()
            }
        setattr(unit, name, decoded)
369
370
def parse_header(parse_state, store):
    """Parse the first unit and switch the parser to its declared encoding.

    Returns the first unit with its strings re-decoded, or None when no unit
    could be parsed.
    """
    header = parse_unit(parse_state)
    if header is None:
        return None
    set_encoding(parse_state, store, header)
    decode_header(header, parse_state.decode)
    # The look-ahead line was already read with SINGLE_BYTE_ENCODING, so it
    # has to be converted to the actual encoding as well.
    raw_lookahead = parse_state.next_line.encode(SINGLE_BYTE_ENCODING)
    parse_state.next_line = parse_state.decode(raw_lookahead)
    return header
384
385
def parse_units(parse_state, store):
    """Parse every unit from ``parse_state`` and add it to ``store``.

    The header unit is parsed first; each unit has ``infer_state()`` called
    on it before being added. Raises ValueError when parsing stops before
    the end of input is reached.
    """
    current = parse_header(parse_state, store)
    while current:
        current.infer_state()
        store.addunit(current)
        current = parse_unit(parse_state)
    if parse_state.eof:
        return
    raise ValueError(f"Syntax error on line {parse_state.lineno}")
394