#
# Copyright 2002-2007 Zuza Software Foundation
# Copyright 2016 F Wolff
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""
Line-oriented parser for gettext PO entries.

From the GNU gettext manual:
     WHITE-SPACE
     #  TRANSLATOR-COMMENTS
     #. AUTOMATIC-COMMENTS
     #| PREVIOUS MSGID                 (Gettext 0.16)
     #: REFERENCE...
     #, FLAG...
     msgctxt CONTEXT                   (Gettext 0.15)
     msgid UNTRANSLATED-STRING
     msgstr TRANSLATED-STRING
"""

import re

SINGLE_BYTE_ENCODING = "iso-8859-1"

# Unbound-method aliases; the parser calls these as plain functions.
isspace = str.isspace
find = str.find
rfind = str.rfind
startswith = str.startswith
append = list.append
decode = bytes.decode


class ParseState:
    """One-line-lookahead cursor over an iterator of PO lines.

    ``next_line`` always holds the line that has not been consumed yet
    (``""`` once the input is exhausted), ``lineno`` counts the lines pulled
    from the iterator and ``eof`` is set when the iterator runs dry.
    """

    def __init__(self, input_iterator, UnitClass, encoding=SINGLE_BYTE_ENCODING):
        # A single-byte encoding is first defined to be able to read the header
        # without risking UnicodeDecodeErrors. As soon as the header is parsed,
        # the encoding defined in the header is used for re-encoding the header
        # and for decoding all further strings.
        self._input_iterator = input_iterator
        self.next_line = ""
        self.lineno = 0
        self.eof = False
        self.encoding = encoding
        # Prime the lookahead so next_line holds the first real line.
        self.read_line()
        self.UnitClass = UnitClass

    def decode(self, string):
        """Decode *string* with the current encoding, or return it unchanged
        when no encoding is set."""
        if self.encoding is not None:
            return decode(string, self.encoding)
        return string

    def read_line(self):
        """Consume and return the current lookahead line, then refill it.

        Whitespace-only lines are skipped while refilling. When the iterator
        is exhausted the lookahead becomes ``""`` and ``eof`` is set; further
        calls keep returning ``""``.
        """
        current = self.next_line
        if self.eof:
            return current
        try:
            self.next_line = next(self._input_iterator)
            self.lineno += 1
            while not self.eof and self.next_line.isspace():
                self.next_line = next(self._input_iterator)
                self.lineno += 1
        except StopIteration:
            self.next_line = ""
            self.eof = True
        else:
            # Only bytes input is decoded; str lines pass through untouched.
            if isinstance(self.next_line, bytes) and self.encoding is not None:
                self.next_line = decode(self.next_line, self.encoding)
        return current

    def new_input(self, _input):
        """Return a fresh ParseState over *_input* that shares this state's
        unit class and encoding."""
        return ParseState(_input, self.UnitClass, self.encoding)


def read_prevmsgid_lines(parse_state):
    """Read all consecutive lines that start with ``#|`` (or ``|``).

    These lines carry the previous msgid and msgctxt information. Everything
    up to and including the first ``|`` is removed, together with the spaces
    that follow it. Reading stops at the first line without that prefix.

    Returns the list of stripped lines (possibly empty).
    """
    prevmsgid_lines = []
    next_line = parse_state.next_line
    while startswith(next_line, ("#|", "|")):
        content = parse_state.read_line()
        # lstrip(" ") rather than indexing char by char: a bare "#|" as the
        # very last line (no trailing newline) must not raise IndexError.
        content = content[content.index("|") + 1 :].lstrip(" ")
        append(prevmsgid_lines, content)
        next_line = parse_state.next_line
    return prevmsgid_lines


def parse_prev_msgctxt(parse_state, unit):
    """Parse a previous ``msgctxt`` into ``unit.prev_msgctxt``; return whether
    that list is non-empty afterwards."""
    parse_message(parse_state, "msgctxt", 7, unit.prev_msgctxt)
    return len(unit.prev_msgctxt) > 0


def parse_prev_msgid(parse_state, unit):
    """Parse a previous ``msgid`` into ``unit.prev_msgid``; return whether
    that list is non-empty afterwards."""
    parse_message(parse_state, "msgid", 5, unit.prev_msgid)
    return len(unit.prev_msgid) > 0


def parse_prev_msgid_plural(parse_state, unit):
    """Parse a previous ``msgid_plural`` into ``unit.prev_msgid_plural``;
    return whether that list is non-empty afterwards."""
    parse_message(parse_state, "msgid_plural", 12, unit.prev_msgid_plural)
    return len(unit.prev_msgid_plural) > 0


def parse_comment(parse_state, unit):
    """Parse a single comment line (or a run of ``#|`` lines) into *unit*.

    Returns a truthy line when something was consumed, and ``None`` when the
    lookahead is not a comment or is an obsolete (``#~``) line.

    Raises:
        ValueError: a ``#|`` block was found but nothing could be parsed
            from it.
    """
    next_line = parse_state.next_line.lstrip()
    if not next_line or next_line[0] not in ("#", "|"):
        return None

    # Slice instead of index: a lone "#" at EOF without a newline has no
    # second character and must be treated as a plain translator comment.
    next_char = next_line[1:2]
    if next_char == ".":
        append(unit.automaticcomments, next_line)
    elif next_line[0] == "|" or next_char == "|":
        parsed = False
        # Read all the lines starting with #|
        prevmsgid_lines = read_prevmsgid_lines(parse_state)
        # Create a parse state object that holds these lines
        ps = parse_state.new_input(iter(prevmsgid_lines))
        # Parse the msgctxt if any
        parsed |= parse_prev_msgctxt(ps, unit)
        # Parse the msgid if any
        parsed |= parse_prev_msgid(ps, unit)
        # Parse the msgid_plural if any
        parsed |= parse_prev_msgid_plural(ps, unit)
        # Fail with error in case nothing was parsed
        if not parsed:
            raise ValueError(f"Syntax error on line {parse_state.lineno}")
        # The #| lines were already consumed above, so hand back the
        # lookahead instead of reading another line.
        return parse_state.next_line
    elif next_char == ":":
        append(unit.sourcecomments, next_line)
    elif next_char == ",":
        append(unit.typecomments, next_line)
    elif next_char == "~":
        # Special case: we refuse to parse obsoletes: they are done
        # elsewhere to ensure we reuse the normal unit parsing code
        return None
    else:
        append(unit.othercomments, next_line)
    return parse_state.read_line()


def parse_comments(parse_state, unit):
    """Parse every leading comment line into *unit*.

    Returns ``True`` if at least one comment was parsed, otherwise ``None``.
    """
    if not parse_comment(parse_state, unit):
        return None
    while parse_comment(parse_state, unit):
        pass
    return True


def read_obsolete_lines(parse_state):
    """Read all the lines belonging to the current unit if obsolete.

    The ``#~`` marker and the whitespace after it are removed from each
    line. Returns the list of stripped lines (possibly empty).
    """
    obsolete_lines = []
    next_line = parse_state.next_line
    while startswith(next_line, "#~"):
        content = parse_state.read_line()[2:].lstrip()
        append(obsolete_lines, content)
        next_line = parse_state.next_line
        if startswith(content, "msgstr"):
            # now we saw a msgstr, so we need to become more conservative to
            # avoid parsing into the following unit
            while startswith(next_line, ('#~ "', "#~ msgstr")):
                content = parse_state.read_line()[3:]
                append(obsolete_lines, content)
                next_line = parse_state.next_line
            break
    return obsolete_lines


def parse_obsolete(parse_state, unit):
    """Parse an obsolete (``#~``) entry into *unit*.

    The stripped obsolete lines are re-parsed with the regular unit parser.
    Returns the parsed unit after calling its ``makeobsolete()``, or ``None``
    when the lookahead is not an obsolete entry or nothing could be parsed.
    """
    obsolete_lines = read_obsolete_lines(parse_state)
    if not obsolete_lines:
        return None
    unit = parse_unit(parse_state.new_input(iter(obsolete_lines)), unit)
    if unit is not None:
        unit.makeobsolete()
    return unit


def parse_quoted(parse_state, start_pos=0):
    """Consume the lookahead line if it holds a quoted string.

    The first ``"`` must sit at *start_pos* or be separated from it only by
    whitespace. Returns the text from the first to the last ``"`` inclusive,
    or ``None`` (without consuming anything) when the line does not qualify.

    Raises:
        ValueError: the opening quote has no distinct closing quote.
    """
    line = parse_state.next_line
    left = find(line, '"', start_pos)
    if left == start_pos or isspace(line[start_pos:left]):
        right = rfind(line, '"')
        if left != right:
            return parse_state.read_line()[left : right + 1]
        raise ValueError("end-of-line within string")
    return None


def parse_msg_comment(parse_state, msg_comment_list, string):
    """Collect the quoted strings of a ``_:`` message comment.

    Strings are appended to *msg_comment_list* until one contains an escaped
    newline (backslash followed by ``n``). Returns the next quoted string
    after the comment, or ``None`` when the quoted strings run out.
    """
    while string is not None:
        append(msg_comment_list, string)
        if find(string, "\\n") > -1:
            return parse_quoted(parse_state)
        string = parse_quoted(parse_state)
    return None


def parse_multiple_quoted(parse_state, msg_list, msg_comment_list, first_start_pos=0):
    """Read consecutive quoted strings into *msg_list*.

    When *msg_comment_list* is given, strings starting with ``"_:`` are routed
    to it through parse_msg_comment instead. Only the first line is searched
    from *first_start_pos*; continuation lines are searched from column 0.
    """
    string = parse_quoted(parse_state, first_start_pos)
    while string is not None:
        if msg_comment_list is None or not startswith(string, '"_:'):
            append(msg_list, string)
            string = parse_quoted(parse_state)
        else:
            string = parse_msg_comment(parse_state, msg_comment_list, string)


def parse_message(
    parse_state, start_of_string, start_of_string_len, msg_list, msg_comment_list=None
):
    """Parse a keyword line plus its continuation lines into *msg_list*.

    Nothing happens unless the lookahead starts with *start_of_string*.
    *start_of_string_len* is the column where the search for the opening
    quote begins. Always returns ``None``.
    """
    if startswith(parse_state.next_line, start_of_string):
        return parse_multiple_quoted(
            parse_state, msg_list, msg_comment_list, start_of_string_len
        )


def parse_msgctxt(parse_state, unit):
    """Parse ``msgctxt`` into ``unit.msgctxt``; return whether that list is
    non-empty afterwards."""
    parse_message(parse_state, "msgctxt", 7, unit.msgctxt)
    return len(unit.msgctxt) > 0


def parse_msgid(parse_state, unit):
    """Parse ``msgid`` into ``unit.msgid`` / ``unit.msgidcomments``; return
    whether either list is non-empty afterwards."""
    parse_message(parse_state, "msgid", 5, unit.msgid, unit.msgidcomments)
    return len(unit.msgid) > 0 or len(unit.msgidcomments) > 0


def parse_msgstr(parse_state, unit):
    """Parse a singular ``msgstr`` into ``unit.msgstr``; return whether that
    list is non-empty afterwards."""
    parse_message(parse_state, "msgstr", 6, unit.msgstr)
    return len(unit.msgstr) > 0


def parse_msgid_plural(parse_state, unit):
    """Parse ``msgid_plural`` into ``unit.msgid_plural`` /
    ``unit.msgid_pluralcomments``; return whether either list is non-empty
    afterwards."""
    parse_message(
        parse_state, "msgid_plural", 12, unit.msgid_plural, unit.msgid_pluralcomments
    )
    return len(unit.msgid_plural) > 0 or len(unit.msgid_pluralcomments) > 0


MSGSTR_ARRAY_ENTRY_LEN = len("msgstr[")


def add_to_dict(msgstr_dict, line, right_bracket_pos, entry):
    """Extend ``msgstr_dict[index]`` with *entry*, where the index is the
    integer between ``msgstr[`` and the ``]`` at *right_bracket_pos* in
    *line*."""
    index = int(line[MSGSTR_ARRAY_ENTRY_LEN:right_bracket_pos])
    msgstr_dict.setdefault(index, []).extend(entry)


def get_entry(parse_state, right_bracket_pos):
    """Return the quoted strings of one ``msgstr[N]`` entry as a list (empty
    when the lookahead is not such an entry)."""
    entry = []
    parse_message(parse_state, "msgstr[", right_bracket_pos + 1, entry)
    return entry


def parse_msgstr_array_entry(parse_state, msgstr_dict):
    """Parse one ``msgstr[N]`` entry into *msgstr_dict*.

    Returns ``True`` when an entry was parsed and stored, else ``False``.
    """
    line = parse_state.next_line
    right_bracket_pos = find(line, "]", MSGSTR_ARRAY_ENTRY_LEN)
    if right_bracket_pos < 0:
        return False
    entry = get_entry(parse_state, right_bracket_pos)
    if not entry:
        return False
    # ``line`` was captured before get_entry consumed it.
    add_to_dict(msgstr_dict, line, right_bracket_pos, entry)
    return True


def parse_msgstr_array(parse_state, unit):
    """Parse all consecutive ``msgstr[N]`` entries into a dict and store it
    as ``unit.msgstr``.

    Returns ``False`` (leaving ``unit.msgstr`` untouched) when not even one
    entry could be parsed, otherwise ``True``.
    """
    msgstr_dict = {}
    if not parse_msgstr_array_entry(parse_state, msgstr_dict):
        # We require at least one result
        return False
    while parse_msgstr_array_entry(parse_state, msgstr_dict):
        pass
    unit.msgstr = msgstr_dict
    return True


def parse_plural(parse_state, unit):
    """Parse ``msgid_plural`` followed by a ``msgstr[N]`` array; return
    whether both parts were found."""
    return bool(
        parse_msgid_plural(parse_state, unit) and parse_msgstr_array(parse_state, unit)
    )


def parse_msg_entries(parse_state, unit):
    """Parse the optional ``msgctxt``, then ``msgid`` followed by either a
    singular ``msgstr`` or a plural block; return whether that succeeded."""
    parse_msgctxt(parse_state, unit)
    return bool(
        parse_msgid(parse_state, unit)
        and (parse_msgstr(parse_state, unit) or parse_plural(parse_state, unit))
    )


def parse_unit(parse_state, unit=None):
    """Parse one unit (comments, then obsolete or regular message entries).

    A new ``parse_state.UnitClass()`` is created when *unit* is falsy.
    Returns the unit, or ``None`` when neither comments nor message entries
    could be parsed.
    """
    unit = unit or parse_state.UnitClass()
    parsed_comments = parse_comments(parse_state, unit)
    obsolete_unit = parse_obsolete(parse_state, unit)
    if obsolete_unit is not None:
        return obsolete_unit
    parsed_msg_entries = parse_msg_entries(parse_state, unit)
    if parsed_comments or parsed_msg_entries:
        return unit
    return None


def set_encoding(parse_state, store, unit):
    """Set ``store._encoding`` and ``parse_state.encoding`` from the header.

    The charset is taken from ``charset=...`` in the header unit's msgstr.
    ``utf-8`` is used when msgstr is not a non-empty list of str, when no
    charset is declared, or when the charset is the ``CHARSET`` placeholder.
    """
    encoding = "utf-8"
    msgstr = unit.msgstr
    if isinstance(msgstr, list) and msgstr and isinstance(msgstr[0], str):
        # Stop at whitespace, at the backslash of the escaped newline, or at
        # the closing quote. The letter "n" must NOT be a terminator, or
        # charsets such as "windows-1252" and "latin1" get truncated.
        charset = re.search(r'charset=([^\s\\"]+)', "".join(msgstr))
        if charset and charset.group(1) != "CHARSET":
            encoding = charset.group(1)
    store._encoding = encoding
    parse_state.encoding = encoding


def decode_list(lst, decode):
    """Re-encode every str in *lst* with SINGLE_BYTE_ENCODING and pass the
    resulting bytes through *decode*; return the new list."""
    return [decode(item.encode(SINGLE_BYTE_ENCODING)) for item in lst]


def decode_header(unit, decode):
    """
    The header has been arbitrarily decoded with a single-byte encoding. We
    re-encode it to decode values with the proper encoding defined in the header
    (using decode_list above).
    """
    for attr in (
        "msgctxt",
        "msgid",
        "msgid_pluralcomments",
        "msgid_plural",
        "msgstr",
        "othercomments",
        "automaticcomments",
        "sourcecomments",
        "typecomments",
        "msgidcomments",
    ):
        element = getattr(unit, attr)
        if isinstance(element, list):
            setattr(unit, attr, decode_list(element, decode))
        else:
            # Anything that is not a list is treated as a mapping of lists
            # (parse_msgstr_array stores plural msgstr as a dict).
            setattr(
                unit,
                attr,
                {key: decode_list(value, decode) for key, value in element.items()},
            )


def parse_header(parse_state, store):
    """Parse the first unit, switch to the encoding it declares and re-decode
    what was already read.

    Returns the first unit, or ``None`` when no unit could be parsed.
    """
    first_unit = parse_unit(parse_state)
    if first_unit is None:
        return None
    set_encoding(parse_state, store, first_unit)
    decode_header(first_unit, parse_state.decode)
    # Fix encoding of next line in parser
    # It was originally parsed with SINGLE_BYTE_ENCODING
    # but we need to convert it to actual encoding
    parse_state.next_line = parse_state.decode(
        parse_state.next_line.encode(SINGLE_BYTE_ENCODING)
    )
    return first_unit


def parse_units(parse_state, store):
    """Parse every unit from *parse_state* and add it to *store*.

    Each unit has ``infer_state()`` called on it before ``store.addunit``.

    Raises:
        ValueError: parsing stopped before the end of the input.
    """
    unit = parse_header(parse_state, store)
    while unit:
        unit.infer_state()
        store.addunit(unit)
        unit = parse_unit(parse_state)
    if not parse_state.eof:
        raise ValueError(f"Syntax error on line {parse_state.lineno}")