#
# Copyright 2002-2006 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""String processing utilities for extracting strings with various kinds of
delimiters
"""

import html.entities
import logging
import re
import unicodedata


def find_all(searchin, substr):
    """Return the positions at which ``substr`` occurs in ``searchin``.

    Matches are not allowed to overlap: after a match the search resumes
    ``len(substr)`` characters further on.

    :param searchin: String to search in
    :param substr: String to search for
    :return: List of start indexes in ascending order. An empty ``substr``
        yields an empty list (searching for it would never advance).
    """
    if not substr:
        return []
    step = len(substr)
    locations = []
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        location = searchin.find(substr, location + step)
    return locations


def _find_delimiters(source, startdelim, enddelim, escape):
    """Locate the unescaped delimiters (and the effective escapes) in source.

    :return: Tuple ``(startdelim_places, enddelim_places, true_escape_places)``
        where ``startdelim_places`` holds the index of each unescaped start
        delimiter, ``enddelim_places`` holds the index just *past* each
        unescaped end delimiter, and ``true_escape_places`` holds the index of
        each escape that is not itself escaped (empty if ``escape`` is None).
    """
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    true_escape_places = []
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        known_escapes = set(escape_places)
        # Filter escaped escapes: in a run of adjacent escapes every second
        # one is the payload of the one before it.
        true_escape = False
        for escape_pos in escape_places:
            if escape_pos - lenescape in known_escapes:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.append(escape_pos)
        # A delimiter is escaped when a true escape ends right where it starts.
        escaped_places = {pos + lenescape for pos in true_escape_places}
        startdelim_places = [
            pos for pos in startdelim_places if pos not in escaped_places
        ]
        enddelim_places = [pos for pos in enddelim_places if pos not in escaped_places]
    lenend = len(enddelim)
    enddelim_places = [pos + lenend for pos in enddelim_places]
    return startdelim_places, enddelim_places, true_escape_places


def extract(
    source, startdelim, enddelim, escape=None, startinstring=False, allowreentry=True
):
    """Extract the delimited sections of ``source``, delimiters included.

    :param source: String to extract from
    :param startdelim: Delimiter that opens a section
    :param enddelim: Delimiter that closes a section (may equal startdelim)
    :param escape: Optional escape string; a delimiter directly preceded by an
        unescaped escape is ignored
    :param startinstring: Whether ``source`` starts inside a section
    :param allowreentry: Whether a new section may be opened after the first
        one has been entered
    :return: Tuple of (concatenated sections with their delimiters, whether
        the end of ``source`` is still inside a section)
    """
    # Note that this returns the quote characters as well... even internally
    lenstart = len(startdelim)
    startdelim_places, enddelim_places, _ = _find_delimiters(
        source, startdelim, enddelim, escape
    )
    # Sets give constant-time membership tests inside the loop below.
    start_set = set(startdelim_places)
    end_set = set(enddelim_places)
    # Get a sorted list of the significant places in the string. Duplicates
    # are kept on purpose: a position can be visited more than once.
    significant_places = sorted(
        [0] + startdelim_places + enddelim_places + [len(source) - 1]
    )
    instring = startinstring
    enteredonce = False
    pieces = []
    lastpos = None
    for pos in significant_places:
        if instring and pos in end_set:
            # Make sure that if startdelim == enddelim we don't get confused
            # and count the same string as start and end.
            if lastpos == pos - lenstart and lastpos in start_set:
                continue
            pieces.append(source[lastpos:pos])
            instring = False
            lastpos = pos
        if not instring and pos in start_set and (allowreentry or not enteredonce):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # Unterminated section: take everything up to the end of source.
        pieces.append(source[lastpos:])
    return ("".join(pieces), instring)


def extractwithoutquotes(
    source,
    startdelim,
    enddelim,
    escape=None,
    startinstring=False,
    includeescapes=True,
    allowreentry=True,
):
    """Extract the delimited sections of ``source`` without their delimiters.

    :param source: String to extract from
    :param startdelim: Delimiter that opens a section
    :param enddelim: Delimiter that closes a section (may equal startdelim)
    :param escape: Optional escape string; a delimiter directly preceded by an
        unescaped escape is ignored
    :param startinstring: Whether ``source`` starts inside a section
    :param includeescapes: If true, escapes are left in the result; if false,
        the escape string is removed and the escaped character kept. It can
        also be a function that takes the whole escaped string (the escape
        plus the following character) and returns the replaced version.
    :param allowreentry: Whether a new section may be opened after the first
        one has been entered
    :return: Tuple of (concatenated section contents, whether the end of
        ``source`` is still inside a section)
    """
    lenstart = len(startdelim)
    lenend = len(enddelim)
    lenescape = len(escape) if escape is not None else 0
    startdelim_places, enddelim_places, true_escape_places = _find_delimiters(
        source, startdelim, enddelim, escape
    )
    # Sets give constant-time membership tests inside the loop below.
    start_set = set(startdelim_places)
    end_set = set(enddelim_places)
    # Get a sorted list of the significant places in the string. Duplicates
    # are kept on purpose: a position can be visited more than once.
    significant_places = sorted(
        [0] + startdelim_places + enddelim_places + [len(source) - 1]
    )
    instring = startinstring
    enteredonce = False
    extracted = ""
    # NOTE(review): with startinstring=True and no start delimiter seen yet,
    # the first section is still taken from lastpos + len(startdelim) --
    # confirm that callers rely on this.
    lastpos = 0
    callable_includeescapes = callable(includeescapes)
    checkescapes = callable_includeescapes or not includeescapes
    for pos in significant_places:
        if instring and pos in end_set and lastpos != pos - lenstart:
            section_start = lastpos + lenstart
            section_end = pos - lenend
            section = source[section_start:section_end]
            if escape is not None and checkescapes:
                # Positions of the true escapes, relative to this section.
                escape_list = [
                    epos - section_start
                    for epos in true_escape_places
                    if section_start <= epos <= section_end
                ]
                new_section = ""
                last_epos = 0
                for epos in escape_list:
                    new_section += section[last_epos:epos]
                    if callable_includeescapes:
                        escaped = section[epos : epos + lenescape + 1]
                        replace_escape = includeescapes(escaped)
                        # TODO: deprecate old method of returning boolean from
                        # includeescape, by removing this if block
                        if not isinstance(replace_escape, str):
                            if replace_escape:
                                replace_escape = escaped
                            else:
                                replace_escape = section[
                                    epos + lenescape : epos + lenescape + 1
                                ]
                        new_section += replace_escape
                        last_epos = epos + lenescape + 1
                    else:
                        # Drop the escape itself, keep what follows it.
                        last_epos = epos + lenescape
                section = new_section + section[last_epos:]
            extracted += section
            instring = False
            lastpos = pos
        if not instring and pos in start_set and (allowreentry or not enteredonce):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # Unterminated section: take everything up to the end of source.
        section_start = lastpos + lenstart
        section = source[section_start:]
        # NOTE(review): unlike the loop above this only runs when
        # includeescapes is falsy, so a callable includeescapes is not applied
        # to an unterminated section. Kept as-is to preserve behaviour.
        if escape is not None and not includeescapes:
            escape_list = [
                epos - section_start
                for epos in true_escape_places
                if section_start <= epos
            ]
            new_section = ""
            last_epos = 0
            for epos in escape_list:
                new_section += section[last_epos:epos]
                if callable_includeescapes and includeescapes(
                    section[epos : epos + lenescape + 1]
                ):
                    last_epos = epos
                else:
                    last_epos = epos + lenescape
            section = new_section + section[last_epos:]
        extracted += section
    return (extracted, instring)


def _encode_entity_char(char, codepoint2name):
    """Return ``&name;`` if ``char`` has a name in ``codepoint2name``, else
    ``char`` unchanged.
    """
    charnum = ord(char)
    if charnum in codepoint2name:
        return "&%s;" % codepoint2name[charnum]
    return char


def entityencode(source, codepoint2name):
    """Encode ``source`` using entities from ``codepoint2name``.

    Text that already looks like an entity (``&`` up to the next ``;``) is
    passed through untouched; an ``&`` that turns out not to start an entity
    is encoded like any other character.

    :param unicode source: Source string to encode
    :param codepoint2name: Dictionary mapping code points to entity names
        (without the leading ``&`` or the trailing ``;``)
    :type codepoint2name: :meth:`dict`
    """
    output = []
    inentity = False
    possibleentity = ""
    for char in source:
        if char == "&":
            if inentity:
                # The pending "&..." was not an entity after all; emit it
                # instead of silently dropping it.
                output.append(_encode_entity_char("&", codepoint2name))
                output.append(entityencode(possibleentity, codepoint2name))
            inentity = True
            possibleentity = ""
            continue
        if not inentity:
            output.append(_encode_entity_char(char, codepoint2name))
        elif char == ";":
            output.append("&" + possibleentity + ";")
            inentity = False
        elif char == " ":
            # A space ends the candidate: it was plain text.
            output.append(_encode_entity_char("&", codepoint2name))
            output.append(entityencode(possibleentity + char, codepoint2name))
            inentity = False
        else:
            possibleentity += char
    if inentity:
        # Handle nonentities at end of string.
        output.append(_encode_entity_char("&", codepoint2name))
        output.append(entityencode(possibleentity, codepoint2name))

    return "".join(output)


def _has_entity_end(source):
    """Return True if a ``;`` occurs in ``source`` before any space."""
    for char in source:
        if char == ";":
            return True
        if char == " ":
            return False
    return False


def entitydecode(source, name2codepoint):
    """Decode ``source`` using entities from ``name2codepoint``.

    Unknown entities and stray ``&`` characters are left as they are. An
    entity decoding to ``&`` is left encoded when the text following it would
    then read as an entity itself.

    :param unicode source: Source string to decode
    :param name2codepoint: Dictionary mapping entity names (without the
        leading ``&`` or the trailing ``;``) to code points
    :type name2codepoint: :meth:`dict`
    """
    output = []
    inentity = False
    possibleentity = ""
    for i, char in enumerate(source):
        if char == "&":
            if inentity:
                # The pending "&..." was not an entity after all; emit it
                # instead of silently dropping it.
                output.append("&" + possibleentity)
            inentity = True
            possibleentity = ""
            continue
        if not inentity:
            output.append(char)
        elif char == ";":
            inentity = False
            if possibleentity and possibleentity in name2codepoint:
                entchar = chr(name2codepoint[possibleentity])
                if entchar == "&" and _has_entity_end(source[i + 1 :]):
                    output.append("&" + possibleentity + ";")
                else:
                    output.append(entchar)
            else:
                output.append("&" + possibleentity + ";")
        elif char == " ":
            output.append("&" + possibleentity + char)
            inentity = False
        else:
            possibleentity += char
    if inentity:
        # Handle nonentities at end of string.
        output.append("&" + possibleentity)
    return "".join(output)


def htmlentityencode(source):
    """Encode ``source`` using HTML entities e.g. © -> ``&copy;``

    :param unicode source: Source string to encode
    """
    return entityencode(source, html.entities.codepoint2name)


def htmlentitydecode(source):
    """Decode source using HTML entities e.g. ``&copy;`` -> ©.

    :param unicode source: Source string to decode
    """
    return entitydecode(source, html.entities.name2codepoint)


def javapropertiesencode(source):
    """Encodes source in the escaped-unicode encoding used by Java
    .properties files

    Control characters listed in ``controlchars`` are backslash-escaped, other
    characters below code point 128 are kept, and everything else becomes
    ``\\uXXXX``. A leading space is protected with a backslash.
    """
    output = []
    if source and source[0] == " ":
        output.append("\\")
    for char in source:
        charnum = ord(char)
        if char in controlchars:
            output.append(controlchars[char])
        elif charnum < 128:
            output.append(char)
        else:
            output.append("\\u%04X" % charnum)
    return "".join(output)


def java_utf8_properties_encode(source):
    """Encodes source in the escaped-unicode encoding used by java utf-8
    .properties files.

    Only the characters listed in ``controlchars`` are escaped.
    """
    return "".join(controlchars.get(char, char) for char in source)


def xwiki_properties_encode(source, encoding):
    """Encode ``source`` for an XWiki .properties file.

    If ``source`` contains a ``{N}`` placeholder, single quotes are doubled
    before the Java properties encoding matching ``encoding`` is applied.
    """
    if re.search(r"\{[0-9]+\}", source):
        source = source.replace("'", "''")
    if encoding == "utf-8":
        return java_utf8_properties_encode(source)
    return javapropertiesencode(source)


def escapespace(char):
    """Return ``char`` as a ``\\uXXXX`` escape if it is whitespace, otherwise
    unchanged. ``char`` must be a single character.
    """
    assert len(char) == 1
    if char.isspace():
        return "\\u%04X" % ord(char)
    return char


def mozillaescapemarginspaces(source):
    """Escape leading and trailing spaces for Mozilla .properties files."""
    if not source:
        return ""

    if len(source) == 1 and source.isspace():
        # FIXME: This is hack for people using white-space to mark empty
        # Mozilla strings translated, drop this once we have better way to
        # handle this in Pootle.
        return ""

    if len(source) == 1:
        return escapespace(source)
    return escapespace(source[0]) + source[1:-1] + escapespace(source[-1])


propertyescapes = {
    # escapes that are self-escaping
    "\\": "\\",
    "'": "'",
    '"': '"',
    # control characters that we keep
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
}

controlchars = {
    # the reverse of the above...
    "\\": "\\\\",
    "\f": "\\f",
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
}

# The index of a character in this string is its hexadecimal value.
_HEX_DIGITS = "0123456789abcdef"


def escapecontrols(source):
    """escape control characters in the given string"""
    # The backslash is the first key of controlchars, so it is doubled before
    # the other replacements introduce new backslashes.
    for key, value in controlchars.items():
        source = source.replace(key, value)
    return source


def propertiesdecode(source):
    """Decodes source from the escaped-unicode encoding used by .properties
    files.

    Java uses Latin1 by default, and Mozilla uses UTF-8 by default.

    Since the .decode("unicode-escape") routine decodes everything, and we
    don't want to we reimplemented the algorithm from Python Objects/unicode.c
    in Python and modify it to retain escaped control characters.

    Malformed ``\\N`` escapes (no ``{``, no ``}`` or an unknown character
    name) are logged as warnings and kept in the output verbatim.
    """
    output = []
    length = len(source)
    s = 0

    def unichr2(i):
        """Returns a Unicode string of one character with ordinal 32 <= i,
        otherwise an escaped control character.
        """
        char = chr(i)
        # Control characters from controlchars are returned unescaped; if
        # people want to escape them they can use escapecontrols.
        if i >= 32 or char in controlchars:
            return char
        return "\\u%04x" % i

    while s < length:
        c = source[s]
        s += 1
        if c != "\\":
            output.append(c)
            continue
        if s >= length:
            # this is an escape at the end of the line, which implies
            # a continuation..., return the escape to inform the parser
            output.append(c)
            continue
        c = source[s]
        s += 1
        if c == "\n":
            # Escaped newline: line continuation, nothing to output.
            continue
        # propertyescapes lookups
        if c in propertyescapes:
            output.append(propertyescapes[c])
        # \uXXXX escapes
        # \UXXXX escapes
        elif c in "uU":
            # Consume up to four hex digits. Only ASCII hex digits count:
            # str.isdigit() would also accept other Unicode digits.
            # NOTE(review): with no hex digit at all this still emits the
            # escape for code point 0, as it always did.
            x = 0
            digits = 0
            for candidate in source[s : s + 4]:
                value = _HEX_DIGITS.find(candidate.lower())
                if value == -1:
                    break
                x = (x << 4) + value
                digits += 1
            s += digits
            output.append(unichr2(x))
        elif c == "N":
            if s >= length or source[s] != "{":
                logging.warning("Invalid named unicode escape: no { after \\N")
                output.append("\\" + c)
                continue
            e = source.find("}", s + 1)
            if e == -1:
                logging.warning("Invalid named unicode escape: no } after \\N{")
                # s still points at "{", so the brace is kept in the output.
                output.append("\\" + c)
                continue
            name = source[s + 1 : e]
            try:
                output.append(unicodedata.lookup(name))
            except KeyError:
                logging.warning(
                    "Invalid named unicode escape: unknown character name %s", name
                )
                output.append("\\" + c + "{" + name + "}")
            s = e + 1
        else:
            output.append(c)  # Drop any \ that we don't specifically handle
    return "".join(output)


def xwiki_properties_decode(source):
    """Decode ``source`` from an XWiki .properties file.

    If ``source`` contains a ``{N}`` placeholder, doubled single quotes are
    collapsed before the properties decoding is applied.
    """
    if re.search(r"\{[0-9]+\}", source):
        source = source.replace("''", "'")
    return propertiesdecode(source)


def findend(string, substring):
    """Return the index just past the first occurrence of ``substring`` in
    ``string``, or -1 if it does not occur.
    """
    s = string.find(substring)
    if s != -1:
        s += len(substring)
    return s


def rstripeol(string):
    """Return ``string`` with trailing carriage returns and newlines removed."""
    return string.rstrip("\r\n")


def stripcomment(comment, startstring="<!--", endstring="-->"):
    """Return the stripped text between ``startstring`` and ``endstring``.

    A missing ``startstring`` means the text starts at the beginning of
    ``comment``; a missing ``endstring`` means it runs to the end.
    """
    cstart = comment.find(startstring)
    cstart = 0 if cstart == -1 else cstart + len(startstring)
    cend = comment.find(endstring, cstart)
    if cend == -1:
        # Without this, the -1 would be used as a slice end and silently drop
        # the last character.
        cend = len(comment)
    return comment[cstart:cend].strip()


def unstripcomment(comment, startstring="<!-- ", endstring=" -->\n"):
    """Wrap the stripped ``comment`` in ``startstring`` and ``endstring``."""
    return startstring + comment.strip() + endstring