1#
2# Copyright 2002-2006 Zuza Software Foundation
3#
4# This file is part of translate.
5#
6# translate is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation; either version 2 of the License, or
9# (at your option) any later version.
10#
11# translate is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, see <http://www.gnu.org/licenses/>.
18
19"""String processing utilities for extracting strings with various kinds of
20delimiters
21"""
22
23import html.entities
24import logging
25import re
26
27
def find_all(searchin, substr):
    """Return the start indices of all non-overlapping occurrences of substr.

    The search runs left to right and resumes just past the end of each
    match, so occurrences are never allowed to overlap ("ana" is found only
    once in "banana").

    :param searchin: the string to search in
    :param substr: the string to look for
    :return: list of indices in ascending order; empty when ``substr`` is
             empty or does not occur in ``searchin``
    """
    substr_len = len(substr)
    if not substr_len:
        # An empty needle matches everywhere without ever advancing the
        # search position, which would loop forever.
        return []
    locations = []
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        # Skip the whole match so the next hit cannot overlap this one.
        location = searchin.find(substr, location + substr_len)
    return locations
41
42
def extract(
    source, startdelim, enddelim, escape=None, startinstring=False, allowreentry=True
):
    """Extract the delimited sections of ``source``, delimiters included.

    Every section running from a start delimiter to the next end delimiter
    is collected and the sections are concatenated, e.g.
    ``extract("the <quoted> part", "<", ">")`` gives ``("<quoted>", False)``.

    :param source: the string to scan
    :param startdelim: string that opens a section
    :param enddelim: string that closes a section (may equal ``startdelim``)
    :param escape: optional escape string; a delimiter directly preceded by
           an escape is ignored, unless that escape is itself escaped
    :param startinstring: treat ``source`` as starting inside a section, so
           everything up to the first end delimiter is extracted
    :param allowreentry: when false, once one start delimiter has opened a
           section all later start delimiters are ignored
    :return: tuple ``(extracted, instring)``; ``instring`` is true when
             ``source`` ended inside an unterminated section, in which case
             the rest of ``source`` is part of ``extracted``
    """
    # Note that this returns the quote characters as well... even internally
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places
    else:
        enddelim_places = find_all(source, enddelim)
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        escape_lookup = set(escape_places)
        # Filter escaped escapes: in a run of adjacent escapes every second
        # one is itself escaped and so does not escape what follows it.
        true_escape = False
        true_escape_places = set()
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_lookup:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.add(escape_pos)
        startdelim_places = [
            pos
            for pos in startdelim_places
            if pos - lenescape not in true_escape_places
        ]
        enddelim_places = [
            pos for pos in enddelim_places if pos - lenescape not in true_escape_places
        ]
    # Sets give constant-time membership tests in the scan below.
    start_lookup = set(startdelim_places)
    # End places are recorded as the position just past the end delimiter.
    end_lookup = {pos + lenend for pos in enddelim_places}
    # Unique sorted list of the significant places in the string.  It has to
    # be unique: visiting a position twice would immediately close a section
    # that was opened there on the first visit.
    significant_places = sorted(start_lookup | end_lookup)
    parts = []
    lastpos = None
    for pos in significant_places:
        if instring and pos in end_lookup:
            # Make sure that if startdelim == enddelim we don't get confused
            # and count the same string as start and end.
            if lastpos == pos - lenstart and lastpos in start_lookup:
                continue
            # lastpos is None when we started in a string: slice from 0.
            parts.append(source[lastpos:pos])
            instring = False
            lastpos = pos
        if not instring and pos in start_lookup and (allowreentry or not enteredonce):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # Unterminated section: take everything that is left.
        parts.append(source[lastpos:])
    return ("".join(parts), instring)
110
111
def extractwithoutquotes(
    source,
    startdelim,
    enddelim,
    escape=None,
    startinstring=False,
    includeescapes=True,
    allowreentry=True,
):
    """Extract the delimited sections of ``source`` without the delimiters.

    Works like :func:`extract` but drops the delimiters from the result and
    can optionally rewrite escape sequences inside the sections.

    :param source: the string to scan
    :param startdelim: string that opens a section
    :param enddelim: string that closes a section (may equal ``startdelim``)
    :param escape: optional escape string; a delimiter directly preceded by
           an escape is ignored, unless that escape is itself escaped
    :param startinstring: treat ``source`` as starting inside a section
    :param includeescapes: true keeps escapes in the output, false removes
           the escape strings (the escaped character stays).  It can also be
           a function that takes the escape plus the following character and
           returns the replacement string; for the deprecated boolean return
           value, true keeps escape and character and false keeps only the
           character.
    :param allowreentry: when false, once one start delimiter has opened a
           section all later start delimiters are ignored
    :return: tuple ``(extracted, instring)``; ``instring`` is true when
             ``source`` ended inside an unterminated section
    """
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places
    else:
        enddelim_places = find_all(source, enddelim)
    # Kept as an ordered list because the escape rewriting below walks it in
    # source order; the sets are only used for membership tests.
    true_escape_places = []
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        escape_lookup = set(escape_places)
        # Filter escaped escapes: in a run of adjacent escapes every second
        # one is itself escaped and so does not escape what follows it.
        true_escape = False
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_lookup:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.append(escape_pos)
        true_escape_lookup = set(true_escape_places)
        startdelim_places = [
            pos
            for pos in startdelim_places
            if pos - lenescape not in true_escape_lookup
        ]
        enddelim_places = [
            pos for pos in enddelim_places if pos - lenescape not in true_escape_lookup
        ]
    start_lookup = set(startdelim_places)
    # End places are recorded as the position just past the end delimiter.
    end_lookup = {pos + lenend for pos in enddelim_places}
    # Unique sorted list of the significant places in the string.  It has to
    # be unique: visiting a position twice would immediately close a section
    # that was opened there on the first visit.
    significant_places = sorted(start_lookup | end_lookup)
    extracted = []
    # NOTE(review): with startinstring the first section is still sliced
    # from lastpos + len(startdelim), i.e. it skips the first len(startdelim)
    # characters of source -- confirm callers rely on this.
    lastpos = 0
    callable_includeescapes = callable(includeescapes)
    checkescapes = callable_includeescapes or not includeescapes
    for pos in significant_places:
        if instring and pos in end_lookup and lastpos != pos - lenstart:
            section_start, section_end = lastpos + lenstart, pos - lenend
            section = source[section_start:section_end]
            if escape is not None and checkescapes:
                # Escape positions relative to the start of this section.
                escape_list = [
                    epos - section_start
                    for epos in true_escape_places
                    if section_start <= epos <= section_end
                ]
                new_section = ""
                last_epos = 0
                for epos in escape_list:
                    new_section += section[last_epos:epos]
                    if callable_includeescapes:
                        replace_escape = includeescapes(
                            section[epos : epos + lenescape + 1]
                        )
                        # TODO: deprecate old method of returning boolean from
                        # includeescape, by removing this if block
                        if not isinstance(replace_escape, str):
                            if replace_escape:
                                replace_escape = section[epos : epos + lenescape + 1]
                            else:
                                replace_escape = section[
                                    epos + lenescape : epos + lenescape + 1
                                ]
                        new_section += replace_escape
                        last_epos = epos + lenescape + 1
                    else:
                        # Drop only the escape itself, keep what it escaped.
                        last_epos = epos + lenescape
                section = new_section + section[last_epos:]
            extracted.append(section)
            instring = False
            lastpos = pos
        if not instring and pos in start_lookup and (allowreentry or not enteredonce):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # Unterminated section: take everything after the start delimiter.
        section_start = lastpos + lenstart
        section = source[section_start:]
        # NOTE(review): unlike the loop above this only runs when
        # includeescapes is falsy, so a callable includeescapes is not
        # applied to the unterminated tail -- confirm this is intended.
        if escape is not None and not includeescapes:
            escape_list = [
                epos - section_start
                for epos in true_escape_places
                if section_start <= epos
            ]
            new_section = ""
            last_epos = 0
            for epos in escape_list:
                new_section += section[last_epos:epos]
                if callable_includeescapes and includeescapes(
                    section[epos : epos + lenescape + 1]
                ):
                    last_epos = epos
                else:
                    last_epos = epos + lenescape
            section = new_section + section[last_epos:]
        extracted.append(section)
    return ("".join(extracted), instring)
232
233
def _encode_entity_char(char, codepoint2name):
    """Return the ``&name;`` reference for ``char``, or ``char`` itself.

    :param char: a single character
    :param codepoint2name: dictionary mapping code points to entity names
    :return: ``&name;`` when the code point of ``char`` is a key of
             ``codepoint2name``, otherwise ``char`` unchanged
    """
    codepoint = ord(char)
    if codepoint not in codepoint2name:
        return char
    return "&%s;" % codepoint2name[codepoint]
240
241
def entityencode(source, codepoint2name):
    """Encode ``source`` using entities from ``codepoint2name``.

    Text that already has the shape of an entity reference (``&``, then any
    characters other than space and ``&``, then ``;``) is passed through
    unchanged.  An ``&`` that is followed by a space, another ``&`` or the
    end of the string before any ``;`` is not a reference, so it and the
    text after it are encoded like ordinary characters.

    :param unicode source: Source string to encode
    :param codepoint2name: Dictionary mapping code points to entity names
           (without the leading ``&`` or the trailing ``;``)
    :type codepoint2name: :meth:`dict`
    :return: the encoded string
    """
    encoded_amp = _encode_entity_char("&", codepoint2name)

    def encode_plain(text):
        # text never contains "&" or ";" here, so a per-character encode is
        # all that is needed.
        return "".join(_encode_entity_char(char, codepoint2name) for char in text)

    output = []
    # None while outside a possible entity, otherwise the text collected
    # since the "&" that opened it.
    possibleentity = None
    for char in source:
        if char == "&":
            if possibleentity is not None:
                # The previous "&..." was never closed: it is plain text and
                # must be emitted rather than discarded.
                output.append(encoded_amp + encode_plain(possibleentity))
            possibleentity = ""
        elif possibleentity is None:
            output.append(_encode_entity_char(char, codepoint2name))
        elif char == ";":
            # Looks like an existing entity reference: keep it verbatim.
            output.append("&" + possibleentity + ";")
            possibleentity = None
        elif char == " ":
            output.append(encoded_amp + encode_plain(possibleentity + char))
            possibleentity = None
        else:
            possibleentity += char
    if possibleentity is not None:
        # Handle nonentities at end of string.
        output.append(encoded_amp + encode_plain(possibleentity))
    return "".join(output)
277
278
def _has_entity_end(source):
    """Tell whether ``source`` reaches a ``;`` before it reaches a space.

    :param source: the text following a possible entity start
    :return: True when ``source`` contains a ``;`` with no space before it,
             otherwise False
    """
    semicolon = source.find(";")
    if semicolon == -1:
        return False
    space = source.find(" ")
    return space == -1 or semicolon < space
286
287
def entitydecode(source, name2codepoint):
    """Decode ``source`` using entities from ``name2codepoint``.

    References whose name is not in ``name2codepoint``, and ``&`` runs that
    are ended by a space, another ``&`` or the end of the string, are copied
    to the output unchanged.  A reference that decodes to ``&`` is left
    encoded when the text after it reaches a ``;`` before a space, so that
    decoding cannot create a new entity reference.

    :param unicode source: Source string to decode
    :param name2codepoint: Dictionary mapping entity names (without the
           leading ``&`` or the trailing ``;``) to code points
    :type name2codepoint: :meth:`dict`
    :return: the decoded string
    """
    output = []
    # None while outside a possible entity, otherwise the text collected
    # since the "&" that opened it.
    possibleentity = None
    for i, char in enumerate(source):
        if char == "&":
            if possibleentity is not None:
                # The previous "&..." was never closed: it is plain text and
                # must be emitted rather than discarded.
                output.append("&" + possibleentity)
            possibleentity = ""
        elif possibleentity is None:
            output.append(char)
        elif char == ";":
            if possibleentity and possibleentity in name2codepoint:
                entchar = chr(name2codepoint[possibleentity])
                if entchar == "&" and _has_entity_end(source[i + 1 :]):
                    output.append("&" + possibleentity + ";")
                else:
                    output.append(entchar)
            else:
                # Unknown or empty name: leave the reference as it is.
                output.append("&" + possibleentity + ";")
            possibleentity = None
        elif char == " ":
            output.append("&" + possibleentity + char)
            possibleentity = None
        else:
            possibleentity += char
    if possibleentity is not None:
        # Handle nonentities at end of string.
        output.append("&" + possibleentity)
    return "".join(output)
327
328
def htmlentityencode(source):
    """Encode ``source`` using HTML entities e.g. © -> ``&copy;``

    The entity names come from ``html.entities.codepoint2name``.

    :param unicode source: Source string to encode
    :return: the result of :func:`entityencode` for that table
    """
    html_names = html.entities.codepoint2name
    return entityencode(source, html_names)
335
336
def htmlentitydecode(source):
    """Decode source using HTML entities e.g. ``&copy;`` -> ©.

    The entity names come from ``html.entities.name2codepoint``.

    :param unicode source: Source string to decode
    :return: the result of :func:`entitydecode` for that table
    """
    html_codepoints = html.entities.name2codepoint
    return entitydecode(source, html_codepoints)
343
344
def javapropertiesencode(source):
    """Encodes source in the escaped-unicode encoding used by Java
    .properties files.

    - A leading space is protected with a backslash.
    - Characters that are keys of ``controlchars`` are replaced by their
      escaped form.
    - Other characters below 128 are copied unchanged.
    - Everything else becomes a ``\\uXXXX`` escape with exactly four hex
      digits; characters above U+FFFF are written as a UTF-16 surrogate
      pair (two escapes), because a single escape cannot hold more than four
      digits.

    :param source: the string to encode
    :return: the encoded string
    """
    output = []
    if source and source[0] == " ":
        output.append("\\")
    for char in source:
        charnum = ord(char)
        if char in controlchars:
            output.append(controlchars[char])
        elif charnum < 128:
            output.append(char)
        elif charnum > 0xFFFF:
            # Split into high and low surrogate so that each escape keeps
            # the fixed four-digit width.
            offset = charnum - 0x10000
            output.append(
                "\\u%04X\\u%04X" % (0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF))
            )
        else:
            output.append("\\u%04X" % charnum)
    return "".join(output)
361
362
def java_utf8_properties_encode(source):
    """Encodes source in the escaped-unicode encoding used by java utf-8
    .properties files.

    Only the characters that are keys of ``controlchars`` are replaced (by
    their escaped form); every other character is copied unchanged.
    """
    return "".join(controlchars.get(char, char) for char in source)
374
375
def xwiki_properties_encode(source, encoding):
    """Encode ``source`` for an XWiki .properties file.

    When ``source`` contains a numbered placeholder such as ``{0}``, every
    single quote is doubled first.  The result is then encoded with
    :func:`java_utf8_properties_encode` if ``encoding`` is ``"utf-8"`` and
    with :func:`javapropertiesencode` otherwise.
    """
    has_placeholder = re.search(r"\{[0-9]+\}", source) is not None
    if has_placeholder:
        source = source.replace("'", "''")
    encoder = (
        java_utf8_properties_encode if encoding == "utf-8" else javapropertiesencode
    )
    return encoder(source)
383
384
def escapespace(char):
    """Return the four-digit unicode escape for a whitespace character.

    :param char: a string of exactly one character (checked by an assert)
    :return: backslash, ``u`` and the uppercase hex code point padded to
             four digits when ``char`` is whitespace, otherwise ``char``
             unchanged
    """
    assert len(char) == 1
    return "\\u%04X" % ord(char) if char.isspace() else char
390
391
def mozillaescapemarginspaces(source):
    """Escape leading and trailing spaces for Mozilla .properties files.

    Only the first and the last character are passed through
    :func:`escapespace`; the characters in between are copied unchanged.
    An empty string, or a string that is a single whitespace character,
    gives an empty string.
    """
    if not source:
        return ""

    if len(source) > 1:
        return escapespace(source[0]) + source[1:-1] + escapespace(source[-1])

    # From here on source is exactly one character long.
    if source.isspace():
        # FIXME: This is hack for people using white-space to mark empty
        # Mozilla strings translated, drop this once we have better way to
        # handle this in Pootle.
        return ""

    return escapespace(source)
407
408
# Maps the character that follows a backslash to the character it stands for.
# Escapes that are self-escaping come first...
propertyescapes = {char: char for char in ("\\", "'", '"')}
# ...followed by the control characters that we keep.
propertyescapes.update(f="\f", n="\n", r="\r", t="\t")
420
# Maps a raw character to its backslash-escaped spelling.  The backslash is
# deliberately the first key: code that applies these replacements one after
# another in dictionary order must double existing backslashes before it adds
# new ones.
controlchars = {"\\": "\\\\"}
controlchars.update(
    (char, "\\" + letter) for char, letter in zip("\f\n\r\t", "fnrt")
)
429
430
def escapecontrols(source):
    """Escape control characters in the given string.

    Every character that is a key of ``controlchars`` is replaced by its
    escaped form; all other characters are left as they are.
    """
    # All keys are single characters, so one translation pass gives the same
    # result as replacing them one after another.
    return source.translate(str.maketrans(controlchars))
436
437
def propertiesdecode(source):
    """Decodes source from the escaped-unicode encoding used by .properties
    files.

    Java uses Latin1 by default, and Mozilla uses UTF-8 by default.

    Since the .decode("unicode-escape") routine decodes everything, and we
    don't want to, the algorithm from Python Objects/unicode.c is
    reimplemented here and modified to retain escaped control characters.

    Escapes are handled as follows:

    - backslash followed by a newline is dropped (line continuation)
    - backslash followed by a key of ``propertyescapes`` gives the mapped
      character
    - ``\\u`` or ``\\U`` followed by up to four hex digits gives that code
      point; a high surrogate immediately followed by an escaped low
      surrogate gives the single character the pair encodes
    - ``\\N{NAME}`` gives the character with that Unicode name; malformed
      or unknown names are logged and left in the output as written
    - any other escaped character is kept without its backslash
    - a single backslash at the very end is kept, so that the parser can
      see the continuation

    :param source: the string to decode
    :return: the decoded string
    """
    hexdigits = "0123456789abcdefABCDEF"
    length = len(source)
    output = []
    s = 0

    def unichr2(i):
        """Returns a Unicode string of one character with ordinal 32 <= i,
        otherwise an escaped control character.
        """
        if 32 <= i:
            return chr(i)
        elif chr(i) in controlchars:
            # we just return the character, unescaped
            # if people want to escape them they can use escapecontrols
            return chr(i)
        return "\\u%04x" % i

    def read_hex(start):
        """Read up to four ASCII hex digits at ``start``.

        Returns the tuple (value, number of digits consumed).
        """
        value = 0
        count = 0
        while (
            count < 4
            and start + count < length
            and source[start + count] in hexdigits
        ):
            value = (value << 4) + int(source[start + count], 16)
            count += 1
        return value, count

    while s < length:
        c = source[s]
        if c != "\\":
            output.append(c)
            s += 1
            continue
        s += 1
        if s >= length:
            # this is an escape at the end of the line, which implies
            # a continuation..., return the escape to inform the parser
            output.append(c)
            continue
        c = source[s]
        s += 1
        if c == "\n":
            pass
        # propertyescapes lookups
        elif c in propertyescapes:
            output.append(propertyescapes[c])
        # \uXXXX escapes
        # \UXXXX escapes
        elif c in "uU":
            x, digits = read_hex(s)
            s += digits
            if (
                digits == 4
                and 0xD800 <= x <= 0xDBFF
                and source[s : s + 2] in ("\\u", "\\U")
            ):
                # A high surrogate: join it with a directly following low
                # surrogate escape instead of emitting two lone surrogates.
                low, low_digits = read_hex(s + 2)
                if low_digits == 4 and 0xDC00 <= low <= 0xDFFF:
                    x = 0x10000 + ((x - 0xD800) << 10) + (low - 0xDC00)
                    s += 2 + low_digits
            output.append(unichr2(x))
        elif c == "N":
            # Slice rather than index: the escape may be the last thing in
            # the string.
            if source[s : s + 1] != "{":
                logging.warning("Invalid named unicode escape: no { after \\N")
                output.append("\\" + c)
                continue
            e = source.find("}", s + 1)
            if e == -1:
                logging.warning("Invalid named unicode escape: no } after \\N{")
                output.append("\\" + c)
                # s still points at the "{", so it is copied on the next
                # iteration instead of being lost.
                continue
            import unicodedata

            name = source[s + 1 : e]
            try:
                output.append(unicodedata.lookup(name))
            except KeyError:
                logging.warning(
                    "Invalid named unicode escape: unknown character name %r", name
                )
                # Keep the whole escape, from its backslash to the "}".
                output.append(source[s - 2 : e + 1])
            s = e + 1
        else:
            output.append(c)  # Drop any \ that we don't specifically handle
    return "".join(output)
522
523
def xwiki_properties_decode(source):
    """Decode ``source`` as read from an XWiki .properties file.

    When ``source`` contains a numbered placeholder such as ``{0}``, every
    doubled single quote is collapsed to one first.  The result is then
    passed through :func:`propertiesdecode`.
    """
    has_placeholder = re.search(r"\{[0-9]+\}", source) is not None
    unquoted = source.replace("''", "'") if has_placeholder else source
    return propertiesdecode(unquoted)
528
529
def findend(string, substring):
    """Return the index just past the first occurrence of ``substring``.

    :param string: the string to search in
    :param substring: the string to look for
    :return: index of the first character after the match, or -1 when
             ``substring`` does not occur in ``string``
    """
    start = string.find(substring)
    return start if start == -1 else start + len(substring)
535
536
def rstripeol(string):
    """Return ``string`` without any trailing carriage returns or newlines."""
    eol_chars = "\r\n"
    return string.rstrip(eol_chars)
539
540
def stripcomment(comment, startstring="<!--", endstring="-->"):
    """Return the text of ``comment`` without its comment markers.

    The text is taken from just after the first ``startstring`` (or from the
    beginning when there is none) up to the next ``endstring`` (or to the end
    when there is none), with surrounding whitespace stripped.

    :param comment: the string holding the comment
    :param startstring: marker that opens the comment
    :param endstring: marker that closes the comment
    :return: the stripped comment text
    """
    cstart = comment.find(startstring)
    cstart = 0 if cstart == -1 else cstart + len(startstring)
    cend = comment.find(endstring, cstart)
    if cend == -1:
        # No closing marker: keep everything.  Slicing with -1 would
        # silently cut off the last character.
        cend = len(comment)
    return comment[cstart:cend].strip()
549
550
def unstripcomment(comment, startstring="<!-- ", endstring=" -->\n"):
    """Wrap ``comment`` in comment markers.

    :param comment: the comment text; surrounding whitespace is stripped
    :param startstring: text placed in front of the comment
    :param endstring: text placed after the comment
    :return: ``startstring``, the stripped comment and ``endstring`` joined
    """
    return "".join((startstring, comment.strip(), endstring))
553