1#
2# Copyright 2004-2008 Zuza Software Foundation
3#
4# This file is part of translate.
5#
6# translate is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation; either version 2 of the License, or
9# (at your option) any later version.
10#
11# translate is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, see <http://www.gnu.org/licenses/>.
18
19"""functions to get decorative/informative text out of strings..."""
20
21import re
22import unicodedata
23
24from translate.lang import data
25
26
def spacestart(str1):
    """Return the run of whitespace characters that opens the string.

    The result is empty when the string does not begin with whitespace and
    is the whole string when it consists of whitespace only.
    """
    # Whatever lstrip() removes is exactly the leading whitespace, so its
    # length tells us how much of the front to keep.
    leading = len(str1) - len(str1.lstrip())
    return str1[:leading]
36
37
def spaceend(str1):
    """Return the run of whitespace characters that closes the string.

    The result is empty when the string does not end with whitespace and is
    the whole string when it consists of whitespace only.
    """
    # Everything after the right-stripped text is the trailing whitespace.
    kept = len(str1.rstrip())
    return str1[kept:]
48
49
def puncstart(str1, punctuation):
    """Return the leading run of punctuation and whitespace in the string.

    :param str1: the string to inspect
    :param punctuation: container of characters that count as punctuation
    :return: the prefix of str1 made up solely of characters that are in
             ``punctuation`` or are whitespace
    """
    prefixlen = len(str1)
    for index, char in enumerate(str1):
        if char not in punctuation and not char.isspace():
            # First "real" character found: the decoration stops here.
            prefixlen = index
            break
    return str1[:prefixlen]
59
60
def puncend(str1, punctuation):
    """Return the trailing run of punctuation and whitespace in the string.

    :param str1: the string to inspect
    :param punctuation: container of characters that count as punctuation
    :return: the suffix of str1 made up solely of characters that are in
             ``punctuation`` or are whitespace, with every no-break space
             (U+00A0) in it replaced by an ordinary space
    """
    # Walk backwards to find where the last non-decorative character sits;
    # if there is none, the whole string is decoration.
    suffixstart = 0
    for index in range(len(str1) - 1, -1, -1):
        char = str1[index]
        if char not in punctuation and not char.isspace():
            suffixstart = index + 1
            break
    suffix = str1[suffixstart:]
    return suffix.replace("\u00a0", " ")
73
74
def ispurepunctuation(str1):
    """Check whether the string contains no alphanumeric characters at all.

    :return: ``False`` as soon as an alphanumeric character is present;
             otherwise the length of the string, which is truthy for any
             non-empty string and ``0`` for the empty string
    """
    if any(char.isalnum() for char in str1):
        return False
    return len(str1)
81
82
def isvalidaccelerator(accelerator, acceptlist=None):
    """Decide whether a character may be used as an accelerator key.

    :type accelerator: character
    :param accelerator: The character to be checked for accelerator validity
    :type acceptlist: String
    :param acceptlist: The characters that are permissible as accelerators;
                       when omitted, a built-in legacy heuristic is used
    :rtype: Boolean
    :return: True if the supplied character is an acceptable accelerator
    """
    assert isinstance(accelerator, str)
    assert acceptlist is None or isinstance(acceptlist, str)
    if not accelerator:
        return False

    if acceptlist is not None:
        # Explicit list supplied: membership in its normalized form decides.
        return accelerator in data.normalize(acceptlist)

    # Legacy path, kept as-is so that existing results do not regress.
    candidate = accelerator.replace("_", "")
    if candidate in "-?":
        return True
    if not candidate.isalnum():
        return False

    # Characters carrying diacritics are rejected: they decompose into more
    # than one code point.  Tags such as <compat> are dropped before counting.
    codepoints = re.sub("<[^>]+>", "", unicodedata.decomposition(candidate))
    return " " not in codepoints.strip()
118
119
def findaccelerators(str1, accelmarker, acceptlist=None):
    """Locate every accelerator in str1 that is introduced by accelmarker.

    :param str1: the string to search
    :param accelmarker: the marker string that precedes an accelerator
    :param acceptlist: passed through to :func:`isvalidaccelerator`
    :return: a pair of lists ``(valid, invalid)``; each entry is a tuple of
             the marker's position in str1 and the single accelerator
             character that follows the marker
    """
    valid = []
    invalid = []
    markerlen = len(accelmarker)
    searchfrom = 0
    while True:
        markerpos = str1.find(accelmarker, searchfrom)
        if markerpos < 0:
            break
        # Accelerators are assumed to be exactly one character long.
        charpos = markerpos + markerlen
        nextpos = charpos + 1
        if nextpos > len(str1):
            # Marker sits at the very end: nothing follows it.
            break
        accelerator = str1[charpos:nextpos]
        searchfrom = nextpos
        if isvalidaccelerator(accelerator, acceptlist):
            valid.append((markerpos, accelerator))
        else:
            invalid.append((markerpos, accelerator))
    return valid, invalid
143
144
def findmarkedvariables(str1, startmarker, endmarker, ignorelist=None):
    """Return all the variables in str1 marked with the given markers.

    :param str1: the string to search
    :param startmarker: the string that introduces a variable
    :param endmarker: how a variable is terminated: a string is an explicit
                      closing marker, an int is a fixed variable length
                      (usually 1), and None means the variable runs up to
                      the first character that is neither alphanumeric nor
                      an underscore (or to the end of the string)
    :param ignorelist: variable names that must not be reported; defaults
                       to ignoring nothing
    :return: a list of ``(position, variable)`` tuples, where position is
             the index of the start marker and variable is the name without
             its markers
    """
    if ignorelist is None:
        ignorelist = ()
    variables = []
    currentpos = 0
    while True:
        currentpos = str1.find(startmarker, currentpos)
        if currentpos < 0:
            break
        variable = None
        startmatch = currentpos
        currentpos += len(startmarker)
        if endmarker is None:
            # No end marker: the name extends to the first non-identifier
            # character, or to the end of the string if there is none.
            endmatch = len(str1)
            for n in range(currentpos, len(str1)):
                if not (str1[n].isalnum() or str1[n] == "_"):
                    endmatch = n
                    break
            # An empty name (marker directly followed by a non-identifier
            # character, or marker at the end) is not a variable; carry on
            # searching right after the marker instead of discarding the
            # rest of the string.
            if currentpos < endmatch:
                variable = str1[currentpos:endmatch]
            currentpos = endmatch
        elif isinstance(endmarker, int):
            # An int end marker means a fixed-length variable string
            # (usually endmarker == 1).
            endmatch = currentpos + endmarker
            if endmatch > len(str1):
                break
            variable = str1[currentpos:endmatch]
            currentpos = endmatch
        else:
            endmatch = str1.find(endmarker, currentpos)
            if endmatch == -1:
                break
            # Search backwards in case there's an intervening start marker;
            # if so the variable begins at the one closest to the end marker.
            start2 = str1.rfind(startmarker, currentpos, endmatch)
            if start2 != -1:
                startmatch2 = start2
                start2 += len(startmarker)
                if start2 != currentpos:
                    currentpos = start2
                    startmatch = startmatch2
            variable = str1[currentpos:endmatch]
            currentpos = endmatch + len(endmarker)
        if variable is None or variable in ignorelist:
            continue
        # Accept empty names and names built from alphanumerics, "_" and ".".
        if not variable or variable.replace("_", "").replace(".", "").isalnum():
            variables.append((startmatch, variable))
    return variables
197
198
def getaccelerators(accelmarker, acceptlist=None):
    """Build a function that extracts the accelerators marked with
    accelmarker from a string.

    The returned function takes a string and gives back two lists: the
    valid accelerator characters and the invalid ones, in order of
    appearance and without their positions.
    """

    def getmarkedaccelerators(str1):
        """Return the (valid, invalid) accelerator characters found in str1."""
        valid, invalid = findaccelerators(str1, accelmarker, acceptlist)
        validchars = [char for _position, char in valid]
        invalidchars = [char for _position, char in invalid]
        return validchars, invalidchars

    return getmarkedaccelerators
212
213
def getvariables(startmarker, endmarker):
    """Build a function that extracts the variables delimited by
    startmarker and endmarker from a string.

    The returned function takes a string and gives back the variable names
    in order of appearance, without their positions.
    """

    def getmarkedvariables(str1):
        """Return the names of the marked variables found in str1."""
        located = findmarkedvariables(str1, startmarker, endmarker)
        return [name for _position, name in located]

    return getmarkedvariables
225
226
def getnumbers(str1):
    """Collect the numbers that appear in the string.

    A number starts at a digit and may contain periods and degree signs
    (U+00B0).  Periods are only kept when more digits follow them, so a
    sentence-ending period is not swallowed into the number.

    :return: the numbers as strings, in order of appearance
    """
    # TODO: handle locale-based periods e.g. 2,5 for Afrikaans
    assert isinstance(str1, str)
    degree = "\xb0"
    found = []
    current = ""
    pendingperiods = ""
    inside = False
    for char in str1:
        if char.isdigit():
            inside = True
        elif inside and char != "." and char != degree:
            # Any other character closes the number being collected.
            inside = False
            if current:
                found.append(current)
            current = ""

        if not inside:
            pendingperiods = ""
        elif char == degree:
            current += char
        elif char == ".":
            # Hold periods back until we know a digit follows them.
            pendingperiods += char
        else:
            current += pendingperiods + char
            pendingperiods = ""
    if inside and current:
        found.append(current)
    return found
259
260
# Matches a run of one or more back-to-back call-like tokens, each of which
# must end in a literal "()".  The single capturing group wraps the whole run,
# so findall() yields one string per run.
_function_re = re.compile(
    r"""((?:
    [\w\.]+              # function or module name - any alpha-numeric character, _, or .
    (?:(?:::|->|\.)\w+)* # (optional) C++ style Class::Method() syntax or pointer->Method() or module.function()
    \(\)                 # Must close with ()
)+)
""",
    re.VERBOSE,
)  # shouldn't be locale aware
# Reference functions:
#   pam_*_item() IO::String NULL() POE::Component::Client::LDAP->new()
#   POE::Wheel::Null mechanize.UserAgent POSIX::sigaction()
#   window.resizeBy() @fptr()
# NOTE: the pattern requires a trailing "()", so the entries above that lack
# one are not matched by it.
274
275
def getfunctions(str1):
    """Return the function references, such as ``name()``, found in the
    string.

    :return: a list of the matched function references; empty when the
             string contains no ``()`` at all
    """
    # Cheap substring test first so the regex only runs when it can match.
    if "()" not in str1:
        return []
    return _function_re.findall(str1)
284
285
def getemails(str1):
    """Return the email addresses found in the string.

    An address is taken to be a run of word characters, periods and hyphens
    on either side of an ``@`` sign.
    """
    emailpattern = re.compile(r"[\w\.\-]+@[\w\.\-]+")
    return emailpattern.findall(str1)
289
290
def geturls(str1):
    """Return the URIs found in the string.

    Recognizes text starting with ``http:``/``https:``, ``www.`` or
    ``ftp:`` followed by characters that commonly occur in URIs.
    """
    # TODO turn this into a verbose and compiled regex
    # Adjacent literals are joined by the parser into one alternation.
    urlpattern = (
        r"https?:[\w/\.:;+\-~\%#\$?=&,()]+|"
        r"www\.[\w/\.:;+\-~\%#\$?=&,()]+|"
        r"ftp:[\w/\.:;+\-~\%#?=&,]+"
    )
    matcher = re.compile(urlpattern)
    return matcher.findall(str1)
300
301
def countaccelerators(accelmarker, acceptlist=None):
    """Build a function that counts the accelerators marked with
    accelmarker in a string.

    The returned function takes a string and gives back a pair: the number
    of valid accelerators and the number of invalid ones.
    """

    def countmarkedaccelerators(str1):
        """Return the (valid, invalid) accelerator counts for str1."""
        valid, invalid = findaccelerators(str1, accelmarker, acceptlist)
        counts = (len(valid), len(invalid))
        return counts

    return countmarkedaccelerators
313