#
# Copyright 2004-2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""functions to get decorative/informative text out of strings..."""

import re
import unicodedata

from translate.lang import data


def spacestart(str1):
    """returns all the whitespace from the start of the string

    :param str1: the string to inspect
    :return: the leading run of characters for which ``isspace()`` is true
        (empty if the string does not start with whitespace)
    """
    end = 0
    length = len(str1)
    while end < length and str1[end].isspace():
        end += 1
    return str1[:end]


def spaceend(str1):
    """returns all the whitespace from the end of the string

    :param str1: the string to inspect
    :return: the trailing run of characters for which ``isspace()`` is true
        (empty if the string does not end with whitespace)
    """
    # Walk an index backwards and slice once instead of prepending one
    # character at a time.
    start = len(str1)
    while start > 0 and str1[start - 1].isspace():
        start -= 1
    return str1[start:]


def puncstart(str1, punctuation):
    """returns all the punctuation from the start of the string

    Whitespace counts as punctuation for this purpose.

    :param str1: the string to inspect
    :param punctuation: a container of characters (tested with ``in``) that
        are regarded as punctuation
    :return: the leading run of punctuation/whitespace characters
    """
    end = 0
    length = len(str1)
    while end < length and (str1[end] in punctuation or str1[end].isspace()):
        end += 1
    return str1[:end]


def puncend(str1, punctuation):
    """returns all the punctuation from the end of the string

    Whitespace counts as punctuation for this purpose.  Unlike
    :func:`puncstart`, any no-break space (U+00A0) in the result is replaced
    with a plain space.

    :param str1: the string to inspect
    :param punctuation: a container of characters (tested with ``in``) that
        are regarded as punctuation
    :return: the trailing run of punctuation/whitespace characters
    """
    # An implementation with regular expressions was slightly slower.

    start = len(str1)
    while start > 0 and (
        str1[start - 1] in punctuation or str1[start - 1].isspace()
    ):
        start -= 1
    return str1[start:].replace("\u00a0", " ")


def ispurepunctuation(str1):
    """checks whether the string is entirely punctuation

    :param str1: the string to inspect
    :return: ``False`` if any character is alphanumeric, otherwise the length
        of the string.  The result is therefore falsy for an empty string and
        truthy (a positive integer, not ``True``) for a non-empty string
        without alphanumeric characters.
    """
    if any(c.isalnum() for c in str1):
        return False
    # Returning the length (rather than a bool) makes "" count as not pure
    # punctuation while keeping the historical return value for callers.
    return len(str1)


def isvalidaccelerator(accelerator, acceptlist=None):
    """returns whether the given accelerator character is valid

    :type accelerator: character
    :param accelerator: A character to be checked for accelerator validity
    :type acceptlist: String
    :param acceptlist: A list of characters that are permissible as
        accelerators
    :rtype: Boolean
    :return: True if the supplied character is an acceptable accelerator
    """
    assert isinstance(accelerator, str)
    assert isinstance(acceptlist, str) or acceptlist is None
    if len(accelerator) == 0:
        return False
    if acceptlist is not None:
        # The accept list is passed through data.normalize() before the
        # membership test.
        acceptlist = data.normalize(acceptlist)
        return accelerator in acceptlist

    # Old code path - ensures that we don't get a large number of
    # regressions
    accelerator = accelerator.replace("_", "")
    # NOTE: an accelerator of "_" is empty at this point, and the empty
    # string is "in" every string, so it is accepted here.
    if accelerator in "-?":
        return True
    if not accelerator.isalnum():
        return False

    # We don't want to have accelerators on characters with diacritics,
    # so let's see if the character can decompose.
    decomposition = unicodedata.decomposition(accelerator)
    # Next we strip out any extra information like <this>
    decomposition = re.sub("<[^>]+>", "", decomposition).strip()
    # More than one code point left (space separated) means the character
    # decomposes into a base character plus something else.
    return decomposition.count(" ") == 0


def findaccelerators(str1, accelmarker, acceptlist=None):
    """returns all the accelerators and locations in str1 marked with a given
    marker

    :param str1: the string to search
    :param accelmarker: the marker that precedes an accelerator character
    :param acceptlist: passed on to :func:`isvalidaccelerator`
    :return: a tuple ``(accelerators, badaccelerators)``; each is a list of
        ``(position_of_marker, accelerator_character)`` tuples, split by
        whether :func:`isvalidaccelerator` accepted the character
    """
    accelerators = []
    badaccelerators = []
    currentpos = 0
    while currentpos >= 0:
        currentpos = str1.find(accelmarker, currentpos)
        if currentpos >= 0:
            accelstart = currentpos
            currentpos += len(accelmarker)
            # we assume accelerators are single characters
            accelend = currentpos + 1
            if accelend > len(str1):
                # marker at the very end of the string: nothing follows it
                break
            accelerator = str1[currentpos:accelend]
            currentpos = accelend
            if isvalidaccelerator(accelerator, acceptlist):
                accelerators.append((accelstart, accelerator))
            else:
                badaccelerators.append((accelstart, accelerator))
    return accelerators, badaccelerators


def findmarkedvariables(str1, startmarker, endmarker, ignorelist=None):
    """returns all the variables and locations in str1 marked with a given
    marker

    :param str1: the string to search
    :param startmarker: the string that introduces a variable
    :param endmarker: how a variable ends:

        - ``None``: the variable is the run of alphanumeric/underscore
          characters following the start marker
        - an ``int``: the variable is exactly that many characters long
        - otherwise: a string that terminates the variable
    :param ignorelist: variable names that must not be reported (defaults to
        none)
    :return: a list of ``(position_of_startmarker, variable)`` tuples, the
        variable being given without its markers
    """
    if ignorelist is None:
        ignorelist = ()
    variables = []
    currentpos = 0
    while currentpos >= 0:
        variable = None
        currentpos = str1.find(startmarker, currentpos)
        if currentpos >= 0:
            startmatch = currentpos
            currentpos += len(startmarker)
            if endmarker is None:
                # handle case without an end marker - use any character that
                # is not alphanumeric or "_" as the end marker; the variable
                # must not be empty
                for n in range(currentpos, len(str1)):
                    if not (str1[n].isalnum() or str1[n] == "_"):
                        endmatch = n
                        break
                else:
                    # no terminating character: variable runs to the end
                    endmatch = len(str1)
                # A marker immediately followed by a terminator gives
                # currentpos == endmatch: there is no variable here, but we
                # must keep scanning the rest of the string.
                if currentpos < endmatch:
                    variable = str1[currentpos:endmatch]
                currentpos = endmatch
            elif isinstance(endmarker, int):
                # setting endmarker to an int means it is a fixed-length
                # variable string (usually endmarker==1)
                endmatch = currentpos + endmarker
                if endmatch > len(str1):
                    break
                variable = str1[currentpos:endmatch]
                currentpos = endmatch
            else:
                endmatch = str1.find(endmarker, currentpos)
                if endmatch == -1:
                    break
                # search backwards in case there's an intervening startmarker
                # (if not it's OK)...
                start2 = str1.rfind(startmarker, currentpos, endmatch)
                if start2 != -1:
                    startmatch2 = start2
                    start2 += len(startmarker)
                    if start2 != currentpos:
                        currentpos = start2
                        startmatch = startmatch2
                variable = str1[currentpos:endmatch]
                currentpos = endmatch + len(endmarker)
            if variable is not None and variable not in ignorelist:
                # Accept empty variables and those made up only of
                # alphanumerics, "_" and ".".
                if not variable or variable.replace("_", "").replace(".", "").isalnum():
                    variables.append((startmatch, variable))
    return variables


def getaccelerators(accelmarker, acceptlist=None):
    """returns a function that gets a list of accelerators marked using
    accelmarker

    :param accelmarker: the marker that precedes an accelerator character
    :param acceptlist: passed on to :func:`findaccelerators`
    :return: a function taking a string and returning
        ``(accelerators, badaccelerators)`` as lists of characters
    """

    def getmarkedaccelerators(str1):
        """returns all the accelerators in str1 marked with a given marker"""
        acclocs, badlocs = findaccelerators(str1, accelmarker, acceptlist)
        accelerators = [accelerator for _, accelerator in acclocs]
        badaccelerators = [accelerator for _, accelerator in badlocs]
        return accelerators, badaccelerators

    return getmarkedaccelerators


def getvariables(startmarker, endmarker):
    """returns a function that gets a list of variables marked using
    startmarker and endmarker

    :param startmarker: passed on to :func:`findmarkedvariables`
    :param endmarker: passed on to :func:`findmarkedvariables`
    :return: a function taking a string and returning the list of variable
        names found in it (without their positions)
    """

    def getmarkedvariables(str1):
        """returns all the variables in str1 marked with a given marker"""
        varlocs = findmarkedvariables(str1, startmarker, endmarker)
        return [variable for _, variable in varlocs]

    return getmarkedvariables


def getnumbers(str1):
    """returns any numbers that are in the string

    A number is a run of digits that may contain periods between digits and
    degree signs (U+00B0).  Periods not followed by a digit are dropped.

    :param str1: the string to search (must be a ``str``)
    :return: a list of the number strings in order of appearance
    """
    # TODO: handle locale-based periods e.g. 2,5 for Afrikaans
    assert isinstance(str1, str)
    numbers = []
    innumber = False
    degreesign = "\xb0"
    lastnumber = ""
    # periods seen since the last digit; only committed to the number once
    # another digit follows
    carryperiod = ""
    for chr1 in str1:
        if chr1.isdigit():
            innumber = True
        elif innumber:
            if not (chr1 == "." or chr1 == degreesign):
                innumber = False
                if lastnumber:
                    numbers.append(lastnumber)
                lastnumber = ""
        if innumber:
            if chr1 == degreesign:
                lastnumber += chr1
            elif chr1 == ".":
                carryperiod += chr1
            else:
                lastnumber += carryperiod + chr1
                carryperiod = ""
        else:
            carryperiod = ""
    if innumber:
        if lastnumber:
            numbers.append(lastnumber)
    return numbers


_function_re = re.compile(
    r"""((?:
    [\w\.]+              # function or module name - any alpha-numeric character, _, or .
    (?:(?:::|->|\.)\w+)* # (optional) C++ style Class::Method() syntax or pointer->Method() or module.function()
    \(\)                 # Must close with ()
)+)
""",
    re.VERBOSE,
)  # shouldn't be locale aware
# Reference functions:
#   pam_*_item() IO::String NULL() POE::Component::Client::LDAP->new()
#   POE::Wheel::Null mechanize.UserAgent POSIX::sigaction()
#   window.resizeBy() @fptr()

# Compiled once at import time; the pattern text is what getemails() and
# geturls() have always used.
_email_re = re.compile(r"[\w\.\-]+@[\w\.\-]+")
_url_re = re.compile(
    r"https?:[\w/\.:;+\-~\%#\$?=&,()]+|"
    + r"www\.[\w/\.:;+\-~\%#\$?=&,()]+|"
    + r"ftp:[\w/\.:;+\-~\%#?=&,]+"
)


def getfunctions(str1):
    """returns the functions() that are in a string, while ignoring the
    trailing punctuation in the given parameter

    :param str1: the string to search
    :return: a list of the matches of the function pattern (names ending in
        ``()``), or an empty list
    """
    # Cheap substring test first: every match must contain "()".
    if "()" in str1:
        return _function_re.findall(str1)
    return []


def getemails(str1):
    """returns the email addresses that are in a string

    :param str1: the string to search
    :return: a list of all substrings that look like ``name@host``
    """
    return _email_re.findall(str1)


def geturls(str1):
    """returns the URIs in a string

    :param str1: the string to search
    :return: a list of all substrings starting with ``http:``, ``https:``,
        ``www.`` or ``ftp:`` followed by URL characters
    """
    return _url_re.findall(str1)


def countaccelerators(accelmarker, acceptlist=None):
    """returns a function that counts the number of accelerators marked with
    the given marker

    :param accelmarker: the marker that precedes an accelerator character
    :param acceptlist: passed on to :func:`findaccelerators`
    :return: a function taking a string and returning a tuple
        ``(number_of_valid_accelerators, number_of_invalid_accelerators)``
    """

    def countmarkedaccelerators(str1):
        """returns the number of valid and of invalid accelerators in str1
        marked with a given marker
        """
        acclocs, badlocs = findaccelerators(str1, accelmarker, acceptlist)
        return len(acclocs), len(badlocs)

    return countmarkedaccelerators