###
# Copyright (c) 2002-2005, Jeremiah Fincher
# Copyright (c) 2008-2009, James McCoy
# Copyright (c) 2010, Valentin Lorentz
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###

"""
Simple utility functions related to strings.
"""

import re
import sys
import time
import string
import textwrap

from . import minisix
from .iter import any
from .structures import TwoWayDictionary

from . import internationalization as _
internationalizeFunction = _.internationalizeFunction

try:
    from charade.universaldetector import UniversalDetector
    charadeLoaded = True
except ImportError:
    charadeLoaded = False

if minisix.PY3:
    def decode_raw_line(line):
        """Decodes a raw line (bytes) into text.

        Strict UTF-8 is tried first.  If that fails and charade is
        available, the encoding it guesses is tried.  As a last resort the
        line is decoded as UTF-8 with the offending bytes replaced.
        """
        try:
            return line.decode('utf8', 'strict')
        except UnicodeError:
            pass
        if charadeLoaded:
            u = UniversalDetector()
            u.feed(line)
            u.close()
            encoding = u.result['encoding']
            if encoding:
                try:
                    return line.decode(encoding, 'strict')
                except (UnicodeError, LookupError):
                    # LookupError: the detector guessed an encoding for
                    # which Python has no codec; treat it like a failed
                    # decode instead of letting it escape.
                    pass
        # Give up and replace the offending characters.
        return line.decode('utf8', 'replace')
else:
    def decode_raw_line(line):
        """Returns the line unchanged (no decoding is done on Python 2)."""
        return line

def rsplit(s, sep=None, maxsplit=-1):
    """Equivalent to str.split, except splitting from the right."""
    return s.rsplit(sep, maxsplit)

def normalizeWhitespace(s, removeNewline=True):
    r"""Normalizes the whitespace in a string; \s+ becomes one space.

    Runs of tabs and spaces are always collapsed; runs of newlines are
    collapsed only if removeNewline is true.  One leading and one trailing
    space are kept if s started/ended with whitespace.
    """
    if not s:
        return str(s) # not the same reference
    starts_with_space = (s[0] in ' \n\t\r')
    ends_with_space = (s[-1] in ' \n\t\r')
    if removeNewline:
        newline_re = re.compile('[\r\n]+')
        s = ' '.join(filter(bool, newline_re.split(s)))
    s = ' '.join(filter(bool, s.split('\t')))
    s = ' '.join(filter(bool, s.split(' ')))
    if starts_with_space:
        s = ' ' + s
    if ends_with_space:
        s += ' '
    return s

def distance(s, t):
    """Returns the levenshtein edit distance between two strings."""
    n = len(s)
    m = len(t)
    if n == 0:
        return m
    elif m == 0:
        return n
    # d[i][j] is the distance between s[:i] and t[:j].
    d = [[0] * (m+1) for _ in range(n+1)]
    d[0] = list(range(m+1))
    for i in range(n+1):
        d[i][0] = i
    for i in range(1, n+1):
        cs = s[i-1]
        for j in range(1, m+1):
            ct = t[j-1]
            cost = int(cs != ct)
            d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost)
    return d[n][m]

class MultipleReplacer:
    """Return a callable that replaces all dict keys by the associated
    value. More efficient than multiple .replace()."""

    # We use an object instead of a lambda function because it avoids the
    # need for using the staticmethod() on the lambda function if assigning
    # it to a class in Python 3.
    def __init__(self, dict_):
        self._dict = dict_
        # Keys are escaped so that they are matched literally.
        self._matcher = re.compile('|'.join(re.escape(key) for key in dict_))
    def __call__(self, s):
        if not self._dict:
            # An empty pattern matches the empty string everywhere, and
            # looking '' up in the mapping would raise KeyError.
            return s
        return self._matcher.sub(lambda m: self._dict[m.group(0)], s)
def multipleReplacer(dict_):
    """Returns a MultipleReplacer built from dict_."""
    return MultipleReplacer(dict_)

class MultipleRemover:
    """Return a callable that removes all words in the list. A bit more
    efficient than multipleReplacer"""
    # See comment of MultipleReplacer
    def __init__(self, list_):
        list_ = [re.escape(x) for x in list_]
        self._matcher = re.compile('|'.join(list_))
    def __call__(self, s):
        return self._matcher.sub(lambda m: '', s)

_soundextrans = MultipleReplacer(dict(list(zip(string.ascii_uppercase,
                                 '01230120022455012623010202'))))
def soundex(s, length=4):
    """Returns the soundex hash of a given string.

    length=0 doesn't truncate the hash.

    Raises ValueError if s contains no ASCII letter.
    """
    original = s
    s = s.upper() # Make everything uppercase.
    s = ''.join([x for x in s if x in string.ascii_uppercase])
    if not s:
        raise ValueError('Invalid string for soundex: %s' % original)
    firstChar = s[0] # Save the first character.
    s = _soundextrans(s) # Convert to soundex numbers.
    s = s.lstrip(s[0]) # Remove all repeated first characters.
    L = [firstChar]
    for c in s:
        if c != L[-1]:
            L.append(c)
    L = [c for c in L if c != '0']
    s = ''.join(L)
    if length:
        s = s.ljust(length, '0')[:length]
    return s

def dqrepr(s):
    """Returns a repr() of s guaranteed to be in double quotes."""
    # The wankers-that-be decided not to use double-quotes anymore in 2.3.
    # return '"' + repr("'\x00" + s)[6:]
    encoding = 'string_escape' if minisix.PY2 else 'unicode_escape'
    if minisix.PY2 and isinstance(s, unicode):
        s = s.encode('utf8', 'replace')
    return '"%s"' % s.encode(encoding).decode().replace('"', '\\"')

def quoted(s):
    """Returns a quoted s."""
    return '"%s"' % s

_openers = '{[(<'
_closers = '}])>'
def _getSep(s, allowBraces=False):
    """Returns the separator character used by the Perl-style expression s.

    The separator is the second character if s starts with 'm' or 's', the
    first one otherwise.  Raises ValueError if s is shorter than two
    characters, or if the separator is alphanumeric or a brace (only closing
    braces are rejected if allowBraces is true).
    """
    if len(s) < 2:
        raise ValueError('string given to _getSep is too short: %r' % s)
    if allowBraces:
        braces = _closers
    else:
        braces = _openers + _closers
    if s.startswith('m') or s.startswith('s'):
        separator = s[1]
    else:
        separator = s[0]
    if separator.isalnum() or separator in braces:
        raise ValueError('Invalid separator: separator must not be alphanumeric or in ' \
              '"%s"' % braces)
    return separator

def perlReToPythonRe(s, allowG=False):
    """Converts a string representation of a Perl regular expression (i.e.,
    m/^foo$/i or /foo|bar/) to a Python regular expression.

    Returns the compiled regexp, or a (regexp, g) tuple if allowG is true,
    where g tells whether the 'g' flag was given.  Raises ValueError if s is
    malformed, uses an unknown flag, or is not a valid regexp.
    """
    opener = closer = _getSep(s, True)
    if opener in '{[(<':
        closer = _closers[_openers.index(opener)]
    opener = re.escape(opener)
    closer = re.escape(closer)
    matcher = re.compile(r'm?%s((?:\\.|[^\\])*)%s(.*)' % (opener, closer))
    try:
        (regexp, flags) = matcher.match(s).groups()
    except AttributeError: # match() returned None: s has the wrong form.
        raise ValueError('Must be of the form m/.../ or /.../')
    regexp = regexp.replace('\\'+opener, opener)
    if opener != closer:
        regexp = regexp.replace('\\'+closer, closer)
    flag = 0
    g = False
    try:
        for c in flags.upper():
            if c == 'G' and allowG:
                g = True
                continue
            # Flags are looked up by name in the re module (re.I, re.S...).
            flag |= getattr(re, c)
    except AttributeError:
        raise ValueError('Invalid flag: %s' % c)
    try:
        r = re.compile(regexp, flag)
    except re.error as e:
        raise ValueError(str(e))
    if allowG:
        return (r, g)
    else:
        return r

def perlReToFindall(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    m/^foo$/i or /foo|bar/) to a Python regular expression, with support for
    G flag

    The returned function takes a string and returns the list of all
    matches if the 'g' flag was given; otherwise it returns the text of the
    first match, or '' if there is none.
    """
    (r, g) = perlReToPythonRe(s, allowG=True)
    if g:
        return lambda s: r.findall(s)
    def first(s):
        # Search only once instead of once for the test and once more for
        # the result.
        m = r.search(s)
        if m is None:
            return ''
        return m.group(0)
    return first

def perlReToReplacer(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    s/foo/bar/g or s/foo/bar/i) to a Python function doing the equivalent
    replacement.

    Without the 'g' flag, only the first occurrence is replaced.  Raises
    ValueError if s is malformed.
    """
    sep = _getSep(s)
    escaped = re.escape(sep)
    matcher = re.compile(r's%s((?:\\.|[^\\])*)%s((?:\\.|[^\\])*)%s(.*)'
                         % (escaped, escaped, escaped))
    try:
        (regexp, replace, flags) = matcher.match(s).groups()
    except AttributeError: # match() returned None: s has the wrong form.
        raise ValueError('Must be of the form s/.../.../')
    # A literal backspace is what '\b' becomes if s went through Python
    # string escaping; turn it back into the regexp word boundary.
    regexp = regexp.replace('\x08', r'\b')
    replace = replace.replace('\\'+sep, sep)
    # Likewise, control characters 0-9 are turned back into \0-\9
    # backreferences.
    for i in range(10):
        replace = replace.replace(chr(i), r'\%s' % i)
    g = 'g' in flags
    # 'g' is handled here; the other flags go to perlReToPythonRe.
    flags = flags.replace('g', '')
    r = perlReToPythonRe(sep.join(('', regexp, flags)))
    if g:
        return lambda s: r.sub(replace, s)
    else:
        return lambda s: r.sub(replace, s, 1)

_perlVarSubstituteRe = re.compile(r'\$\{([^}]+)\}|\$([a-zA-Z][a-zA-Z0-9]*)')
def perlVariableSubstitute(vars, text):
    """Replaces $name and ${name} in text with the matching value of vars.

    A callable value is called and its result used as is; any other value
    is converted with str().  References to names missing from vars are
    left untouched.
    """
    def replacer(m):
        (braced, unbraced) = m.groups()
        var = braced or unbraced
        try:
            x = vars[var]
            if callable(x):
                return x()
            else:
                try:
                    return str(x)
                except UnicodeEncodeError: # Python 2
                    return str(x).encode('utf8')
        except KeyError:
            if braced:
                return '${%s}' % braced
            else:
                return '$' + unbraced
    return _perlVarSubstituteRe.sub(replacer, text)

def splitBytes(word, size):
    """Splits word in two parts, the first one being at most size long.

    The split point is moved back by up to three positions until the second
    part can be decoded (Python 3) or encoded (Python 2) without error, so
    that a multi-byte character is not cut in the middle.
    """
    # I'm going to hell for this function
    for i in range(4): # a character takes at most 4 bytes in UTF-8
        try:
            if sys.version_info[0] >= 3:
                word[size-i:].decode()
            else:
                word[size-i:].encode('utf8')
        except UnicodeDecodeError:
            continue
        else:
            return (word[0:size-i], word[size-i:])
    assert False, (word, size)

def byteTextWrap(text, size, break_on_hyphens=False):
    """Similar to textwrap.wrap(), but considers the size of strings (in bytes)
    instead of their length (in characters)."""
    # NOTE(review): break_on_hyphens is accepted but never used below.
    try:
        words = textwrap.TextWrapper()._split_chunks(text)
    except AttributeError: # Python 2
        words = textwrap.TextWrapper()._split(text)
    words.reverse() # use it as a stack
    if sys.version_info[0] >= 3:
        words = [w.encode() for w in words]
    lines = [b'']
    while words:
        word = words.pop()
        if len(word) > size:
            # Too long for a single line: keep what fits and push the
            # rest back on the stack.
            (before, after) = splitBytes(word, size)
            words.append(after)
            word = before
        if len(lines[-1]) + len(word) <= size:
            lines[-1] += word
        else:
            lines.append(word)
    if sys.version_info[0] >= 3:
        return [l.decode() for l in lines]
    else:
        return lines

def commaAndify(seq, comma=',', And=None):
    """Given a a sequence, returns an English clause for that sequence.

    I.e., given [1, 2, 3], returns '1, 2, and 3'
    """
    if And is None:
        # NOTE(review): in this file `_` is bound to the internationalization
        # module; presumably it is rebound to a translation callable
        # elsewhere before this runs -- confirm.
        And = _('and')
    L = list(seq)
    if len(L) == 0:
        return ''
    elif len(L) == 1:
        return ''.join(L) # We need this because it raises TypeError.
    elif len(L) == 2:
        L.insert(1, And)
        return ' '.join(L)
    else:
        L[-1] = '%s %s' % (And, L[-1])
        sep = '%s ' % comma
        return sep.join(L)

_unCommaTheRe = re.compile(r'(.*),\s*(the)$', re.I)
def unCommaThe(s):
    """Takes a string of the form 'foo, the' and turns it into 'the foo'."""
    m = _unCommaTheRe.match(s)
    if m is not None:
        return '%s %s' % (m.group(2), m.group(1))
    else:
        return s

def ellipsisify(s, n):
    """Returns a shortened version of s.  Produces up to the first n chars at
    the nearest word boundary.
    """
    if len(s) <= n:
        return s
    else:
        return (textwrap.wrap(s, n-3)[0] + '...')

plurals = TwoWayDictionary({})
def matchCase(s1, s2):
    """Matches the case of s1 in s2"""
    if s1.isupper():
        return s2.upper()
    else:
        L = list(s2)
        for (i, char) in enumerate(s1[:len(s2)]):
            if char.isupper():
                L[i] = L[i].upper()
        return ''.join(L)

@internationalizeFunction('pluralize')
def pluralize(s):
    """Returns the plural of s.  Put any exceptions to the general English
    rule of appending 's' in the plurals dictionary.
    """
    consonants = 'bcdfghjklmnpqrstvwxz'
    _pluralizeRegex = re.compile('[%s]y$' % consonants)
    lowered = s.lower()
    # Exception dictionary
    if lowered in plurals:
        return matchCase(s, plurals[lowered])
    # Words ending with 'x', 'ch', 'sh' or 'ss' such as 'box(es)',
    # 'punch(es)', 'fish(es)' and 'miss(es)'
    elif lowered.endswith(('x', 'ch', 'sh', 'ss')):
        return matchCase(s, s+'es')
    # Words ending with a consonant followed by a 'y' such as
    # 'try (tries)' or 'spy (spies)'
    elif _pluralizeRegex.search(lowered):
        return matchCase(s, s[:-1] + 'ies')
    # In all other cases, we simply add an 's' to the base word
    else:
        return matchCase(s, s+'s')

@internationalizeFunction('depluralize')
def depluralize(s):
    """Returns the singular of s."""
    consonants = 'bcdfghjklmnpqrstvwxz'
    # Anchored at the end: only a trailing consonant + 'ies' is a plural
    # ending; the same sequence inside a word is not.
    _depluralizeRegex = re.compile('[%s]ies$' % consonants)
    lowered = s.lower()
    if lowered in plurals:
        return matchCase(s, plurals[lowered])
    elif lowered.endswith(('ches', 'shes', 'sses')):
        return s[:-2]
    elif _depluralizeRegex.search(lowered):
        return s[:-3] + 'y'
    else:
        if lowered.endswith('s'):
            return s[:-1] # Chop off 's'.
        else:
            return s # Don't know what to do.

def nItems(n, item, between=None):
    """Works like this:

    >>> nItems(4, '<empty>')
    '4'

    >>> nItems(1, 'clock')
    '1 clock'

    >>> nItems(10, 'clock')
    '10 clocks'

    >>> nItems(4, '<empty>', between='grandfather')
    '4 grandfather'

    >>> nItems(10, 'clock', between='grandfather')
    '10 grandfather clocks'
    """
    assert isinstance(n, minisix.integer_types), \
           'The order of the arguments to nItems changed again, sorry.'
    if item == '<empty>':
        if between is None:
            return format('%s', n)
        else:
            # The placeholder item is never displayed, only `between` is.
            return format('%s %s', n, between)
    if between is None:
        if n != 1:
            return format('%s %p', n, item)
        else:
            return format('%s %s', n, item)
    else:
        if n != 1:
            return format('%s %s %p', n, between, item)
        else:
            return format('%s %s %s', n, between, item)

@internationalizeFunction('ordinal')
def ordinal(i):
    """Returns i + the ordinal indicator for the number.

    Example: ordinal(3) => '3rd'
    """
    i = int(i)
    # 11, 12 and 13 (and 111, 212...) take 'th' despite their last digit.
    if i % 100 in (11,12,13):
        return '%sth' % i
    suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(i % 10, 'th')
    return '%s%s' % (i, suffix)

@internationalizeFunction('be')
def be(i):
    """Returns the form of the verb 'to be' based on the number i."""
    if i == 1:
        return 'is'
    else:
        return 'are'

@internationalizeFunction('has')
def has(i):
    """Returns the form of the verb 'to have' based on the number i."""
    if i == 1:
        return 'has'
    else:
        return 'have'

def toBool(s):
    """Converts a string such as 'true', 'on', 'disabled' or '0' to a bool.

    Surrounding whitespace and case are ignored.  Raises ValueError if the
    string is not recognized.
    """
    s = s.strip().lower()
    if s in ('true', 'on', 'enable', 'enabled', '1'):
        return True
    elif s in ('false', 'off', 'disable', 'disabled', '0'):
        return False
    else:
        raise ValueError('Invalid string for toBool: %s' % quoted(s))

# When used with Supybot, this is overriden when supybot.conf is loaded
def timestamp(t):
    """Returns time.ctime(t), using the current time if t is None."""
    if t is None:
        t = time.time()
    return time.ctime(t)
def url(url):
    """Returns url unchanged."""
    return url

_formatRe = re.compile(r'%((?:\d+)?\.\d+f|[bfhiLnpqrsStTuv%])')
def format(s, *args, **kwargs):
    """Formats s, replacing each %-directive with the next positional
    argument, rendered according to the directive:

    %: literal %.
    i: integer
    s: string
    f: float
    r: repr
    b: form of the verb 'to be' (takes an int)
    h: form of the verb 'to have' (takes an int)
    L: commaAndify (takes a list of strings or a tuple of ([strings], and))
    p: pluralize (takes a string)
    q: quoted (takes a string)
    n: nItems (takes a 2-tuple of (n, item) or a 3-tuple of (n, between, item))
    S: returns a human-readable size (takes an int)
    t: time, formatted (takes an int)
    T: time delta, formatted (takes an int)
    u: url, wrapped in braces (this should be configurable at some point)
    v: void : takes one or many arguments, but doesn't display it
       (useful for translation)

    Raises ValueError if an argument does not fit its directive, or if
    there are more directives than arguments.
    """
    # Note to developers: If you want to add an argument type, do not forget
    # to add the character to the _formatRe regexp or it will be ignored
    # (and hard to debug if you don't know the trick).
    # Of course, you should also document it in the docstring above.
    if minisix.PY2:
        def pred(s):
            if isinstance(s, unicode):
                return s.encode('utf8')
            else:
                return s
        args = map(pred, args)
    args = list(args)
    args.reverse() # For more efficient popping.
    # In the error messages below, the value is wrapped in a 1-tuple so that
    # a tuple value is displayed instead of being unpacked by the % operator
    # (which would raise TypeError rather than the intended ValueError).
    def sub(match):
        char = match.group(1)
        if char == 's':
            token = args.pop()
            if isinstance(token, str):
                return token
            elif minisix.PY2 and isinstance(token, unicode):
                return token.encode('utf8', 'replace')
            else:
                return str(token)
        elif char == 'i':
            # XXX Improve me!
            return str(args.pop())
        elif char.endswith('f'):
            return ('%'+char) % args.pop()
        elif char == 'b':
            return be(args.pop())
        elif char == 'h':
            return has(args.pop())
        elif char == 'L':
            t = args.pop()
            if isinstance(t, tuple) and len(t) == 2:
                if not isinstance(t[0], list):
                    raise ValueError('Invalid list for %%L in format: %s'
                                     % (t,))
                if not isinstance(t[1], minisix.string_types):
                    raise ValueError('Invalid string for %%L in format: %s'
                                     % (t,))
                return commaAndify(t[0], And=t[1])
            elif hasattr(t, '__iter__'):
                return commaAndify(t)
            else:
                raise ValueError('Invalid value for %%L in format: %s'
                                 % (t,))
        elif char == 'p':
            return pluralize(args.pop())
        elif char == 'q':
            return quoted(args.pop())
        elif char == 'r':
            return repr(args.pop())
        elif char == 'n':
            t = args.pop()
            if not isinstance(t, (tuple, list)):
                raise ValueError('Invalid value for %%n in format: %s'
                                 % (t,))
            if len(t) == 2:
                return nItems(*t)
            elif len(t) == 3:
                return nItems(t[0], t[2], between=t[1])
            else:
                raise ValueError('Invalid value for %%n in format: %s'
                                 % (t,))
        elif char == 'S':
            t = args.pop()
            if not isinstance(t, minisix.integer_types):
                raise ValueError('Invalid value for %%S in format: %s'
                                 % (t,))
            suffixes = ['B','KB','MB','GB','TB']
            for suffix in suffixes[:-1]:
                if t < 1024:
                    return "%i%s" % (t, suffix)
                # Integer division: same displayed value as truncating a
                # float, without the float conversion.
                t //= 1024
            # Anything too big for the other units is given in the largest
            # one instead of falling off the loop and returning None.
            return "%i%s" % (t, suffixes[-1])
        elif char == 't':
            return timestamp(args.pop())
        elif char == 'T':
            from .gen import timeElapsed
            return timeElapsed(args.pop())
        elif char == 'u':
            return url(args.pop())
        elif char == 'v':
            args.pop()
            return ''
        elif char == '%':
            return '%'
        else:
            raise ValueError('Invalid char in sub (in format).')
    try:
        return _formatRe.sub(sub, s)
    except IndexError:
        # args.pop() on an empty list: more directives than arguments.
        raise ValueError('Extra format chars in format spec: %r' % s)

# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: