1###
2# Copyright (c) 2002-2005, Jeremiah Fincher
3# Copyright (c) 2008-2009, James McCoy
4# Copyright (c) 2010, Valentin Lorentz
5# All rights reserved.
6#
7# Redistribution and use in source and binary forms, with or without
8# modification, are permitted provided that the following conditions are met:
9#
10#   * Redistributions of source code must retain the above copyright notice,
11#     this list of conditions, and the following disclaimer.
12#   * Redistributions in binary form must reproduce the above copyright notice,
13#     this list of conditions, and the following disclaimer in the
14#     documentation and/or other materials provided with the distribution.
15#   * Neither the name of the author of this software nor the name of
16#     contributors to this software may be used to endorse or promote products
17#     derived from this software without specific prior written consent.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29# POSSIBILITY OF SUCH DAMAGE.
30###
31
32"""
33Simple utility functions related to strings.
34"""
35
36import re
37import sys
38import time
39import string
40import textwrap
41
42from . import minisix
43from .iter import any
44from .structures import TwoWayDictionary
45
46from . import internationalization as _
47internationalizeFunction = _.internationalizeFunction
48
# charade is an optional dependency: charadeLoaded records whether it could
# be imported, so that decode_raw_line() knows if encoding detection is
# available.
try:
    from charade.universaldetector import UniversalDetector
    charadeLoaded = True
except ImportError:
    charadeLoaded = False
54
if minisix.PY3:
    def decode_raw_line(line):
        """Decodes a raw line (bytes) into text.

        Strict UTF-8 is tried first.  If that fails and charade is loaded,
        the encoding guessed by charade is tried next.  If everything else
        fails, the line is decoded as UTF-8 and the offending bytes are
        replaced.
        """
        # first, try to decode using utf-8
        try:
            return line.decode('utf8', 'strict')
        except UnicodeError:
            pass
        # if this fails and charade is loaded, try to guess the correct
        # encoding
        if charadeLoaded:
            detector = UniversalDetector()
            detector.feed(line)
            detector.close()
            guessed = detector.result['encoding']
            if guessed:
                try:
                    return line.decode(guessed, 'strict')
                except (UnicodeError, LookupError):
                    # LookupError is raised by decode() when the guessed
                    # name is not a codec known to Python; treat it like
                    # any other decoding failure instead of crashing.
                    pass
        # give up: fall back to utf-8 and replace the offending characters
        return line.decode('utf8', 'replace')
else:
    def decode_raw_line(line):
        """Returns the raw line unchanged (nothing is decoded on Python 2)."""
        return line
86
def rsplit(s, sep=None, maxsplit=-1):
    """Splits s starting from the right; thin wrapper around s.rsplit().

    sep and maxsplit are passed to s.rsplit() unchanged.
    """
    pieces = s.rsplit(sep, maxsplit)
    return pieces
90
def normalizeWhitespace(s, removeNewline=True):
    r"""Normalizes the whitespace in a string; \s+ becomes one space.

    Runs of spaces and tabs are collapsed into a single space.  Runs of
    newlines/carriage returns are collapsed too, unless removeNewline is
    false.  A single leading and/or trailing space is kept if s started
    and/or ended with whitespace.
    """
    if not s:
        return str(s) # not the same reference
    blanks = ' \n\t\r'
    # Remember the edges now: the splits below drop them.
    prefix = ' ' if s[0] in blanks else ''
    suffix = ' ' if s[-1] in blanks else ''
    if removeNewline:
        s = ' '.join(piece for piece in re.split('[\r\n]+', s) if piece)
    for separator in ('\t', ' '):
        s = ' '.join(piece for piece in s.split(separator) if piece)
    return prefix + s + suffix
107
def distance(s, t):
    """Returns the levenshtein edit distance between two strings.

    Only two rows of the dynamic programming table are kept at a time:
    'previous' holds the distances for the prefix of s handled so far,
    'current' the ones being computed for the next character of s.
    """
    rows, cols = len(s), len(t)
    if rows == 0:
        return cols
    if cols == 0:
        return rows
    previous = list(range(cols + 1))
    for (i, cs) in enumerate(s, 1):
        current = [i]
        for (j, ct) in enumerate(t, 1):
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            substitution = previous[j - 1] + int(cs != ct)
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[cols]
130
class MultipleReplacer:
    """Return a callable that replaces all dict keys by the associated
    value. More efficient than multiple .replace().

    Keys are matched literally (they are escaped before being compiled
    into a single regexp).  An empty dict gives a callable that returns
    its argument unchanged.
    """

    # We use an object instead of a lambda function because it avoids the
    # need for using the staticmethod() on the lambda function if assigning
    # it to a class in Python 3.
    def __init__(self, dict_):
        self._dict = dict_
        if dict_:
            pattern = '|'.join(re.escape(key) for key in dict_)
        else:
            # An empty alternation would match the empty string everywhere
            # and make __call__ fail with KeyError(''); use a pattern that
            # never matches instead.
            pattern = '(?!)'
        self._matcher = re.compile(pattern)

    def __call__(self, s):
        """Returns s with every occurrence of a key replaced by its value."""
        return self._matcher.sub(lambda m: self._dict[m.group(0)], s)
def multipleReplacer(dict_):
    """Returns a MultipleReplacer built from dict_ (function-style alias)."""
    replacer = MultipleReplacer(dict_)
    return replacer
146
class MultipleRemover:
    """Return a callable that removes all words in the list. A bit more
    efficient than multipleReplacer.

    Words are matched literally (they are escaped before being compiled
    into a single regexp).
    """
    # Implemented as a class rather than a lambda; see the comment in
    # MultipleReplacer.
    def __init__(self, list_):
        alternatives = '|'.join(map(re.escape, list_))
        self._matcher = re.compile(alternatives)

    def __call__(self, s):
        """Returns s with every occurrence of the words removed."""
        return self._matcher.sub('', s)
156
# Used by soundex(): maps each uppercase ASCII letter to its soundex digit.
_soundextrans = MultipleReplacer(dict(zip(string.ascii_uppercase,
                                          '01230120022455012623010202')))
def soundex(s, length=4):
    """Returns the soundex hash of a given string.

    Only ASCII letters of s are taken into account (case-insensitively).
    The hash is padded with '0' or truncated to 'length' characters;
    length=0 doesn't truncate (nor pad) the hash.

    Raises ValueError if s does not contain any ASCII letter.
    """
    letters = s.upper() # Make everything uppercase.
    letters = ''.join([x for x in letters if x in string.ascii_uppercase])
    if not letters:
        # Report the string we were given: the filtered one is empty.
        raise ValueError('Invalid string for soundex: %s' % s)
    firstChar = letters[0] # Save the first character.
    digits = _soundextrans(letters) # Convert to soundex numbers.
    digits = digits.lstrip(digits[0]) # Remove all repeated first characters.
    L = [firstChar]
    for c in digits:
        # Consecutive identical digits are only kept once.
        if c != L[-1]:
            L.append(c)
    # '0' digits are dropped from the hash.
    L = [c for c in L if c != '0']
    result = ''.join(L)
    if length:
        result = result.ljust(length, '0')[:length]
    return result
180
def dqrepr(s):
    """Returns a repr() of s guaranteed to be in double quotes."""
    # repr() may pick single quotes, so the escaping is done through the
    # escape codec of the running Python version and the double quotes are
    # added (and escaped) by hand.
    if minisix.PY2:
        encoding = 'string_escape'
        if isinstance(s, unicode):
            s = s.encode('utf8', 'replace')
    else:
        encoding = 'unicode_escape'
    escaped = s.encode(encoding).decode()
    return '"%s"' % escaped.replace('"', '\\"')
189
def quoted(s):
    """Returns s wrapped in double quotes (nothing in s is escaped)."""
    template = '"%s"'
    return template % s
193
194_openers = '{[(<'
195_closers = '}])>'
196def _getSep(s, allowBraces=False):
197    if len(s) < 2:
198        raise ValueError('string given to _getSep is too short: %r' % s)
199    if allowBraces:
200        braces = _closers
201    else:
202        braces = _openers + _closers
203    if s.startswith('m') or s.startswith('s'):
204        separator = s[1]
205    else:
206        separator = s[0]
207    if separator.isalnum() or separator in braces:
208        raise ValueError('Invalid separator: separator must not be alphanumeric or in ' \
209              '"%s"' % braces)
210    return separator
211
def perlReToPythonRe(s, allowG=False):
    """Converts a string representation of a Perl regular expression (i.e.,
    m/^foo$/i or /foo|bar/) to a Python regular expression.

    When the separator is an opening brace ({ [ ( <), the expression ends
    at the matching closing brace.  The characters following the
    expression are looked up (uppercased) as flags of the re module.

    If allowG is true, the 'g' flag is accepted and a tuple
    (regexp, g) is returned, g being True if that flag was given.
    Otherwise only the compiled regexp is returned.

    Raises ValueError if s is malformed, if a flag is unknown or if the
    regexp does not compile.
    """
    opener = closer = _getSep(s, True)
    if opener in '{[(<':
        closer = _closers[_openers.index(opener)]
    opener = re.escape(opener)
    closer = re.escape(closer)
    pattern = r'm?%s((?:\\.|[^\\])*)%s(.*)' % (opener, closer)
    try:
        (regexp, flags) = re.match(pattern, s).groups()
    except AttributeError: # No match: re.match() returned None.
        raise ValueError('Must be of the form m/.../ or /.../')
    delimiters = [opener] if opener == closer else [opener, closer]
    for delimiter in delimiters:
        regexp = regexp.replace('\\' + delimiter, delimiter)
    flag = 0
    g = False
    for c in flags.upper():
        if allowG and c == 'G':
            g = True
            continue
        try:
            flag |= getattr(re, c)
        except AttributeError:
            raise ValueError('Invalid flag: %s' % c)
    try:
        compiled = re.compile(regexp, flag)
    except re.error as e:
        raise ValueError(str(e))
    return (compiled, g) if allowG else compiled
247
def perlReToFindall(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    m/^foo$/i or /foo|bar/) to a Python function applying it, with support
    for the G flag.

    With the G flag, the returned function gives the result of findall()
    on its argument.  Without it, the function returns the text of the
    first match, or '' if there is none.
    """
    (r, g) = perlReToPythonRe(s, allowG=True)
    if g:
        return lambda s: r.findall(s)

    def firstMatch(s):
        # Search only once and keep the match object.
        m = r.search(s)
        if m is None:
            return ''
        return m.group(0)
    return firstMatch
258
def perlReToReplacer(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    s/foo/bar/g or s/foo/bar/i) to a Python function doing the equivalent
    replacement.

    With the 'g' flag every occurrence is replaced, otherwise only the
    first one.  The other flags are handled by perlReToPythonRe().

    Raises ValueError if s is not of the form s/.../.../.
    """
    sep = _getSep(s)
    escaped = re.escape(sep)
    pattern = (r's%s((?:\\.|[^\\])*)%s((?:\\.|[^\\])*)%s(.*)'
               % (escaped, escaped, escaped))
    try:
        (regexp, replace, flags) = re.match(pattern, s).groups()
    except AttributeError: # No match: re.match() returned None.
        raise ValueError('Must be of the form s/.../.../')
    # A literal backspace character is turned back into the \b escape
    # (presumably it was written as \b and already interpreted).
    regexp = regexp.replace('\x08', r'\b')
    replace = replace.replace('\\' + sep, sep)
    # Likewise, control characters 0 to 9 become the group references
    # \0 to \9.
    for number in range(10):
        replace = replace.replace(chr(number), '\\%d' % number)
    # 'g' is not a flag of the re module: note it and strip it.
    g = 'g' in flags
    flags = flags.replace('g', '')
    r = perlReToPythonRe(sep.join(('', regexp, flags)))
    count = 0 if g else 1 # 0 means "replace all occurrences"
    return lambda s: r.sub(replace, s, count)
287
_perlVarSubstituteRe = re.compile(r'\$\{([^}]+)\}|\$([a-zA-Z][a-zA-Z0-9]*)')
def perlVariableSubstitute(vars, text):
    """Substitutes Perl-style variables ($name or ${name}) in text.

    vars maps variable names to their value.  Callable values are called
    without argument and their result is used; other values are converted
    with str().  References to names which are not in vars are left
    untouched.
    """
    def replacer(m):
        (braced, unbraced) = m.groups()
        var = braced or unbraced
        try:
            x = vars[var]
            if callable(x):
                return x()
            else:
                try:
                    return str(x)
                except UnicodeEncodeError: # Python 2
                    # str() failed on a non-ASCII unicode object; calling
                    # it again would raise the same error, so encode the
                    # value itself.
                    return x.encode('utf8')
        except KeyError:
            # Unknown variable: give back the reference as it was written.
            if braced:
                return '${%s}' % braced
            else:
                return '$' + unbraced
    return _perlVarSubstituteRe.sub(replacer, text)
308
def splitBytes(word, size):
    """Splits the byte string word in two, without cutting a character.

    The split point is 'size' bytes from the start, moved back by up to 3
    bytes until the second part is valid UTF-8 on its own.  Returns a
    (before, after) tuple.
    """
    for back in range(4): # a character takes at most 4 bytes in UTF-8
        cut = size - back
        try:
            if sys.version_info[0] >= 3:
                word[cut:].decode()
            else:
                word[cut:].encode('utf8')
        except UnicodeDecodeError:
            continue
        else:
            return (word[0:cut], word[cut:])
    # Raised explicitly (rather than with an assert statement) so that it
    # is not stripped when running with -O.
    raise AssertionError((word, size))

def byteTextWrap(text, size, break_on_hyphens=False):
    """Similar to textwrap.wrap(), but considers the size of strings (in bytes)
    instead of their length (in characters).

    Returns a list of lines of at most 'size' bytes each (once encoded).
    break_on_hyphens is accepted but currently not used.

    Raises ValueError if size is too small to hold a single character of
    a chunk which has to be split.
    """
    try:
        words = textwrap.TextWrapper()._split_chunks(text)
    except AttributeError: # Python 2
        words = textwrap.TextWrapper()._split(text)
    words.reverse() # use it as a stack
    if sys.version_info[0] >= 3:
        words = [w.encode() for w in words]
    lines = [b'']
    while words:
        word = words.pop(-1)
        if len(word) > size:
            (before, after) = splitBytes(word, size)
            if not before:
                # Nothing fits: pushing 'after' (the whole word) back on
                # the stack would loop forever.
                raise ValueError('size %r is too small to wrap %r'
                                 % (size, word))
            words.append(after)
            word = before
        if len(lines[-1]) + len(word) <= size:
            lines[-1] += word
        else:
            lines.append(word)
    if sys.version_info[0] >= 3:
        return [l.decode() for l in lines]
    else:
        return lines
348
def commaAndify(seq, comma=',', And=None):
    """Given a sequence of strings, returns an English clause for it.

    I.e., given ['1', '2', '3'], returns '1, 2, and 3'.  Two items are
    joined with And only ('1 and 2').  comma is the separator to use
    between items; And defaults to the translation of 'and'.
    """
    if And is None:
        And = _('and')
    items = list(seq)
    count = len(items)
    if count > 2:
        items[-1] = '%s %s' % (And, items[-1])
        return ('%s ' % comma).join(items)
    if count == 2:
        return ' '.join([items[0], And, items[1]])
    # Zero or one item.  join() is used because it raises TypeError if the
    # item is not a string.
    return ''.join(items)
368
_unCommaTheRe = re.compile(r'(.*),\s*(the)$', re.I)
def unCommaThe(s):
    """Takes a string of the form 'foo, the' and turns it into 'the foo'.

    The case of 'the' is kept.  Other strings are returned unchanged.
    """
    m = _unCommaTheRe.match(s)
    if m is None:
        return s
    (rest, article) = m.groups()
    return '%s %s' % (article, rest)
377
def ellipsisify(s, n):
    """Returns a shortened version of s.  Produces up to the first n chars at
    the nearest word boundary.

    Strings of at most n characters are returned as is.  Longer ones are
    cut to the first line textwrap.wrap() gives for a width of n-3, and
    '...' is appended.
    """
    if len(s) <= n:
        return s
    firstLine = textwrap.wrap(s, n-3)[0]
    return firstLine + '...'
386
# Exceptions to the general rules used by pluralize() and depluralize();
# both look words up in it in lowercase.  Empty by default.
plurals = TwoWayDictionary({})
def matchCase(s1, s2):
    """Matches the case of s1 in s2.

    If s1 is all uppercase, s2 is returned uppercased.  Otherwise, each
    character of s2 is uppercased if the character of s1 at the same
    position is uppercase; characters beyond the end of s1 are kept as
    they are.
    """
    if s1.isupper():
        return s2.upper()
    cased = [c2.upper() if c1.isupper() else c2
             for (c1, c2) in zip(s1, s2)]
    # zip() stops at the shortest string: add what is left of s2.
    return ''.join(cased) + s2[len(cased):]
398
@internationalizeFunction('pluralize')
def pluralize(s):
    """Returns the plural of s.  Put any exceptions to the general English
    rule of appending 's' in the plurals dictionary.
    """
    lowered = s.lower()
    if lowered in plurals:
        # Exception dictionary
        plural = plurals[lowered]
    elif any(lowered.endswith, ['x', 'ch', 'sh', 'ss']):
        # Words ending with 'x', 'ch', 'sh' or 'ss' such as 'box(es)',
        # 'punch(es)', 'fish(es)' and 'miss(es)'
        plural = s + 'es'
    elif re.search('[%s]y$' % 'bcdfghjklmnpqrstvwxz', lowered):
        # Words ending with a consonant followed by a 'y' such as
        # 'try (tries)' or 'spy (spies)'
        plural = s[:-1] + 'ies'
    else:
        # In all other cases, we simply add an 's' to the base word
        plural = s + 's'
    return matchCase(s, plural)
421
@internationalizeFunction('depluralize')
def depluralize(s):
    """Returns the singular of s.

    Exceptions to the general rules are looked up in the plurals
    dictionary.  Words which do not look like a plural are returned
    unchanged.
    """
    consonants = 'bcdfghjklmnpqrstvwxz'
    # Anchored at the end of the word: without the '$', any word merely
    # containing a consonant followed by 'ies' (such as 'priest') would
    # lose its last three characters.
    _depluralizeRegex = re.compile('[%s]ies$' % consonants)
    lowered = s.lower()
    if lowered in plurals:
        # Exception dictionary
        return matchCase(s, plurals[lowered])
    elif any(lowered.endswith, ['ches', 'shes', 'sses']):
        return s[:-2] # Chop off 'es'.
    elif re.search(_depluralizeRegex, lowered):
        return s[:-3] + 'y' # 'tries' => 'try'
    else:
        if lowered.endswith('s'):
            return s[:-1] # Chop off 's'.
        else:
            return s # Don't know what to do.
439
def nItems(n, item, between=None):
    """Returns a string describing n items, pluralizing item if needed.

    between, if given, is inserted between the number and the item.  The
    special item '<empty>' stands for "no item name".

    Works like this:

    >>> nItems(4, '<empty>')
    '4'

    >>> nItems(1, 'clock')
    '1 clock'

    >>> nItems(10, 'clock')
    '10 clocks'

    >>> nItems(4, '<empty>', between='grandfather')
    '4 grandfather'

    >>> nItems(10, 'clock', between='grandfather')
    '10 grandfather clocks'
    """
    assert isinstance(n, minisix.integer_types), \
           'The order of the arguments to nItems changed again, sorry.'
    if item == '<empty>':
        if between is None:
            return format('%s', n)
        else:
            # There is no item name to show: only the number and 'between'.
            return format('%s %s', n, between)
    if between is None:
        if n != 1:
            return format('%s %p', n, item)
        else:
            return format('%s %s', n, item)
    else:
        if n != 1:
            return format('%s %s %p', n, between, item)
        else:
            return format('%s %s %s', n, between, item)
475
@internationalizeFunction('ordinal')
def ordinal(i):
    """Returns i + the ordinal indicator for the number.

    Example: ordinal(3) => '3rd'
    """
    i = int(i)
    suffix = 'th'
    # 11, 12 and 13 (modulo 100) are exceptions: they always take 'th'.
    if i % 100 not in (11, 12, 13):
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(i % 10, 'th')
    return '%s%s' % (i, suffix)
494
@internationalizeFunction('be')
def be(i):
    """Returns the form of the verb 'to be' based on the number i:
    'is' if i is 1, 'are' otherwise."""
    return 'is' if i == 1 else 'are'
502
@internationalizeFunction('has')
def has(i):
    """Returns the form of the verb 'to have' based on the number i:
    'has' if i is 1, 'have' otherwise."""
    return 'has' if i == 1 else 'have'
510
def toBool(s):
    """Converts a string to a boolean.

    Surrounding whitespace and case are ignored.  'true', 'on', 'enable',
    'enabled' and '1' give True; 'false', 'off', 'disable', 'disabled'
    and '0' give False.  Raises ValueError for any other string.
    """
    s = s.strip().lower()
    truthy = ('true', 'on', 'enable', 'enabled', '1')
    falsy = ('false', 'off', 'disable', 'disabled', '0')
    for (words, value) in ((truthy, True), (falsy, False)):
        if s in words:
            return value
    raise ValueError('Invalid string for toBool: %s' % quoted(s))
519
# When used with Supybot, this is overridden when supybot.conf is loaded
def timestamp(t):
    """Formats the time t (in seconds since the epoch) with time.ctime().

    If t is None, the current time is used.
    """
    return time.ctime(time.time() if t is None else t)
def url(url):
    """Returns url unchanged; used by format() for the %u directive."""
    return url
527
_formatRe = re.compile(r'%((?:\d+)?\.\d+f|[bfhiLnpqrsStTuv%])')
def format(s, *args, **kwargs):
    """Formats s, replacing each directive by the next positional argument.

    Directives are a % followed by one of:

    %: literal %.
    i: integer
    s: string
    f: float (a precision such as %.2f or %5.2f is supported)
    r: repr
    b: form of the verb 'to be' (takes an int)
    h: form of the verb 'to have' (takes an int)
    L: commaAndify (takes a list of strings or a tuple of ([strings], and))
    p: pluralize (takes a string)
    q: quoted (takes a string)
    n: nItems (takes a 2-tuple of (n, item) or a 3-tuple of (n, between, item))
    S: returns a human-readable size (takes an int)
    t: time, formatted (takes an int)
    T: time delta, formatted (takes an int)
    u: url, wrapped in braces (this should be configurable at some point)
    v: void : takes one or many arguments, but doesn't display it
       (useful for translation)

    Keyword arguments are accepted but not used.  Positional arguments in
    excess are ignored.

    Raises ValueError if there are fewer arguments than directives, or if
    an argument is not suitable for its directive.
    """
    # Note to developers: If you want to add an argument type, do not forget
    # to add the character to the _formatRe regexp or it will be ignored
    # (and hard to debug if you don't know the trick).
    # Of course, you should also document it in the docstring above.
    if minisix.PY2:
        def pred(s):
            if isinstance(s, unicode):
                return s.encode('utf8')
            else:
                return s
        args = map(pred, args)
    args = list(args)
    args.reverse() # For more efficient popping.
    def sub(match):
        # Returns the replacement for one directive, consuming its argument.
        char = match.group(1)
        if char == 's':
            token = args.pop()
            if isinstance(token, str):
                return token
            elif minisix.PY2 and isinstance(token, unicode):
                return token.encode('utf8', 'replace')
            else:
                return str(token)
        elif char == 'i':
            # XXX Improve me!
            return str(args.pop())
        elif char.endswith('f'):
            return ('%'+char) % args.pop()
        elif char == 'b':
            return be(args.pop())
        elif char == 'h':
            return has(args.pop())
        elif char == 'L':
            t = args.pop()
            # In the error messages below, the value is wrapped in a 1-tuple
            # because it may itself be a tuple, which '%' would otherwise
            # unpack (and fail with TypeError).
            if isinstance(t, tuple) and len(t) == 2:
                if not isinstance(t[0], list):
                    raise ValueError('Invalid list for %%L in format: %s'
                                     % (t,))
                if not isinstance(t[1], minisix.string_types):
                    raise ValueError('Invalid string for %%L in format: %s'
                                     % (t,))
                return commaAndify(t[0], And=t[1])
            elif hasattr(t, '__iter__'):
                return commaAndify(t)
            else:
                raise ValueError('Invalid value for %%L in format: %s'
                                 % (t,))
        elif char == 'p':
            return pluralize(args.pop())
        elif char == 'q':
            return quoted(args.pop())
        elif char == 'r':
            return repr(args.pop())
        elif char == 'n':
            t = args.pop()
            if not isinstance(t, (tuple, list)):
                raise ValueError('Invalid value for %%n in format: %s'
                                 % (t,))
            if len(t) == 2:
                return nItems(*t)
            elif len(t) == 3:
                return nItems(t[0], t[2], between=t[1])
            else:
                raise ValueError('Invalid value for %%n in format: %s'
                                 % (t,))
        elif char == 'S':
            t = args.pop()
            if not isinstance(t, minisix.integer_types):
                raise ValueError('Invalid value for %%S in format: %s'
                                 % (t,))
            suffixes = ['B','KB','MB','GB','TB']
            for suffix in suffixes[:-1]:
                if t < 1024:
                    return "%i%s" % (t, suffix)
                t /= 1024
            # Whatever is left is expressed in the biggest unit, however
            # large the number is (instead of returning None).
            return "%i%s" % (t, suffixes[-1])
        elif char == 't':
            return timestamp(args.pop())
        elif char == 'T':
            from .gen import timeElapsed
            return timeElapsed(args.pop())
        elif char == 'u':
            return url(args.pop())
        elif char == 'v':
            args.pop()
            return ''
        elif char == '%':
            return '%'
        else:
            raise ValueError('Invalid char in sub (in format).')
    try:
        return _formatRe.sub(sub, s)
    except IndexError:
        # args.pop() on an empty list: more directives than arguments.
        raise ValueError('Extra format chars in format spec: %r' % s)
637
638# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
639