1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2003-2021 Edgewall Software
4# Copyright (C) 2003-2004 Jonas Borgström <jonas@edgewall.com>
5# Copyright (C) 2006 Matthew Good <trac@matt-good.net>
6# Copyright (C) 2005-2006 Christian Boos <cboos@edgewall.org>
7# All rights reserved.
8#
9# This software is licensed as described in the file COPYING, which
10# you should have received as part of this distribution. The terms
11# are also available at https://trac.edgewall.org/wiki/TracLicense.
12#
13# This software consists of voluntary contributions made by many
14# individuals. For the exact contribution history, see the revision
15# history and logs, available at https://trac.edgewall.org/log/.
16#
17# Author: Jonas Borgström <jonas@edgewall.com>
18#         Matthew Good <trac@matt-good.net>
19#         Christian Boos <cboos@edgewall.org>
20
21import base64
22import configparser
23import locale
24import os
25import pkg_resources
26import re
27import sys
28import textwrap
29from urllib.parse import quote, quote_plus, unquote
30from unicodedata import east_asian_width
31
32import jinja2
33
# Canonical network/end-of-line sequence used throughout Trac.
CRLF = '\r\n'

class Empty(str):
    """A special tag object evaluating to the empty string"""
    # No per-instance __dict__: instances carry no state beyond str itself.
    __slots__ = []

# Singleton sentinel: compares equal to '' but is a distinct object, so
# identity checks (``value is empty``) can distinguish "explicitly empty"
# from an ordinary empty string (see `unicode_urlencode`).
empty = Empty()

del Empty # shouldn't be used outside of Trac core
43
44
45# -- Jinja2
46
# Installed Jinja2 version, parsed for comparison purposes.
_jinja2_ver = pkg_resources.parse_version(jinja2.__version__)
# Extensions enabled in every Trac-created Jinja2 environment.
_jinja2_exts = ['jinja2.ext.do', 'jinja2.ext.i18n']
if _jinja2_ver < pkg_resources.parse_version('3'):
    # 'with' became built-in syntax in Jinja2 3.x; the extension only
    # exists (and is only needed) on 2.x.
    _jinja2_exts.append('jinja2.ext.with_')
51
def jinja2env(**kwargs):
    """Creates a Jinja2 ``Environment`` configured with Trac conventions.

    All default parameters can optionally be overriden. The ``loader``
    parameter is not set by default, so unless it is set by the
    caller, only inline templates can be created from the environment.

    :rtype: `jinja.Environment`

    """
    autoescaped_exts = ('html', 'rss', 'xml')

    def finalize_none(value):
        # Render `None` as the empty string instead of "None".
        return '' if value is None else value

    def autoescape_by_extension(template):
        # Escape only templates whose file extension is XML-like.
        return template and template.rsplit('.', 1)[1] in autoescaped_exts

    options = {
        'variable_start_string': '${',
        'variable_end_string': '}',
        'line_statement_prefix': '#',
        'line_comment_prefix': '##',
        'trim_blocks': True,
        'lstrip_blocks': True,
        'extensions': list(_jinja2_exts),
        'finalize': finalize_none,
        'autoescape': autoescape_by_extension,
    }
    options.update(kwargs)
    env = jinja2.Environment(**options)
    env.globals.update(
        len=len,
    )
    return env
84
def jinja2template(template, text=False, **kwargs):
    """Creates a Jinja2 ``Template`` from inlined source.

    :param template: the template content
    :param text: if set to `False`, the result of the variable
                 expansion will be XML/HTML escaped
    :param kwargs: additional arguments to pass to `jinja2env`. See
                   `jinja2.Environment` for supported arguments.
    """
    env = jinja2env(autoescape=not text, **kwargs)
    return env.from_string(template)
95
96
97# -- Unicode
98
def to_unicode(text, charset=None):
    """Convert input to a `str` object.

    For a `bytes` object, we'll first try to decode the bytes using the given
    `charset` encoding (or UTF-8 if none is specified), then we fall back to
    the latin1 encoding which might be correct or not, but at least preserves
    the original byte sequence by mapping each byte to the corresponding
    unicode code point in the range U+0000 to U+00FF.

    For anything else, a simple `str()` conversion is attempted,
    with special care taken with `Exception` objects.
    """
    if isinstance(text, bytes):
        try:
            return str(text, charset or 'utf-8')
        except UnicodeDecodeError:
            # latin1 never fails and keeps the byte values intact.
            return str(text, 'latin1')
    if not isinstance(text, Exception):
        return str(text)
    # Two possibilities for storing unicode strings in exception data:
    try:
        # custom __str__ method on the exception (e.g. PermissionError)
        result = str(text)
    except UnicodeError:
        # unicode arguments given to the exception (e.g. parse_date)
        return ' '.join(to_unicode(arg) for arg in text.args)
    if os.name == 'nt':
        # Remove duplicated backslashes from the filename embedded in
        # the message (Windows paths are repr()'d with doubled '\\').
        source = None
        if isinstance(text, EnvironmentError) and text.filename:
            source = repr(text.filename)
        elif isinstance(text, configparser.ParsingError) and text.source:
            source = repr(text.source)
        if source:
            result = result.replace(source, source.replace(r'\\', '\\'))
    return result
136
137
def exception_to_unicode(e, traceback=False):
    """Convert an `Exception` to a `str` object.

    In addition to `to_unicode`, this representation of the exception
    also contains the class name and optionally the traceback.
    """
    message = '%s: %s' % (e.__class__.__name__, to_unicode(e))
    if not traceback:
        return message
    from trac.util import get_last_traceback
    # Drop the trailing "SomeError: ..." lines; `message` replaces them.
    tb_lines = get_last_traceback().split('\n')[:-2]
    return '\n%s\n%s' % (to_unicode('\n'.join(tb_lines)), message)
150
151
def path_to_unicode(path):
    """Convert a filesystem path to str, using the filesystem encoding."""
    if not isinstance(path, bytes):
        return str(path)
    try:
        return str(path, sys.getfilesystemencoding())
    except UnicodeDecodeError:
        # Fall back to latin1, which maps every byte to a code point.
        return str(path, 'latin1')
160
161
# Unicode whitespace (plus ZWSP, U+200B) at the very start / very end.
_ws_leading_re = re.compile('\\A[\\s\u200b]+', re.UNICODE)
_ws_trailing_re = re.compile('[\\s\u200b]+\\Z', re.UNICODE)

def stripws(text, leading=True, trailing=True):
    """Strips unicode white-spaces and ZWSPs from ``text``.

    :param leading: strips leading spaces from ``text`` unless ``leading`` is
                    `False`.
    :param trailing: strips trailing spaces from ``text`` unless ``trailing``
                     is `False`.
    """
    for enabled, pattern in ((leading, _ws_leading_re),
                             (trailing, _ws_trailing_re)):
        if enabled:
            text = pattern.sub('', text)
    return text
178
179
def strip_line_ws(text, leading=True, trailing=True):
    """Strips unicode white-spaces and ZWSPs from each line of ``text``.

    :param leading: strips leading spaces from ``text`` unless ``leading`` is
                    `False`.
    :param trailing: strips trailing spaces from ``text`` unless ``trailing``
                     is `False`.
    """
    # Split keeps the newline separators at the odd indices, so only the
    # even-indexed entries (the line contents) are stripped.
    parts = re.compile(r'(\n|\r\n|\r)').split(text)
    patterns = []
    if leading:
        patterns.append(_ws_leading_re)
    if trailing:
        patterns.append(_ws_trailing_re)
    for pattern in patterns:
        parts[::2] = [pattern.sub('', part) for part in parts[::2]]
    return ''.join(parts)
194
195
# Characters with a dedicated JavaScript escape sequence.
_js_quote = {
    '\\': '\\\\',
    '"': '\\"',
    '\b': '\\b',
    '\f': '\\f',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
    "'": "\\'",
}
# Everything else risky (control chars, markup chars, JS line separators)
# gets a generic \uXXXX escape.
for _cp in list(range(0x20)) + [ord(_ch) for _ch in '&<>\u2028\u2029']:
    _js_quote.setdefault(chr(_cp), '\\u%04x' % _cp)
_js_quote_re = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t\'&<>' + '\u2028\u2029]')
_js_string_re = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t&<>' + '\u2028\u2029]')


def javascript_quote(text):
    """Quote strings for inclusion in single or double quote delimited
    Javascript strings
    """
    if not text:
        return ''
    return _js_quote_re.sub(lambda m: _js_quote[m.group(0)], text)
213
214
def to_js_string(text):
    """Embed the given string in a double quote delimited Javascript string
    (conform to the JSON spec)
    """
    if not text:
        return '""'
    escaped = _js_string_re.sub(lambda m: _js_quote[m.group(0)], text)
    return '"%s"' % escaped
224
225
def unicode_quote(value, safe='/'):
    """A unicode aware version of `urllib.quote`

    :param value: anything that converts to a `bytes`. If `str`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote`, the characters that would otherwise be
                 quoted but shouldn't here (defaults to '/')
    """
    if not isinstance(value, bytes):
        value = str(value)
    return quote(value, safe)
235
236
def unicode_quote_plus(value, safe=''):
    """A unicode aware version of `urllib.quote_plus`.

    :param value: anything that converts to a `bytes`. If `str`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote_plus`, the characters that would
                 otherwise be quoted but shouldn't here (defaults to
                 the empty string)
    """
    if not isinstance(value, bytes):
        value = str(value)
    return quote_plus(value, safe)
247
248
def unicode_unquote(value):
    """A unicode aware version of `urllib.unquote`.

    :param value: UTF-8 encoded `str` value (for example, as obtained by
                  `unicode_quote`).
    :rtype: `str`
    """
    if isinstance(value, bytes):
        # latin1 keeps each byte intact; the %XX sequences themselves
        # are decoded as UTF-8 below.
        value = value.decode('latin1')
    return unquote(value, encoding='utf-8', errors='strict')
259
260
def unicode_urlencode(params, safe=''):
    """A unicode aware version of `urllib.urlencode`.

    Values set to `empty` are converted to the key alone, without the
    equal sign.
    """
    if isinstance(params, dict):
        # Deterministic ordering for dict input.
        params = sorted(params.items(), key=lambda item: item[0])
    pairs = []
    for key, value in params:
        if value is empty:
            pairs.append(unicode_quote_plus(key, safe))
        else:
            pairs.append('%s=%s' % (unicode_quote_plus(key, safe),
                                    unicode_quote_plus(value, safe)))
    return '&'.join(pairs)
277
278
# All printable ASCII except the space character: left unquoted in
# query strings (spaces still become '+').
_qs_quote_safe = ''.join(map(chr, range(0x21, 0x7f)))

def quote_query_string(text):
    """Quote strings for query string
    """
    return unicode_quote_plus(text, _qs_quote_safe)
285
286
def to_utf8(text, charset='latin1'):
    """Convert input to a UTF-8 `bytes` object.

    If the input is not an `str` object, we assume the encoding is
    already UTF-8, ISO Latin-1, or as specified by the optional
    *charset* parameter.
    """
    if not isinstance(text, bytes):
        return to_unicode(text).encode('utf-8')
    try:
        str(text, 'utf-8')
        # Already valid UTF-8: pass the bytes through unchanged.
        return text
    except UnicodeError:
        pass
    try:
        # Use the user supplied charset if possible
        decoded = str(text, charset)
    except UnicodeError:
        # latin1 always succeeds
        decoded = str(text, 'latin1')
    return decoded.encode('utf-8')
310
311
class unicode_passwd(str):
    """A `str` subclass whose `repr` hides the actual value, so that
    passwords do not leak into logs or tracebacks."""

    def __repr__(self):
        return '*******'
316
317
def stream_encoding(stream):
    """Return the appropriate encoding for the given stream."""
    encoding = getattr(stream, 'encoding', None)
    if encoding in (None, 'cp0'):
        # Windows reports 'cp0' when no encoding is set.
        return 'utf-8'
    return encoding
323
324
def console_print(out, *args, **kwargs):
    """Output the given arguments to the console, encoding the output
    as appropriate.

    :param kwargs: ``newline`` controls whether a newline will be appended
                   (defaults to `True`)
    """
    text = ' '.join(to_unicode(arg) for arg in args)
    out.write(text)
    if kwargs.get('newline', True):
        out.write('\n')
335
336
def printout(*args, **kwargs):
    """Write the arguments to `sys.stdout` via `console_print`."""
    console_print(sys.stdout, *args, **kwargs)
340
341
def printerr(*args, **kwargs):
    """Write the arguments to `sys.stderr` via `console_print`."""
    console_print(sys.stderr, *args, **kwargs)
345
346
def printfout(message, *args, **kwargs):
    """Format `message`, do a `console.print` on `sys.stdout` and flush
    the buffer.
    """
    text = message % args if args else message
    printout(text, **kwargs)
    sys.stdout.flush()
355
356
def printferr(message, *args, **kwargs):
    """Format `message`, do a `console.print` on `sys.stderr` and flush
    the buffer.
    """
    text = message % args if args else message
    printerr(text, **kwargs)
    sys.stderr.flush()
365
366
def raw_input(prompt):
    """Input one line from the console and converts it to unicode as
    appropriate.

    .. note:: intentionally shadows the Python 2 builtin name, kept for
       backward compatibility of the Trac console API.
    """
    printout(prompt, newline=False)
    line = input()
    return to_unicode(line, sys.stdin.encoding)
373
374
# Captured once at import time so later calls are thread-safe.
_preferredencoding = locale.getpreferredencoding()

def getpreferredencoding():
    """Return the encoding, which is retrieved on ahead, according to user
    preference.

    We should use this instead of `locale.getpreferredencoding()` which
    is not thread-safe."""
    return _preferredencoding
384
385
386# -- Plain text formatting
387
def text_width(text, ambiwidth=1):
    """Determine the column width of `text` in Unicode characters.

    The characters in the East Asian Fullwidth (F) or East Asian Wide (W)
    have a column width of 2. The other characters in the East Asian
    Halfwidth (H) or East Asian Narrow (Na) have a column width of 1.

    That `ambiwidth` parameter is used for the column width of the East
    Asian Ambiguous (A). If `1`, the same width as characters in US-ASCII.
    This is expected by most users. If `2`, twice the width of US-ASCII
    characters. This is expected by CJK users.

    cf. http://www.unicode.org/reports/tr11/.
    """
    # Count Ambiguous (A) as double-width only when requested.
    twice = 'FWA' if ambiwidth == 2 else 'FW'
    # Fixed: the previous implementation shadowed the builtin `chr` with
    # its loop variable and materialized an intermediate list in sum().
    return sum(2 if east_asian_width(c) in twice else 1
               for c in to_unicode(text))
405
406
407def _get_default_ambiwidth():
408    """Return width of East Asian Ambiguous based on locale environment
409    variables or Windows codepage.
410    """
411
412    if os.name == 'nt':
413        import ctypes
414        codepage = ctypes.windll.kernel32.GetConsoleOutputCP()
415        if codepage in (932,   # Japanese (Shift-JIS)
416                        936,   # Chinese Simplified (GB2312)
417                        949,   # Korean (Unified Hangul Code)
418                        950):  # Chinese Traditional (Big5)
419            return 2
420    else:
421        for name in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
422            value = os.environ.get(name) or ''
423            if value:
424                if name == 'LANGUAGE' and ':' in value:
425                    value = value.split(':')[0]
426                return 2 if value.lower().startswith(('zh', 'ja', 'ko')) else 1
427
428    return 1
429
430
431_default_ambiwidth = _get_default_ambiwidth()
432
433
def print_table(data, headers=None, sep='  ', out=None, ambiwidth=None):
    """Print data according to a tabular layout.

    :param data: a sequence of rows; assume all rows are of equal length.
    :param headers: an optional row containing column headers; must be of
                    the same length as each row in `data`.
    :param sep: column separator
    :param out: output file descriptor (`None` means use `sys.stdout`)
    :param ambiwidth: column width of the East Asian Ambiguous (A). If None,
                      detect ambiwidth with the locale settings. If others,
                      pass to the `ambiwidth` parameter of `text_width`.
    """
    if out is None:
        out = sys.stdout
    if ambiwidth is None:
        ambiwidth = _default_ambiwidth
    data = list(data)
    if headers:
        # The header participates in the column width computation below.
        data.insert(0, headers)
    elif not data:
        return

    # Convert to a str object with `to_unicode`. If None, convert to a
    # empty string.
    def to_text(val):
        if val is None:
            return ''
        return to_unicode(val)

    # Display width in terminal columns (East Asian wide chars count as 2).
    def tw(text):
        return text_width(text, ambiwidth=ambiwidth)

    # Expand rows containing embedded newlines into several physical
    # lines, padding shorter cells with '' so that every resulting line
    # still has one entry per column.
    def to_lines(data):
        lines = []
        for row in data:
            row = [to_text(cell) for cell in row]
            if any('\n' in cell for cell in row):
                row = [cell.splitlines() for cell in row]
                max_lines = max(len(cell) for cell in row)
                for cell in row:
                    if len(cell) < max_lines:
                        cell += [''] * (max_lines - len(cell))
                lines.extend([cell[idx] for cell in row]
                             for idx in range(max_lines))
            else:
                lines.append(row)
        return lines

    data = to_lines(data)

    # Each column is as wide as its widest cell (in display columns).
    num_cols = len(data[0])
    col_width = [max(tw(row[idx]) for row in data)
                 for idx in range(num_cols)]

    out.write('\n')
    for ridx, row in enumerate(data):
        for cidx, cell in enumerate(row):
            if cidx + 1 == num_cols:
                line = cell  # No separator after last column
            else:
                if headers and ridx == 0:
                    sp = ' ' * tw(sep)  # No separator in header
                else:
                    sp = sep
                # '%-*s' pads by character count, so widen the field by
                # the difference between len() and the display width.
                line = '%-*s%s' % (col_width[cidx] - tw(cell) + len(cell),
                                    cell, sp)
            out.write(line)

        out.write('\n')
        if ridx == 0 and headers:
            # Underline the header. `cidx` is deliberately the value left
            # over from the loop above, i.e. num_cols - 1 separators.
            out.write('-' * (tw(sep) * cidx + sum(col_width)))
            out.write('\n')
    out.write('\n')
507
508
def shorten_line(text, maxlen=75):
    """Truncates `text` to length less than or equal to `maxlen` characters.

    This tries to be (a bit) clever and attempts to find a proper word
    boundary for doing so.
    """
    if len(text or '') <= maxlen:
        return text
    suffix = ' ...'
    limit = maxlen - len(suffix)
    # Prefer cutting at the last space or newline before the limit.
    cut = max(text.rfind(' ', 0, limit), text.rfind('\n', 0, limit))
    if cut < 0:
        cut = limit
    return text[:cut] + suffix
523
524
class UnicodeTextWrapper(textwrap.TextWrapper):
    """`textwrap.TextWrapper` subclass which measures lines by display
    column width (`text_width`) and allows breaking lines anywhere inside
    runs of CJK/Hangul text, where no inter-word spaces exist.
    """

    # Unicode ranges inside which a line break may occur between any two
    # characters. NOTE(review): entries for planes 2 and 3 carry a third
    # element (a UTF-16 surrogate-pair pattern) which `_init_patterns`
    # never reads -- presumably a leftover from a narrow-build/UTF-16
    # implementation; confirm before removing.
    breakable_char_ranges = [
        (0x1100, 0x11FF),   # Hangul Jamo
        (0x2E80, 0x2EFF),   # CJK Radicals Supplement
        (0x3000, 0x303F),   # CJK Symbols and Punctuation
        (0x3040, 0x309F),   # Hiragana
        (0x30A0, 0x30FF),   # Katakana
        (0x3130, 0x318F),   # Hangul Compatibility Jamo
        (0x3190, 0x319F),   # Kanbun
        (0x31C0, 0x31EF),   # CJK Strokes
        (0x3200, 0x32FF),   # Enclosed CJK Letters and Months
        (0x3300, 0x33FF),   # CJK Compatibility
        (0x3400, 0x4DBF),   # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),   # CJK Unified Ideographs
        (0xA960, 0xA97F),   # Hangul Jamo Extended-A
        (0xAC00, 0xD7AF),   # Hangul Syllables
        (0xD7B0, 0xD7FF),   # Hangul Jamo Extended-B
        (0xF900, 0xFAFF),   # CJK Compatibility Ideographs
        (0xFE30, 0xFE4F),   # CJK Compatibility Forms
        (0xFF00, 0xFFEF),   # Halfwidth and Fullwidth Forms
        (0x20000, 0x2FFFF, '[\uD840-\uD87F][\uDC00-\uDFFF]'), # Plane 2
        (0x30000, 0x3FFFF, '[\uD880-\uD8BF][\uDC00-\uDFFF]'), # Plane 3
    ]

    # Lazily compiled, shared by all instances (see `_init_patterns`).
    split_re = None
    breakable_re = None

    @classmethod
    def _init_patterns(cls):
        # Compile the chunk-splitting and "breakable run" patterns once
        # per class, from `breakable_char_ranges`.
        char_ranges = []
        for val in cls.breakable_char_ranges:
            # NOTE(review): names are swapped -- `high` holds the start of
            # the range and `low` its end; the generated '%s-%s' character
            # class is nevertheless in the correct order.
            high = chr(val[0])
            low = chr(val[1])
            char_ranges.append('%s-%s' % (high, low))
        char_ranges = ''.join(char_ranges)
        pattern = '[%s]+' % char_ranges

        cls.split_re = re.compile(
            r'(\s+|' +                                  # any whitespace
            pattern + '|' +                             # breakable text
            r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' +   # hyphenated words
            r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))',     # em-dash
            re.UNICODE)
        cls.breakable_re = re.compile(r'\A' + pattern, re.UNICODE)

    def __init__(self, cols, replace_whitespace=0, break_long_words=0,
                 initial_indent='', subsequent_indent='', ambiwidth=1):
        # The `replace_whitespace` and `break_long_words` parameters are
        # accepted for signature compatibility but hard-coded to 0 in the
        # parent call below.
        textwrap.TextWrapper.__init__(
                self, cols, replace_whitespace=0, break_long_words=0,
                initial_indent=initial_indent,
                subsequent_indent=subsequent_indent)
        self.ambiwidth = ambiwidth
        if self.split_re is None:
            self._init_patterns()

    def _split(self, text):
        # Override: split on whitespace, breakable CJK runs, hyphens and
        # em-dashes, dropping empty chunks.
        chunks = self.split_re.split(to_unicode(text))
        return list(filter(None, chunks))

    def _text_width(self, text):
        # Display width of `text`, honoring this wrapper's `ambiwidth`.
        return text_width(text, ambiwidth=self.ambiwidth)

    def _wrap_chunks(self, chunks):
        # Override of TextWrapper._wrap_chunks, measuring by display
        # width and splitting breakable (CJK) chunks mid-run.
        lines = []
        chunks.reverse()
        text_width = self._text_width

        while chunks:
            cur_line = []
            cur_width = 0

            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            width = self.width - text_width(indent)

            # Drop leading whitespace on continuation lines.
            if chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk = chunks[-1]
                w = text_width(chunk)
                if cur_width + w <= width:
                    cur_line.append(chunks.pop())
                    cur_width += w
                elif self.breakable_re.match(chunk):
                    # CJK run: take as many leading characters as fit on
                    # this line, leave the remainder for the next one.
                    left_space = width - cur_width
                    for i in range(len(chunk)):
                        w = text_width(chunk[i])
                        if left_space < w:
                            break
                        left_space -= w
                    if i > 0:
                        cur_line.append(chunk[:i])
                        chunk = chunk[i:]
                        chunks[-1] = chunk
                    w = text_width(chunk)
                    break
                else:
                    break

            # `w` here is the width of the first chunk that did not fit.
            if chunks and w > width:
                self._handle_long_word(chunks, cur_line, cur_width, width)

            # Drop trailing whitespace on the line.
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines
637
638
def wrap(t, cols=75, initial_indent='', subsequent_indent='',
         linesep=os.linesep, ambiwidth=1):
    """Wraps the single paragraph in `t`, which contains unicode characters.
    The every line is at most `cols` characters long.

    That `ambiwidth` parameter is used for the column width of the East
    Asian Ambiguous (A). If `1`, the same width as characters in US-ASCII.
    This is expected by most users. If `2`, twice the width of US-ASCII
    characters. This is expected by CJK users.
    """
    # Normalize all end-of-line styles to bare '\n' first.
    normalized = t.strip().replace('\r\n', '\n').replace('\r', '\n')
    wrapper = UnicodeTextWrapper(cols, replace_whitespace=0,
                                 break_long_words=0,
                                 initial_indent=initial_indent,
                                 subsequent_indent=subsequent_indent,
                                 ambiwidth=ambiwidth)
    wrapped = []
    for line in normalized.split('\n'):
        # An all-whitespace line wraps to nothing; keep it as ''.
        wrapped.extend(wrapper.wrap(line.rstrip()) or [''])
    return linesep.join(wrapped)
659
660
# '@' followed by a horizontal ellipsis; marks an obfuscated address.
_obfuscation_char = '@\u2026'

def obfuscate_email_address(address):
    """Replace anything looking like an e-mail address (``'@something'``)
    with a trailing ellipsis (``'@…'``)
    """
    if not address:
        return address
    at = address.find('@')
    if at == -1:
        return address
    # Keep a closing '>' so "Name <user@host>" stays well-formed.
    suffix = '>' if address[-1] == '>' else ''
    return address[:at] + _obfuscation_char + suffix


def is_obfuscated(word):
    """Returns `True` if the `word` looks like an obfuscated e-mail
    address.

    :since: 1.2
    """
    return _obfuscation_char in word
682
683
def breakable_path(path):
    """Make a path breakable after path separators, and conversely, avoid
    breaking at spaces.
    """
    if not path:
        return path
    if path.startswith('/'):    # Avoid breaking after a leading /
        prefix, rest = '/', path[1:]
    else:
        prefix, rest = '', path
    rest = rest.replace('/', '/\u200b')     # ZWSP: allow break after '/'
    rest = rest.replace('\\', '\\\u200b')   # ... and after '\'
    rest = rest.replace(' ', '\u00a0')      # NBSP: forbid break at spaces
    return prefix + rest
696
697
def normalize_whitespace(text, to_space='\u00a0', remove='\u200b'):
    """Normalize whitespace in a string, by replacing special spaces by normal
    spaces and removing zero-width spaces."""
    if not text:
        return text
    # Replacements to spaces happen first, then removals (order matters
    # if a character appears in both sets).
    replacements = [(ch, ' ') for ch in to_space] + \
                   [(ch, '') for ch in remove]
    for old, new in replacements:
        text = text.replace(old, new)
    return text
708
709
def unquote_label(txt):
    """Remove (one level of) enclosing single or double quotes.

    .. versionadded :: 1.0
    """
    if not txt:
        return txt
    first, last = txt[0], txt[-1]
    if first in "'\"" and first == last:
        return txt[1:-1]
    return txt
716
717
def cleandoc(message):
    """Removes uniform indentation and leading/trailing whitespace."""
    # Imported locally to keep the module namespace clean.
    import inspect
    return inspect.cleandoc(message).strip()
722
723
724# -- Conversion
725
def pretty_size(size, format='%.1f'):
    """Pretty print content size information with appropriate unit.

    :param size: number of bytes
    :param format: can be used to adjust the precision shown
    """
    if size is None:
        return ''

    jump = 1024
    if size < jump:
        from trac.util.translation import ngettext
        return ngettext("%(num)d byte", "%(num)d bytes", num=size)

    # Successively divide until the value fits the unit (TB caps it).
    unit = None
    for unit in ('KB', 'MB', 'GB', 'TB'):
        size /= 1024.
        if size < jump:
            break
    return (format + ' %s') % (size, unit)
747
748
def expandtabs(s, tabstop=8, ignoring=None):
    """Expand tab characters `'\\\\t'` into spaces.

    :param tabstop: number of space characters per tab
                    (defaults to the canonical 8)

    :param ignoring: if not `None`, the expansion will be "smart" and
                     go from one tabstop to the next. In addition,
                     this parameter lists characters which can be
                     ignored when computing the indent.
    """
    if '\t' not in s:
        return s
    if ignoring is None:
        return s.expandtabs(tabstop)

    def expand_line(line):
        # Expand one line, not counting `ignoring` chars toward columns.
        if '\t' not in line:
            return line
        col = 0
        out = []
        for ch in line:
            if ch == '\t':
                pad = tabstop - col % tabstop
                out.append(' ' * pad)
                col += pad
            else:
                if not ignoring or ch not in ignoring:
                    col += 1
                out.append(ch)
        return ''.join(out)

    return '\n'.join(expand_line(line) for line in s.split('\n'))
784
785
def fix_eol(text, eol):
    """Fix end-of-lines in a text."""
    lines = text.splitlines()
    if isinstance(text, bytes):
        # Append an empty trailing element so join() ends with `eol`.
        lines.append(b'')
        eol = eol.encode('utf-8')
    else:
        lines.append('')
    return eol.join(lines)
796
def unicode_to_base64(text, strip_newlines=True):
    """Safe conversion of ``text`` to base64 representation using
    utf-8 bytes.

    Strips newlines from output unless ``strip_newlines`` is `False`.
    """
    raw = to_unicode(text).encode('utf-8')
    # b64encode emits one line; encodebytes wraps with newlines.
    encoder = base64.b64encode if strip_newlines else base64.encodebytes
    return str(encoder(raw), 'ascii')
810
def unicode_from_base64(text):
    """Safe conversion of ``text`` to str based on utf-8 bytes."""
    return base64.b64decode(text).decode('utf-8')
814
815
def levenshtein_distance(lhs, rhs):
    """Return the Levenshtein distance between two strings.

    Insertions and deletions cost 1; a substitution costs 2 (i.e. it is
    never cheaper than a deletion plus an insertion).
    """
    # Iterate over the shorter string to keep the row small.
    if len(lhs) > len(rhs):
        lhs, rhs = rhs, lhs
    if not lhs:
        return len(rhs)

    previous = list(range(len(rhs) + 1))
    for i, lch in enumerate(lhs):
        current = [i + 1]
        for j, rch in enumerate(rhs):
            subst = previous[j] + (0 if lch == rch else 2)
            current.append(min(previous[j + 1] + 1,  # deletion
                               current[j] + 1,       # insertion
                               subst))               # substitution
        previous = current
    return previous[-1]
833
834
# $XYZ-style placeholder: '$' followed by an upper-case identifier.
sub_vars_re = re.compile("[$]([A-Z_][A-Z0-9_]*)")

def sub_vars(text, args):
    """Substitute $XYZ-style variables in a string with provided values.

    :param text: string containing variables to substitute.
    :param args: dictionary with keys matching the variables to be substituted.
                 The keys should not be prefixed with the $ character."""
    def repl(match):
        name = match.group(1)
        if name in args:
            return args[name]
        # Unknown variables are left verbatim.
        return '$' + name
    return sub_vars_re.sub(repl, text)
847