1# encoding: utf-8
2"""
3Utilities for working with strings and text.
4
5Inheritance diagram:
6
7.. inheritance-diagram:: IPython.utils.text
8   :parts: 3
9"""
10from __future__ import absolute_import
11
12import os
13import re
14import sys
15import textwrap
16from string import Formatter
17try:
18    from pathlib import Path
19except ImportError:
20    # Python 2 backport
21    from pathlib2 import Path
22
23from IPython.testing.skipdoctest import skip_doctest_py3, skip_doctest
24from IPython.utils import py3compat
25
26# datetime.strftime date format for ipython
# The day-of-month directive differs by platform: '%d' is used on win32,
# '%-d' everywhere else.
date_format = "%B %d, %Y" if sys.platform == 'win32' else "%B %-d, %Y"
31
class LSString(str):
    """String subclass exposing shell-friendly views of its value.

    Instances are normal strings with these extra read-only attributes:

        .l (or .list) : value as list (split on newlines).
        .n (or .nlstr): original value (the string itself).
        .s (or .spstr): value with every newline replaced by a space.
        .p (or .paths): list of ``Path`` objects, one for each line that
                        names an existing filesystem entry.

    Derived values are computed on first access and then cached on the
    instance.

    Such strings are very useful to efficiently interact with the shell, which
    typically only understands whitespace-separated options for commands."""

    def get_list(self):
        """Return the value split on newlines (cached after the first call)."""
        try:
            cached = self.__list
        except AttributeError:
            cached = self.__list = self.split('\n')
        return cached

    l = list = property(get_list)

    def get_spstr(self):
        """Return the value with newlines replaced by spaces (cached)."""
        try:
            cached = self.__spstr
        except AttributeError:
            cached = self.__spstr = self.replace('\n',' ')
        return cached

    s = spstr = property(get_spstr)

    def get_nlstr(self):
        """Return the string itself."""
        return self

    n = nlstr = property(get_nlstr)

    def get_paths(self):
        """Return ``Path`` objects for the lines that exist on disk (cached)."""
        try:
            cached = self.__paths
        except AttributeError:
            # Lines that do not name an existing path are silently skipped.
            cached = self.__paths = [
                Path(line) for line in self.split('\n') if os.path.exists(line)
            ]
        return cached

    p = paths = property(get_paths)
79
80# FIXME: We need to reimplement type specific displayhook and then add this
81# back as a custom printer. This should also be moved outside utils into the
82# core.
83
84# def print_lsstring(arg):
85#     """ Prettier (non-repr-like) and more informative printer for LSString """
86#     print "LSString (.p, .n, .l, .s available). Value:"
87#     print arg
88#
89#
90# print_lsstring = result_display.when_type(LSString)(print_lsstring)
91
92
class SList(list):
    """List derivative with a special access attributes.

    These are normal lists (of strings), but with the special attributes:

    * .l (or .list) : value as list (the list itself).
    * .n (or .nlstr): value as a string, joined on newlines.
    * .s (or .spstr): value as a string, joined on spaces.
    * .p (or .paths): list of ``Path`` objects, one for each item that names
      an existing filesystem entry.

    Any values which require transformations are computed only once and
    cached.  The cache is never invalidated, so mutating the list after
    reading .n, .s or .p leaves those attributes reflecting the old content."""

    def get_list(self):
        """Return the list itself."""
        return self

    l = list = property(get_list)

    def get_spstr(self):
        """Return the items joined by single spaces (cached)."""
        try:
            return self.__spstr
        except AttributeError:
            self.__spstr = ' '.join(self)
            return self.__spstr

    s = spstr = property(get_spstr)

    def get_nlstr(self):
        """Return the items joined by newlines (cached)."""
        try:
            return self.__nlstr
        except AttributeError:
            self.__nlstr = '\n'.join(self)
            return self.__nlstr

    n = nlstr = property(get_nlstr)

    def get_paths(self):
        """Return ``Path`` objects for the items that exist on disk (cached)."""
        try:
            return self.__paths
        except AttributeError:
            self.__paths = [Path(p) for p in self if os.path.exists(p)]
            return self.__paths

    p = paths = property(get_paths)

    def grep(self, pattern, prune = False, field = None):
        """ Return all strings matching 'pattern' (a regex or callable)

        A string pattern is searched case-insensitively; a callable is used
        as the predicate as-is. If prune is true, return all items
        NOT matching the pattern.

        If field is specified, the match must occur in the specified
        whitespace-separated field. Lines without that field are matched
        against the empty string.

        Returns a new SList.

        Examples::

            a.grep( lambda x: x.startswith('C') )
            a.grep('Cha.*log', prune=1)
            a.grep('chm', field=-1)
        """

        def match_target(s):
            # Select the text the predicate is applied to.
            if field is None:
                return s
            parts = s.split()
            try:
                return parts[field]
            except IndexError:
                return ""

        if isinstance(pattern, py3compat.string_types):
            pred = lambda x : re.search(pattern, x, re.IGNORECASE)
        else:
            pred = pattern
        if not prune:
            return SList([el for el in self if pred(match_target(el))])
        else:
            return SList([el for el in self if not pred(match_target(el))])

    def fields(self, *fields):
        """ Collect whitespace-separated fields from string list

        Allows quick awk-like usage of string lists.

        Example data (in var a, created by 'a = !ls -l')::

            -rwxrwxrwx  1 ville None      18 Dec 14  2006 ChangeLog
            drwxrwxrwx+ 6 ville None       0 Oct 24 18:05 IPython

        * ``a.fields(0)`` is ``['-rwxrwxrwx', 'drwxrwxrwx+']``
        * ``a.fields(1,0)`` is ``['1 -rwxrwxrwx', '6 drwxrwxrwx+']``
          (note the joining by space).
        * ``a.fields(-1)`` is ``['ChangeLog', 'IPython']``

        IndexErrors are ignored; a line contributing no field at all is
        left out of the result.

        Without args, fields() just split()'s the strings and returns a
        plain list of lists.
        """
        if len(fields) == 0:
            return [el.split() for el in self]

        res = SList()
        for el in [f.split() for f in self]:
            lineparts = []

            for fd in fields:
                try:
                    lineparts.append(el[fd])
                except IndexError:
                    pass
            if lineparts:
                res.append(" ".join(lineparts))

        return res

    def sort(self,field= None,  nums = False):
        """ sort by specified fields (see fields())

        Unlike ``list.sort`` this does not sort in place: a new, sorted
        SList is returned.

        Example::

            a.sort(1, nums = True)

        Sorts a by second field, in numerical order (so that 21 > 3)

        With ``nums`` true the key is the integer formed by the digit
        characters of the field (or of the whole line when no field is
        given); text without any digit sorts as 0.
        """

        def sort_key(line):
            if field is None:
                text = line
            else:
                # fields() yields at most one entry for a single line; use
                # the field *string* so the digit scan below runs per
                # character, exactly as it does for whole lines.
                text = "".join(SList([line]).fields(field))
            if not nums:
                return text
            numstr = "".join([ch for ch in text if ch.isdigit()])
            try:
                return int(numstr)
            except ValueError:
                # No digits, or digit-like characters int() cannot parse.
                return 0

        #decorate, sort, undecorate
        dsu = [(sort_key(line), line) for line in self]
        dsu.sort()
        return SList([line for _, line in dsu])
237
238
239# FIXME: We need to reimplement type specific displayhook and then add this
240# back as a custom printer. This should also be moved outside utils into the
241# core.
242
243# def print_slist(arg):
244#     """ Prettier (non-repr-like) and more informative printer for SList """
245#     print "SList (.p, .n, .l, .s, .grep(), .fields(), sort() available):"
246#     if hasattr(arg,  'hideonce') and arg.hideonce:
247#         arg.hideonce = False
248#         return
249#
250#     nlprint(arg)   # This was a nested list printer, now removed.
251#
252# print_slist = result_display.when_type(SList)(print_slist)
253
254
def indent(instr,nspaces=4, ntabs=0, flatten=False):
    """Indent a string a given number of spaces or tabstops.

    indent(str,nspaces=4,ntabs=0) -> indent str by ntabs+nspaces.

    Parameters
    ----------

    instr : basestring
        The string to be indented.  If None, None is returned.
    nspaces : int (default: 4)
        The number of spaces to be indented.
    ntabs : int (default: 0)
        The number of tabs to be indented.  Tabs are placed before the
        spaces.
    flatten : bool (default: False)
        Whether to scrub existing indentation.  If True, all lines will be
        aligned to the same indentation.  If False, existing indentation will
        be strictly increased.

    Returns
    -------

    str|unicode : string indented by ntabs and nspaces, or None when
    `instr` is None.

    """
    if instr is None:
        return
    ind = '\t'*ntabs+' '*nspaces
    if flatten:
        pat = re.compile(r'^\s*', re.MULTILINE)
    else:
        pat = re.compile(r'^', re.MULTILINE)
    outstr = re.sub(pat, ind, instr)
    # In MULTILINE mode '^' also matches right after a trailing newline, which
    # leaves a dangling indent at the very end; trim it.  The check uses '\n'
    # (what '^' keys on) rather than os.linesep, and is skipped for an empty
    # indent because outstr[:-0] would discard the whole string.
    if ind and outstr.endswith('\n' + ind):
        return outstr[:-len(ind)]
    return outstr
292
293
def list_strings(arg):
    """Wrap a lone string in a list; return anything else unchanged.

    This lets callers accept either a single string or a list of strings
    and always work with a list afterwards.

    Examples
    --------
    ::

        In [7]: list_strings('A single string')
        Out[7]: ['A single string']

        In [8]: list_strings(['A single string in a list'])
        Out[8]: ['A single string in a list']

        In [9]: list_strings(['A','list','of','strings'])
        Out[9]: ['A', 'list', 'of', 'strings']
    """
    if not isinstance(arg, py3compat.string_types):
        # Not a string: handed back as-is, without copying or validation.
        return arg
    return [arg]
314
315
def marquee(txt='',width=78,mark='*'):
    """Center `txt` between two runs of `mark`, aiming at `width` columns.

    With empty `txt` the result is `mark` repeated and cut to exactly
    `width` characters.  Otherwise the text is surrounded by one space and
    an equal number of marks on each side; text too long for `width` gets
    no marks at all.

    Examples
    --------
    ::

        In [16]: marquee('A test',40)
        Out[16]: '**************** A test ****************'

        In [17]: marquee('A test',40,'-')
        Out[17]: '---------------- A test ----------------'

        In [18]: marquee('A test',40,' ')
        Out[18]: '                 A test                 '

    """
    if txt:
        # Room left after the text and its two padding spaces, expressed in
        # whole marks and split evenly between both sides.
        per_side = max((width-len(txt)-2)//len(mark)//2, 0)
        border = mark*per_side
        return '%s %s %s' % (border,txt,border)
    return (mark*width)[:width]
339
340
ini_spaces_re = re.compile(r'^(\s+)')

def num_ini_spaces(strng):
    """Return the length of the leading whitespace run of a string.

    Any whitespace character counts (the pattern uses ``\\s``), not only
    the space character.  Returns 0 when the string does not start with
    whitespace.
    """
    leading = ini_spaces_re.match(strng)
    return leading.end() if leading else 0
351
352
def format_screen(strng):
    """Format a string for screen printing.

    Currently this only removes the latex-style paragraph-continuation
    marker: a backslash sitting at the end of a line.
    """
    # '$' is evaluated per line thanks to MULTILINE.
    return re.sub(r'\\$', '', strng, flags=re.MULTILINE)
361
362
def dedent(text):
    """Like textwrap.dedent, but tolerant of an unindented first line.

    The first line is left untouched and only the remainder is dedented,
    so strings such as:
    '''foo
    is a bar
    '''
    still lose the indentation of their continuation lines.  Text that
    starts with a newline, or that has a single line, is passed to
    textwrap.dedent unchanged.

    For use in wrap_paragraphs.
    """
    head, newline, tail = text.partition('\n')
    if not head or not newline:
        # Either the text opens with a blank line (nothing to protect) or
        # there is just one line: plain dedent handles both.
        return textwrap.dedent(text)
    return head + '\n' + textwrap.dedent(tail)
388
389
def wrap_paragraphs(text, ncols=80):
    """Wrap multiple paragraphs to fit a specified width.

    The text is dedented and stripped, split into paragraphs on blank
    lines, and each paragraph is filled with textwrap.fill.  A paragraph
    that still contains indented lines after dedenting is returned as-is.

    Returns
    -------

    list of complete paragraphs, wrapped to fill `ncols` columns.
    """
    blank_lines = re.compile(r'\n(\s*\n)+', re.MULTILINE)
    indented_line = re.compile(r'\n\s+', re.MULTILINE)
    # split() interleaves the captured separators with the paragraphs, so
    # only the even positions hold paragraph text.
    chunks = blank_lines.split(dedent(text).strip())[::2]
    # Indentation that survives dedent is presumed to be meaningful
    # formatting, so only flush paragraphs are re-filled.
    return [
        chunk if indented_line.search(chunk) else textwrap.fill(chunk, ncols)
        for chunk in chunks
    ]
414
415
def long_substr(data):
    """Return the longest substring shared by every string in `data`.

    A single-element list yields that element; an empty list, or a list
    whose first entry is empty, yields ''.  Candidates are taken from the
    first entry, scanning start positions left to right, so among equally
    long matches the leftmost one in the first entry is returned.

    Credit: http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
    """
    if len(data) == 1:
        return data[0]
    best = ''
    if len(data) > 1 and len(data[0]) > 0:
        reference = data[0]
        size = len(reference)
        for start in range(size):
            # Only candidates strictly longer than the current best matter.
            for length in range(len(best) + 1, size - start + 1):
                candidate = reference[start:start + length]
                if all(candidate in other for other in data):
                    best = candidate
    return best
430
431
def strip_email_quotes(text):
    """Strip leading email quotation characters ('>').

    Removes any combination of leading '>' interspersed with whitespace that
    appears *identically* in all lines of the input text.

    Parameters
    ----------
    text : str

    Returns
    -------
    str : the text without the shared quote prefix.  When something is
    stripped the lines are re-joined with '\\n'; otherwise `text` is
    returned unchanged.

    Examples
    --------

    Simple uses::

        In [2]: strip_email_quotes('> > text')
        Out[2]: 'text'

        In [3]: strip_email_quotes('> > text\\n> > more')
        Out[3]: 'text\\nmore'

    Note how only the common prefix that appears in all lines is stripped::

        In [4]: strip_email_quotes('> > text\\n> > more\\n> more...')
        Out[4]: '> text\\n> more\\nmore...'

    So if any line has no quote marks ('>') , then none are stripped from any
    of them ::

        In [5]: strip_email_quotes('> > text\\n> > more\\nlast different')
        Out[5]: '> > text\\n> > more\\nlast different'
    """
    lines = text.splitlines()
    prefixes = set()
    for line in lines:
        quoted = re.match(r'^(\s*>[ >]*)', line)
        if quoted is None:
            # One unquoted line means there is no shared quote prefix.
            return text
        prefixes.add(quoted.group(1))
    # What gets cut from the front of every line must be a common *prefix*
    # of the quote markers.  A common substring is not enough: for ' > a'
    # and '> b' it is '> ', and cutting its length would eat real content.
    common = os.path.commonprefix(list(prefixes))
    if common:
        strip = len(common)
        text = '\n'.join([ln[strip:] for ln in lines])
    return text
478
def strip_ansi(source):
    """
    Remove ansi escape codes from text.

    Only sequences of the form ``ESC [ <digits and semicolons> m`` are
    removed, including the parameterless form ``ESC [ m``.

    Parameters
    ----------
    source : str
        Source to remove the ansi from

    Returns
    -------
    str : `source` with those sequences removed.
    """
    # '*' rather than '+': a sequence with no parameters is still a
    # complete escape code and must be stripped too.
    return re.sub(r'\033\[[\d;]*m', '', source)
489
490
class EvalFormatter(Formatter):
    """A String Formatter that evaluates each replacement field.

    The text of a field is passed to ``eval`` with the keyword arguments of
    ``format`` as its namespace, so simple expressions on the arguments
    work inside ``{}``.

    Note that this version interprets a : as specifying a format string (as per
    standard string formatting), so if slicing is required, you must explicitly
    create a slice.

    This is to be used in templating cases, such as the parallel batch
    script templates, where simple arithmetic on arguments is useful.

    Examples
    --------
    ::

        In [1]: f = EvalFormatter()
        In [2]: f.format('{n//4}', n=8)
        Out[2]: '2'

        In [3]: f.format("{greeting[slice(2,4)]}", greeting="Hello")
        Out[3]: 'll'
    """
    def get_field(self, name, args, kwargs):
        """Evaluate `name` in the `kwargs` namespace.

        Returns the ``(value, name)`` pair; positional `args` are not
        consulted.
        """
        # NOTE: eval() executes arbitrary code; templates must be trusted.
        return eval(name, kwargs), name
515
516#XXX: As of Python 3.4, the format string parsing no longer splits on a colon
517# inside [], so EvalFormatter can handle slicing. Once we only support 3.4 and
518# above, it should be possible to remove FullEvalFormatter.
519
@skip_doctest_py3
class FullEvalFormatter(Formatter):
    """A String Formatter that allows evaluation of simple expressions.

    Every replacement field is evaluated with ``eval`` in the namespace
    formed by the keyword arguments.

    Note that this version allows slicing using [1:2], so you cannot specify
    a format string. Use :class:`EvalFormatter` to permit format strings.

    Examples
    --------
    ::

        In [1]: f = FullEvalFormatter()
        In [2]: f.format('{n//4}', n=8)
        Out[2]: u'2'

        In [3]: f.format('{list(range(5))[2:4]}')
        Out[3]: u'[2, 3]'

        In [4]: f.format('{3*2}')
        Out[4]: u'6'
    """
    # Adapted from Formatter._vformat: fields are eval'ed, and whatever the
    # parser split off as a format spec is glued back onto the expression.
    def vformat(self, format_string, args, kwargs):
        """Render `format_string`, evaluating each field against `kwargs`.

        Positional `args` are accepted for interface compatibility but are
        not used.  The result is always a unicode string.
        """
        pieces = []
        for literal_text, field_name, format_spec, conversion in \
                self.parse(format_string):

            # Literal text is copied through untouched.
            if literal_text:
                pieces.append(literal_text)

            if field_name is None:
                # No replacement field follows this literal chunk.
                continue

            expression = field_name
            if format_spec:
                # The parser cut the field at ':'; restore it so that
                # slices such as [1:2] reach eval intact.
                expression = ':'.join([field_name, format_spec])

            # NOTE: eval() executes arbitrary code; templates must be trusted.
            value = eval(expression, kwargs)
            value = self.convert_field(value, conversion)

            # Always formatted with an empty spec (see the note above).
            pieces.append(self.format_field(value, ''))

        return u''.join(py3compat.cast_unicode(s) for s in pieces)
575
576
@skip_doctest_py3
class DollarFormatter(FullEvalFormatter):
    """Formatter allowing Itpl style $foo replacement, for names and attribute
    access only. Standard {foo} replacement also works, and allows full
    evaluation of its arguments.

    A doubled dollar sign escapes the replacement: ``$$foo`` is emitted as
    the literal text ``$foo``.

    Examples
    --------
    ::

        In [1]: f = DollarFormatter()
        In [2]: f.format('{n//4}', n=8)
        Out[2]: u'2'

        In [3]: f.format('23 * 76 is $result', result=23*76)
        Out[3]: u'23 * 76 is 1748'

        In [4]: f.format('$a or {b}', a=1, b=2)
        Out[4]: u'1 or 2'
    """
    # Group 1 is the literal text preceding a '$'; group 2 is the name
    # (possibly dotted), itself prefixed with '$' in the escaped form.
    # A raw string is required because '\$', '\w' and '\.' are not valid
    # string-literal escapes.  DOTALL lets group 1 span newlines; without
    # it a match cannot start before a newline and the text in front of
    # that newline would be skipped and lost.
    _dollar_pattern = re.compile(r"(.*?)\$(\$?[\w\.]+)", re.DOTALL)
    def parse(self, fmt_string):
        """Yield ``(literal, field, format_spec, conversion)`` tuples.

        The stock ``Formatter.parse`` output is post-processed: every
        ``$name`` found inside a literal chunk becomes an extra field
        tuple with an empty format spec and no conversion.
        """
        for literal_txt, field_name, format_spec, conversion \
                    in Formatter.parse(self, fmt_string):

            # Find $foo patterns in the literal text.
            continue_from = 0
            txt = ""
            for m in self._dollar_pattern.finditer(literal_txt):
                new_txt, new_field = m.group(1,2)
                # $$foo --> $foo
                if new_field.startswith("$"):
                    # Escaped: keep accumulating literal text.
                    txt += new_txt + new_field
                else:
                    yield (txt + new_txt, new_field, "", None)
                    txt = ""
                continue_from = m.end()

            # Re-yield the {foo} style pattern
            yield (txt + literal_txt[continue_from:], field_name, format_spec, conversion)
617
618#-----------------------------------------------------------------------------
619# Utils to columnize a list of string
620#-----------------------------------------------------------------------------
621
def _col_chunks(l, max_rows, row_first=False):
    """Yield the columns obtained when `l` is laid out in `max_rows` rows.

    Column-first (default): each column is a consecutive slice of at most
    `max_rows` items.  Row-first: items are dealt out across the columns in
    turn, so column ``i`` holds items ``i, i + ncols, i + 2*ncols, ...``.
    """
    total = len(l)
    if not row_first:
        for start in py3compat.xrange(0, total, max_rows):
            yield l[start:(start + max_rows)]
        return
    # Number of columns: total / max_rows, rounded up.
    ncols = (total // max_rows) + (total % max_rows > 0)
    for first in py3compat.xrange(ncols):
        yield [l[idx] for idx in py3compat.xrange(first, total, ncols)]
631
632
633def _find_optimal(rlist, row_first=False, separator_size=2, displaywidth=80):
634    """Calculate optimal info to columnize a list of string"""
635    for max_rows in range(1, len(rlist) + 1):
636        col_widths = list(map(max, _col_chunks(rlist, max_rows, row_first)))
637        sumlength = sum(col_widths)
638        ncols = len(col_widths)
639        if sumlength + separator_size * (ncols - 1) <= displaywidth:
640            break
641    return {'num_columns': ncols,
642            'optimal_separator_width': (displaywidth - sumlength) / (ncols - 1) if (ncols - 1) else 0,
643            'max_rows': max_rows,
644            'column_widths': col_widths
645            }
646
647
648def _get_or_default(mylist, i, default=None):
649    """return list item number, or default if don't exist"""
650    if i >= len(mylist):
651        return default
652    else :
653        return mylist[i]
654
655
def compute_item_matrix(items, row_first=False, empty=None, *args, **kwargs) :
    """Returns a nested list, and info to columnize items

    Parameters
    ----------

    items
        list of strings to columnize
    row_first : (default False)
        Whether to compute columns for a row-first matrix instead of
        column-first (default).
    empty : (default None)
        value used to fill the cells for which there is no item
    separator_size : int (default=2)
        How many characters will be used as a separation between columns.
    displaywidth : int (default=80)
        The width of the area into which the columns should fit

    Returns
    -------

    strings_matrix

        nested list of strings; the outermost list contains as many lists
        as there are rows, and each inner list has as many elements as
        there are columns. If the total number of elements in `items` does
        not equal the product of rows*columns, the trailing cells of some
        rows are filled with `empty`.

    dict_info
        some info to make columnize easier:

        num_columns
          number of columns
        max_rows
          maximum number of rows (final number may be less)
        column_widths
          list of the width of each column
        optimal_separator_width
          best separator width between columns

    Examples
    --------
    ::

        In [1]: l = ['aaa','b','cc','d','eeeee','f','g','h','i','j','k','l']
           ...: compute_item_matrix(l, displaywidth=12)
        Out[1]:
            ([['aaa', 'f', 'k'],
            ['b', 'g', 'l'],
            ['cc', 'h', None],
            ['d', 'i', None],
            ['eeeee', 'j', None]],
            {'num_columns': 3,
            'column_widths': [5, 1, 1],
            'optimal_separator_width': 2,
            'max_rows': 5})
    """
    item_lengths = list(map(len, items))
    info = _find_optimal(item_lengths, row_first, *args, **kwargs)
    nrow = info['max_rows']
    ncol = info['num_columns']
    if row_first:
        # Items run along the rows.
        position = lambda r, c: r * ncol + c
    else:
        # Items run down the columns.
        position = lambda r, c: c * nrow + r
    matrix = [
        [_get_or_default(items, position(r, c), default=empty) for c in range(ncol)]
        for r in range(nrow)
    ]
    return (matrix, info)
719
720
def columnize(items, row_first=False, separator='  ', displaywidth=80, spread=False):
    """ Transform a list of strings into a single string with columns.

    Parameters
    ----------
    items : sequence of strings
        The strings to process.

    row_first : (default False)
        Whether to compute columns for a row-first matrix instead of
        column-first (default).

    separator : str, optional [default is two spaces]
        The string that separates columns.

    displaywidth : int, optional [default is 80]
        Width of the display in number of characters.

    spread : bool, optional [default is False]
        If true, the separator is right-padded with spaces up to the
        computed optimal separator width.

    Returns
    -------
    The formatted string, one line per row, each terminated by a newline.
    Empty `items` give a single newline.
    """
    if not items:
        return '\n'
    matrix, info = compute_item_matrix(items, row_first=row_first, separator_size=len(separator), displaywidth=displaywidth)
    if spread:
        separator = separator.ljust(int(info['optimal_separator_width']))
    widths = info['column_widths']
    rendered = []
    for row in matrix:
        # filter(None, ...) drops the padding cells (and any other falsy
        # entry) before the cells are paired with the column widths.
        cells = [cell.ljust(width, ' ') for cell, width in zip(filter(None, row), widths)]
        rendered.append(separator.join(cells))
    return '\n'.join(rendered)+'\n'
751
752
def get_text_list(list_, last_sep=' and ', sep=", ", wrap_item_with=""):
    """
    Return a string with a natural enumeration of items

    All items but the last are joined with `sep`; the last one is attached
    with `last_sep`.  If `wrap_item_with` is non-empty, it is put on both
    sides of every item first.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c and d'
    >>> get_text_list(['a', 'b', 'c'], ' or ')
    'a, b or c'
    >>> get_text_list(['a', 'b', 'c'], ', ')
    'a, b, c'
    >>> get_text_list(['a', 'b'], ' or ')
    'a or b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([])
    ''
    >>> get_text_list(['a', 'b'], wrap_item_with="`")
    '`a` and `b`'
    >>> get_text_list(['a', 'b', 'c', 'd'], " = ", sep=" + ")
    'a + b + c = d'
    """
    count = len(list_)
    if count == 0:
        return ''
    if wrap_item_with:
        list_ = [
            '%s%s%s' % (wrap_item_with, item, wrap_item_with) for item in list_
        ]
    if count == 1:
        return list_[0]
    head = sep.join(list_[:-1])
    return '%s%s%s' % (head, last_sep, list_[-1])
784