1# encoding: utf-8 2""" 3Utilities for working with strings and text. 4 5Inheritance diagram: 6 7.. inheritance-diagram:: IPython.utils.text 8 :parts: 3 9""" 10from __future__ import absolute_import 11 12import os 13import re 14import sys 15import textwrap 16from string import Formatter 17try: 18 from pathlib import Path 19except ImportError: 20 # Python 2 backport 21 from pathlib2 import Path 22 23from IPython.testing.skipdoctest import skip_doctest_py3, skip_doctest 24from IPython.utils import py3compat 25 26# datetime.strftime date format for ipython 27if sys.platform == 'win32': 28 date_format = "%B %d, %Y" 29else: 30 date_format = "%B %-d, %Y" 31 32class LSString(str): 33 """String derivative with a special access attributes. 34 35 These are normal strings, but with the special attributes: 36 37 .l (or .list) : value as list (split on newlines). 38 .n (or .nlstr): original value (the string itself). 39 .s (or .spstr): value as whitespace-separated string. 40 .p (or .paths): list of path objects (requires path.py package) 41 42 Any values which require transformations are computed only once and 43 cached. 44 45 Such strings are very useful to efficiently interact with the shell, which 46 typically only understands whitespace-separated options for commands.""" 47 48 def get_list(self): 49 try: 50 return self.__list 51 except AttributeError: 52 self.__list = self.split('\n') 53 return self.__list 54 55 l = list = property(get_list) 56 57 def get_spstr(self): 58 try: 59 return self.__spstr 60 except AttributeError: 61 self.__spstr = self.replace('\n',' ') 62 return self.__spstr 63 64 s = spstr = property(get_spstr) 65 66 def get_nlstr(self): 67 return self 68 69 n = nlstr = property(get_nlstr) 70 71 def get_paths(self): 72 try: 73 return self.__paths 74 except AttributeError: 75 self.__paths = [Path(p) for p in self.split('\n') if os.path.exists(p)] 76 return self.__paths 77 78 p = paths = property(get_paths) 79 80# FIXME: We need to reimplement type specific displayhook and then add this 81# back as a custom printer. This should also be moved outside utils into the 82# core. 83 84# def print_lsstring(arg): 85# """ Prettier (non-repr-like) and more informative printer for LSString """ 86# print "LSString (.p, .n, .l, .s available). Value:" 87# print arg 88# 89# 90# print_lsstring = result_display.when_type(LSString)(print_lsstring) 91 92 93class SList(list): 94 """List derivative with a special access attributes. 95 96 These are normal lists, but with the special attributes: 97 98 * .l (or .list) : value as list (the list itself). 99 * .n (or .nlstr): value as a string, joined on newlines. 100 * .s (or .spstr): value as a string, joined on spaces. 101 * .p (or .paths): list of path objects (requires path.py package) 102 103 Any values which require transformations are computed only once and 104 cached.""" 105 106 def get_list(self): 107 return self 108 109 l = list = property(get_list) 110 111 def get_spstr(self): 112 try: 113 return self.__spstr 114 except AttributeError: 115 self.__spstr = ' '.join(self) 116 return self.__spstr 117 118 s = spstr = property(get_spstr) 119 120 def get_nlstr(self): 121 try: 122 return self.__nlstr 123 except AttributeError: 124 self.__nlstr = '\n'.join(self) 125 return self.__nlstr 126 127 n = nlstr = property(get_nlstr) 128 129 def get_paths(self): 130 try: 131 return self.__paths 132 except AttributeError: 133 self.__paths = [Path(p) for p in self if os.path.exists(p)] 134 return self.__paths 135 136 p = paths = property(get_paths) 137 138 def grep(self, pattern, prune = False, field = None): 139 """ Return all strings matching 'pattern' (a regex or callable) 140 141 This is case-insensitive. If prune is true, return all items 142 NOT matching the pattern. 143 144 If field is specified, the match must occur in the specified 145 whitespace-separated field. 146 147 Examples:: 148 149 a.grep( lambda x: x.startswith('C') ) 150 a.grep('Cha.*log', prune=1) 151 a.grep('chm', field=-1) 152 """ 153 154 def match_target(s): 155 if field is None: 156 return s 157 parts = s.split() 158 try: 159 tgt = parts[field] 160 return tgt 161 except IndexError: 162 return "" 163 164 if isinstance(pattern, py3compat.string_types): 165 pred = lambda x : re.search(pattern, x, re.IGNORECASE) 166 else: 167 pred = pattern 168 if not prune: 169 return SList([el for el in self if pred(match_target(el))]) 170 else: 171 return SList([el for el in self if not pred(match_target(el))]) 172 173 def fields(self, *fields): 174 """ Collect whitespace-separated fields from string list 175 176 Allows quick awk-like usage of string lists. 177 178 Example data (in var a, created by 'a = !ls -l'):: 179 180 -rwxrwxrwx 1 ville None 18 Dec 14 2006 ChangeLog 181 drwxrwxrwx+ 6 ville None 0 Oct 24 18:05 IPython 182 183 * ``a.fields(0)`` is ``['-rwxrwxrwx', 'drwxrwxrwx+']`` 184 * ``a.fields(1,0)`` is ``['1 -rwxrwxrwx', '6 drwxrwxrwx+']`` 185 (note the joining by space). 186 * ``a.fields(-1)`` is ``['ChangeLog', 'IPython']`` 187 188 IndexErrors are ignored. 189 190 Without args, fields() just split()'s the strings. 191 """ 192 if len(fields) == 0: 193 return [el.split() for el in self] 194 195 res = SList() 196 for el in [f.split() for f in self]: 197 lineparts = [] 198 199 for fd in fields: 200 try: 201 lineparts.append(el[fd]) 202 except IndexError: 203 pass 204 if lineparts: 205 res.append(" ".join(lineparts)) 206 207 return res 208 209 def sort(self,field= None, nums = False): 210 """ sort by specified fields (see fields()) 211 212 Example:: 213 214 a.sort(1, nums = True) 215 216 Sorts a by second field, in numerical order (so that 21 > 3) 217 218 """ 219 220 #decorate, sort, undecorate 221 if field is not None: 222 dsu = [[SList([line]).fields(field), line] for line in self] 223 else: 224 dsu = [[line, line] for line in self] 225 if nums: 226 for i in range(len(dsu)): 227 numstr = "".join([ch for ch in dsu[i][0] if ch.isdigit()]) 228 try: 229 n = int(numstr) 230 except ValueError: 231 n = 0 232 dsu[i][0] = n 233 234 235 dsu.sort() 236 return SList([t[1] for t in dsu]) 237 238 239# FIXME: We need to reimplement type specific displayhook and then add this 240# back as a custom printer. This should also be moved outside utils into the 241# core. 242 243# def print_slist(arg): 244# """ Prettier (non-repr-like) and more informative printer for SList """ 245# print "SList (.p, .n, .l, .s, .grep(), .fields(), sort() available):" 246# if hasattr(arg, 'hideonce') and arg.hideonce: 247# arg.hideonce = False 248# return 249# 250# nlprint(arg) # This was a nested list printer, now removed. 251# 252# print_slist = result_display.when_type(SList)(print_slist) 253 254 255def indent(instr,nspaces=4, ntabs=0, flatten=False): 256 """Indent a string a given number of spaces or tabstops. 257 258 indent(str,nspaces=4,ntabs=0) -> indent str by ntabs+nspaces. 259 260 Parameters 261 ---------- 262 263 instr : basestring 264 The string to be indented. 265 nspaces : int (default: 4) 266 The number of spaces to be indented. 267 ntabs : int (default: 0) 268 The number of tabs to be indented. 269 flatten : bool (default: False) 270 Whether to scrub existing indentation. If True, all lines will be 271 aligned to the same indentation. If False, existing indentation will 272 be strictly increased. 273 274 Returns 275 ------- 276 277 str|unicode : string indented by ntabs and nspaces. 278 279 """ 280 if instr is None: 281 return 282 ind = '\t'*ntabs+' '*nspaces 283 if flatten: 284 pat = re.compile(r'^\s*', re.MULTILINE) 285 else: 286 pat = re.compile(r'^', re.MULTILINE) 287 outstr = re.sub(pat, ind, instr) 288 if outstr.endswith(os.linesep+ind): 289 return outstr[:-len(ind)] 290 else: 291 return outstr 292 293 294def list_strings(arg): 295 """Always return a list of strings, given a string or list of strings 296 as input. 297 298 Examples 299 -------- 300 :: 301 302 In [7]: list_strings('A single string') 303 Out[7]: ['A single string'] 304 305 In [8]: list_strings(['A single string in a list']) 306 Out[8]: ['A single string in a list'] 307 308 In [9]: list_strings(['A','list','of','strings']) 309 Out[9]: ['A', 'list', 'of', 'strings'] 310 """ 311 312 if isinstance(arg, py3compat.string_types): return [arg] 313 else: return arg 314 315 316def marquee(txt='',width=78,mark='*'): 317 """Return the input string centered in a 'marquee'. 318 319 Examples 320 -------- 321 :: 322 323 In [16]: marquee('A test',40) 324 Out[16]: '**************** A test ****************' 325 326 In [17]: marquee('A test',40,'-') 327 Out[17]: '---------------- A test ----------------' 328 329 In [18]: marquee('A test',40,' ') 330 Out[18]: ' A test ' 331 332 """ 333 if not txt: 334 return (mark*width)[:width] 335 nmark = (width-len(txt)-2)//len(mark)//2 336 if nmark < 0: nmark =0 337 marks = mark*nmark 338 return '%s %s %s' % (marks,txt,marks) 339 340 341ini_spaces_re = re.compile(r'^(\s+)') 342 343def num_ini_spaces(strng): 344 """Return the number of initial spaces in a string""" 345 346 ini_spaces = ini_spaces_re.match(strng) 347 if ini_spaces: 348 return ini_spaces.end() 349 else: 350 return 0 351 352 353def format_screen(strng): 354 """Format a string for screen printing. 355 356 This removes some latex-type format codes.""" 357 # Paragraph continue 358 par_re = re.compile(r'\\$',re.MULTILINE) 359 strng = par_re.sub('',strng) 360 return strng 361 362 363def dedent(text): 364 """Equivalent of textwrap.dedent that ignores unindented first line. 365 366 This means it will still dedent strings like: 367 '''foo 368 is a bar 369 ''' 370 371 For use in wrap_paragraphs. 372 """ 373 374 if text.startswith('\n'): 375 # text starts with blank line, don't ignore the first line 376 return textwrap.dedent(text) 377 378 # split first line 379 splits = text.split('\n',1) 380 if len(splits) == 1: 381 # only one line 382 return textwrap.dedent(text) 383 384 first, rest = splits 385 # dedent everything but the first line 386 rest = textwrap.dedent(rest) 387 return '\n'.join([first, rest]) 388 389 390def wrap_paragraphs(text, ncols=80): 391 """Wrap multiple paragraphs to fit a specified width. 392 393 This is equivalent to textwrap.wrap, but with support for multiple 394 paragraphs, as separated by empty lines. 395 396 Returns 397 ------- 398 399 list of complete paragraphs, wrapped to fill `ncols` columns. 400 """ 401 paragraph_re = re.compile(r'\n(\s*\n)+', re.MULTILINE) 402 text = dedent(text).strip() 403 paragraphs = paragraph_re.split(text)[::2] # every other entry is space 404 out_ps = [] 405 indent_re = re.compile(r'\n\s+', re.MULTILINE) 406 for p in paragraphs: 407 # presume indentation that survives dedent is meaningful formatting, 408 # so don't fill unless text is flush. 409 if indent_re.search(p) is None: 410 # wrap paragraph 411 p = textwrap.fill(p, ncols) 412 out_ps.append(p) 413 return out_ps 414 415 416def long_substr(data): 417 """Return the longest common substring in a list of strings. 418 419 Credit: http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python 420 """ 421 substr = '' 422 if len(data) > 1 and len(data[0]) > 0: 423 for i in range(len(data[0])): 424 for j in range(len(data[0])-i+1): 425 if j > len(substr) and all(data[0][i:i+j] in x for x in data): 426 substr = data[0][i:i+j] 427 elif len(data) == 1: 428 substr = data[0] 429 return substr 430 431 432def strip_email_quotes(text): 433 """Strip leading email quotation characters ('>'). 434 435 Removes any combination of leading '>' interspersed with whitespace that 436 appears *identically* in all lines of the input text. 437 438 Parameters 439 ---------- 440 text : str 441 442 Examples 443 -------- 444 445 Simple uses:: 446 447 In [2]: strip_email_quotes('> > text') 448 Out[2]: 'text' 449 450 In [3]: strip_email_quotes('> > text\\n> > more') 451 Out[3]: 'text\\nmore' 452 453 Note how only the common prefix that appears in all lines is stripped:: 454 455 In [4]: strip_email_quotes('> > text\\n> > more\\n> more...') 456 Out[4]: '> text\\n> more\\nmore...' 457 458 So if any line has no quote marks ('>') , then none are stripped from any 459 of them :: 460 461 In [5]: strip_email_quotes('> > text\\n> > more\\nlast different') 462 Out[5]: '> > text\\n> > more\\nlast different' 463 """ 464 lines = text.splitlines() 465 matches = set() 466 for line in lines: 467 prefix = re.match(r'^(\s*>[ >]*)', line) 468 if prefix: 469 matches.add(prefix.group(1)) 470 else: 471 break 472 else: 473 prefix = long_substr(list(matches)) 474 if prefix: 475 strip = len(prefix) 476 text = '\n'.join([ ln[strip:] for ln in lines]) 477 return text 478 479def strip_ansi(source): 480 """ 481 Remove ansi escape codes from text. 482 483 Parameters 484 ---------- 485 source : str 486 Source to remove the ansi from 487 """ 488 return re.sub(r'\033\[(\d|;)+?m', '', source) 489 490 491class EvalFormatter(Formatter): 492 """A String Formatter that allows evaluation of simple expressions. 493 494 Note that this version interprets a : as specifying a format string (as per 495 standard string formatting), so if slicing is required, you must explicitly 496 create a slice. 497 498 This is to be used in templating cases, such as the parallel batch 499 script templates, where simple arithmetic on arguments is useful. 500 501 Examples 502 -------- 503 :: 504 505 In [1]: f = EvalFormatter() 506 In [2]: f.format('{n//4}', n=8) 507 Out[2]: '2' 508 509 In [3]: f.format("{greeting[slice(2,4)]}", greeting="Hello") 510 Out[3]: 'll' 511 """ 512 def get_field(self, name, args, kwargs): 513 v = eval(name, kwargs) 514 return v, name 515 516#XXX: As of Python 3.4, the format string parsing no longer splits on a colon 517# inside [], so EvalFormatter can handle slicing. Once we only support 3.4 and 518# above, it should be possible to remove FullEvalFormatter. 519 520@skip_doctest_py3 521class FullEvalFormatter(Formatter): 522 """A String Formatter that allows evaluation of simple expressions. 523 524 Any time a format key is not found in the kwargs, 525 it will be tried as an expression in the kwargs namespace. 526 527 Note that this version allows slicing using [1:2], so you cannot specify 528 a format string. Use :class:`EvalFormatter` to permit format strings. 529 530 Examples 531 -------- 532 :: 533 534 In [1]: f = FullEvalFormatter() 535 In [2]: f.format('{n//4}', n=8) 536 Out[2]: u'2' 537 538 In [3]: f.format('{list(range(5))[2:4]}') 539 Out[3]: u'[2, 3]' 540 541 In [4]: f.format('{3*2}') 542 Out[4]: u'6' 543 """ 544 # copied from Formatter._vformat with minor changes to allow eval 545 # and replace the format_spec code with slicing 546 def vformat(self, format_string, args, kwargs): 547 result = [] 548 for literal_text, field_name, format_spec, conversion in \ 549 self.parse(format_string): 550 551 # output the literal text 552 if literal_text: 553 result.append(literal_text) 554 555 # if there's a field, output it 556 if field_name is not None: 557 # this is some markup, find the object and do 558 # the formatting 559 560 if format_spec: 561 # override format spec, to allow slicing: 562 field_name = ':'.join([field_name, format_spec]) 563 564 # eval the contents of the field for the object 565 # to be formatted 566 obj = eval(field_name, kwargs) 567 568 # do any conversion on the resulting object 569 obj = self.convert_field(obj, conversion) 570 571 # format the object and append to the result 572 result.append(self.format_field(obj, '')) 573 574 return u''.join(py3compat.cast_unicode(s) for s in result) 575 576 577@skip_doctest_py3 578class DollarFormatter(FullEvalFormatter): 579 """Formatter allowing Itpl style $foo replacement, for names and attribute 580 access only. Standard {foo} replacement also works, and allows full 581 evaluation of its arguments. 582 583 Examples 584 -------- 585 :: 586 587 In [1]: f = DollarFormatter() 588 In [2]: f.format('{n//4}', n=8) 589 Out[2]: u'2' 590 591 In [3]: f.format('23 * 76 is $result', result=23*76) 592 Out[3]: u'23 * 76 is 1748' 593 594 In [4]: f.format('$a or {b}', a=1, b=2) 595 Out[4]: u'1 or 2' 596 """ 597 _dollar_pattern = re.compile("(.*?)\$(\$?[\w\.]+)") 598 def parse(self, fmt_string): 599 for literal_txt, field_name, format_spec, conversion \ 600 in Formatter.parse(self, fmt_string): 601 602 # Find $foo patterns in the literal text. 603 continue_from = 0 604 txt = "" 605 for m in self._dollar_pattern.finditer(literal_txt): 606 new_txt, new_field = m.group(1,2) 607 # $$foo --> $foo 608 if new_field.startswith("$"): 609 txt += new_txt + new_field 610 else: 611 yield (txt + new_txt, new_field, "", None) 612 txt = "" 613 continue_from = m.end() 614 615 # Re-yield the {foo} style pattern 616 yield (txt + literal_txt[continue_from:], field_name, format_spec, conversion) 617 618#----------------------------------------------------------------------------- 619# Utils to columnize a list of string 620#----------------------------------------------------------------------------- 621 622def _col_chunks(l, max_rows, row_first=False): 623 """Yield successive max_rows-sized column chunks from l.""" 624 if row_first: 625 ncols = (len(l) // max_rows) + (len(l) % max_rows > 0) 626 for i in py3compat.xrange(ncols): 627 yield [l[j] for j in py3compat.xrange(i, len(l), ncols)] 628 else: 629 for i in py3compat.xrange(0, len(l), max_rows): 630 yield l[i:(i + max_rows)] 631 632 633def _find_optimal(rlist, row_first=False, separator_size=2, displaywidth=80): 634 """Calculate optimal info to columnize a list of string""" 635 for max_rows in range(1, len(rlist) + 1): 636 col_widths = list(map(max, _col_chunks(rlist, max_rows, row_first))) 637 sumlength = sum(col_widths) 638 ncols = len(col_widths) 639 if sumlength + separator_size * (ncols - 1) <= displaywidth: 640 break 641 return {'num_columns': ncols, 642 'optimal_separator_width': (displaywidth - sumlength) / (ncols - 1) if (ncols - 1) else 0, 643 'max_rows': max_rows, 644 'column_widths': col_widths 645 } 646 647 648def _get_or_default(mylist, i, default=None): 649 """return list item number, or default if don't exist""" 650 if i >= len(mylist): 651 return default 652 else : 653 return mylist[i] 654 655 656def compute_item_matrix(items, row_first=False, empty=None, *args, **kwargs) : 657 """Returns a nested list, and info to columnize items 658 659 Parameters 660 ---------- 661 662 items 663 list of strings to columize 664 row_first : (default False) 665 Whether to compute columns for a row-first matrix instead of 666 column-first (default). 667 empty : (default None) 668 default value to fill list if needed 669 separator_size : int (default=2) 670 How much caracters will be used as a separation between each columns. 671 displaywidth : int (default=80) 672 The width of the area onto wich the columns should enter 673 674 Returns 675 ------- 676 677 strings_matrix 678 679 nested list of string, the outer most list contains as many list as 680 rows, the innermost lists have each as many element as colums. If the 681 total number of elements in `items` does not equal the product of 682 rows*columns, the last element of some lists are filled with `None`. 683 684 dict_info 685 some info to make columnize easier: 686 687 num_columns 688 number of columns 689 max_rows 690 maximum number of rows (final number may be less) 691 column_widths 692 list of with of each columns 693 optimal_separator_width 694 best separator width between columns 695 696 Examples 697 -------- 698 :: 699 700 In [1]: l = ['aaa','b','cc','d','eeeee','f','g','h','i','j','k','l'] 701 ...: compute_item_matrix(l, displaywidth=12) 702 Out[1]: 703 ([['aaa', 'f', 'k'], 704 ['b', 'g', 'l'], 705 ['cc', 'h', None], 706 ['d', 'i', None], 707 ['eeeee', 'j', None]], 708 {'num_columns': 3, 709 'column_widths': [5, 1, 1], 710 'optimal_separator_width': 2, 711 'max_rows': 5}) 712 """ 713 info = _find_optimal(list(map(len, items)), row_first, *args, **kwargs) 714 nrow, ncol = info['max_rows'], info['num_columns'] 715 if row_first: 716 return ([[_get_or_default(items, r * ncol + c, default=empty) for c in range(ncol)] for r in range(nrow)], info) 717 else: 718 return ([[_get_or_default(items, c * nrow + r, default=empty) for c in range(ncol)] for r in range(nrow)], info) 719 720 721def columnize(items, row_first=False, separator=' ', displaywidth=80, spread=False): 722 """ Transform a list of strings into a single string with columns. 723 724 Parameters 725 ---------- 726 items : sequence of strings 727 The strings to process. 728 729 row_first : (default False) 730 Whether to compute columns for a row-first matrix instead of 731 column-first (default). 732 733 separator : str, optional [default is two spaces] 734 The string that separates columns. 735 736 displaywidth : int, optional [default is 80] 737 Width of the display in number of characters. 738 739 Returns 740 ------- 741 The formatted string. 742 """ 743 if not items: 744 return '\n' 745 matrix, info = compute_item_matrix(items, row_first=row_first, separator_size=len(separator), displaywidth=displaywidth) 746 if spread: 747 separator = separator.ljust(int(info['optimal_separator_width'])) 748 fmatrix = [filter(None, x) for x in matrix] 749 sjoin = lambda x : separator.join([ y.ljust(w, ' ') for y, w in zip(x, info['column_widths'])]) 750 return '\n'.join(map(sjoin, fmatrix))+'\n' 751 752 753def get_text_list(list_, last_sep=' and ', sep=", ", wrap_item_with=""): 754 """ 755 Return a string with a natural enumeration of items 756 757 >>> get_text_list(['a', 'b', 'c', 'd']) 758 'a, b, c and d' 759 >>> get_text_list(['a', 'b', 'c'], ' or ') 760 'a, b or c' 761 >>> get_text_list(['a', 'b', 'c'], ', ') 762 'a, b, c' 763 >>> get_text_list(['a', 'b'], ' or ') 764 'a or b' 765 >>> get_text_list(['a']) 766 'a' 767 >>> get_text_list([]) 768 '' 769 >>> get_text_list(['a', 'b'], wrap_item_with="`") 770 '`a` and `b`' 771 >>> get_text_list(['a', 'b', 'c', 'd'], " = ", sep=" + ") 772 'a + b + c = d' 773 """ 774 if len(list_) == 0: 775 return '' 776 if wrap_item_with: 777 list_ = ['%s%s%s' % (wrap_item_with, item, wrap_item_with) for 778 item in list_] 779 if len(list_) == 1: 780 return list_[0] 781 return '%s%s%s' % ( 782 sep.join(i for i in list_[:-1]), 783 last_sep, list_[-1]) 784