"""Text wrapping and filling.
"""

# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <gward@python.net>

import re

__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that
# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
_whitespace = '\t\n\x0b\x0c\r '


class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 0 .. 'tabsize' spaces, depending on its position
        in its line.  If false, each tab is treated as a single character.
      tabsize (default: 8)
        Expand tabs in input text to 0 .. 'tabsize' spaces, unless
        'expand_tabs' is false.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words.  If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
      max_lines (default: None)
        Truncate wrapped lines.
      placeholder (default: ' [...]')
        Append to the last line of truncated text.
    """

    # Translation table (for str.translate) mapping every recognized
    # whitespace character to a plain space.
    uspace = ord(' ')
    unicode_whitespace_trans = dict.fromkeys(map(ord, _whitespace), uspace)

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    word_punct = r'[\w!"\'&.,?]'
    letter = r'[^\d\W]'
    whitespace = r'[%s]' % re.escape(_whitespace)
    nowhitespace = '[^' + whitespace[1:]
    # NOTE: this pattern is compiled with re.VERBOSE, so the line breaks
    # matter -- each '#' comment runs to the end of its line.
    wordsep_re = re.compile(r'''
        ( # any whitespace
          %(ws)s+
        | # em-dash between words
          (?<=%(wp)s) -{2,} (?=\w)
        | # word, possibly hyphenated
          %(nws)s+? (?:
            # hyphenated word
              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
              (?= %(lt)s -? %(lt)s)
            | # end of word
              (?=%(ws)s|\Z)
            | # em-dash
              (?<=%(wp)s) (?=-{2,}\w)
            )
        )''' % {'wp': word_punct, 'lt': letter,
                'ws': whitespace, 'nws': nowhitespace},
        re.VERBOSE)
    del word_punct, letter, nowhitespace

    # This less funky little regex just split on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
    del whitespace

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z')               # end of chunk

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True,
                 tabsize=8,
                 *,
                 max_lines=None,
                 placeholder=' [...]'):
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens
        self.tabsize = tabsize
        self.max_lines = max_lines
        self.placeholder = placeholder

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs(self.tabsize)
        if self.replace_whitespace:
            text = text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is true, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        # Truthiness (not identity with True) so that this agrees with
        # the break_on_hyphens test in _handle_long_word().
        if self.break_on_hyphens:
            chunks = self.wordsep_re.split(text)
        else:
            chunks = self.wordsep_simple_re.split(text)
        # re.split() with a capturing group yields empty strings between
        # adjacent matches; discard them.
        return [c for c in chunks if c]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                # Two spaces after the sentence-ending chunk.
                chunks[i+1] = "  "
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  'reversed_chunks' and 'cur_line'
        are modified in place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            end = space_left
            chunk = reversed_chunks[-1]
            if self.break_on_hyphens and len(chunk) > space_left:
                # break after last hyphen, but only if there are
                # non-hyphens before it
                hyphen = chunk.rfind('-', 0, space_left)
                if hyphen > 0 and any(c != '-' for c in chunk[:hyphen]):
                    end = hyphen + 1
            cur_line.append(chunk[:end])
            reversed_chunks[-1] = chunk[end:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        Raises ValueError if 'self.width' is not positive, or if
        'max_lines' is set and the placeholder cannot fit in 'width'.
        Note that 'chunks' is reversed and consumed in place.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)
        if self.max_lines is not None:
            if self.max_lines > 1:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            if len(indent) + len(self.placeholder.lstrip()) > self.width:
                raise ValueError("placeholder too large for max width")

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)
                cur_len = sum(map(len, cur_line))

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                cur_len -= len(cur_line[-1])
                del cur_line[-1]

            if cur_line:
                if (self.max_lines is None or
                    len(lines) + 1 < self.max_lines or
                    (not chunks or
                     self.drop_whitespace and
                     len(chunks) == 1 and
                     not chunks[0].strip()) and cur_len <= width):
                    # Convert current line back to a string and store it in
                    # list of all lines (return value).
                    lines.append(indent + ''.join(cur_line))
                else:
                    # This is the last permitted line and text remains:
                    # drop trailing chunks until the placeholder fits.
                    while cur_line:
                        if (cur_line[-1].strip() and
                                cur_len + len(self.placeholder) <= width):
                            cur_line.append(self.placeholder)
                            lines.append(indent + ''.join(cur_line))
                            break
                        cur_len -= len(cur_line[-1])
                        del cur_line[-1]
                    else:
                        # Nothing on this line can carry the placeholder;
                        # try to attach it to the previous line instead.
                        if lines:
                            prev_line = lines[-1].rstrip()
                            if (len(prev_line) + len(self.placeholder) <=
                                    self.width):
                                lines[-1] = prev_line + self.placeholder
                                break
                        lines.append(indent + self.placeholder.lstrip())
                    break

        return lines

    def _split_chunks(self, text):
        """Normalize whitespace in 'text' and split it into chunks."""
        text = self._munge_whitespace(text)
        return self._split(text)

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        chunks = self._split_chunks(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))


# -- Convenience interface ---------------------------------------------

def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    Reformat the single paragraph in 'text' so it fits in lines of no
    more than 'width' columns, and return a list of wrapped lines.  By
    default, tabs in 'text' are expanded with string.expandtabs(), and
    all other whitespace characters (including newline) are converted to
    space.  See TextWrapper class for available keyword args to customize
    wrapping behaviour.
    """
    w = TextWrapper(width=width, **kwargs)
    return w.wrap(text)


def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    Reformat the single paragraph in 'text' to fit in lines of no more
    than 'width' columns, and return a new string containing the entire
    wrapped paragraph.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  See TextWrapper class for
    available keyword args to customize wrapping behaviour.
    """
    w = TextWrapper(width=width, **kwargs)
    return w.fill(text)


def shorten(text, width, **kwargs):
    """Collapse and truncate the given text to fit in the given width.

    The text first has its whitespace collapsed.  If it then fits in
    the *width*, it is returned as is.  Otherwise, as many words
    as possible are joined and then the placeholder is appended::

        >>> textwrap.shorten("Hello  world!", width=12)
        'Hello world!'
        >>> textwrap.shorten("Hello  world!", width=11)
        'Hello [...]'
    """
    w = TextWrapper(width=width, max_lines=1, **kwargs)
    # str.split() with no argument already ignores leading and trailing
    # whitespace, so no separate strip() is needed.
    return w.fill(' '.join(text.split()))


# -- Loosely related functionality -------------------------------------

_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)


def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.

    Entirely blank lines are normalized to a newline character.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for line_indent in indents:
        if margin is None:
            margin = line_indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif line_indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(line_indent):
            margin = line_indent

        # Find the largest common whitespace between current line and previous
        # winner.
        else:
            for i, (x, y) in enumerate(zip(margin, line_indent)):
                if x != y:
                    margin = margin[:i]
                    break

    if margin:
        # margin holds only spaces and tabs, so it is safe to embed in
        # the pattern without escaping.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text


def indent(text, prefix, predicate=None):
    """Adds 'prefix' to the beginning of selected lines in 'text'.

    If 'predicate' is provided, 'prefix' will only be added to the lines
    where 'predicate(line)' is True. If 'predicate' is not provided,
    it will default to adding 'prefix' to all non-empty lines that do not
    consist solely of whitespace characters.
    """
    if predicate is None:
        def predicate(line):
            return line.strip()

    def prefixed_lines():
        # splitlines(True) keeps the line endings, so joining the pieces
        # reproduces the original text apart from the added prefixes.
        for line in text.splitlines(True):
            yield (prefix + line if predicate(line) else line)
    return ''.join(prefixed_lines())


if __name__ == "__main__":
    print(dedent("Hello there.\n This is indented."))