1# -*- coding: utf-8 -*- 2from __future__ import unicode_literals 3 4__copyright__ = """ 5Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ 6Copyright (c) 2010, Kurt Raschke <kurt@kurtraschke.com> 7Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ 8Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/ 9 10Original PHP Version: 11Copyright (c) 2003-2004, Dean Allen <dean@textism.com> 12All rights reserved. 13 14Thanks to Carlo Zottmann <carlo@g-blog.net> for refactoring 15Textile's procedural code into a class framework 16 17Additions and fixes Copyright (c) 2006 Alex Shiels http://thresholdstate.com/ 18 19""" 20import uuid 21from urllib.parse import urlparse, urlsplit, urlunsplit, quote, unquote 22from collections import OrderedDict 23 24from textile.tools import sanitizer, imagesize 25from textile.regex_strings import (align_re_s, cls_re_s, pnct_re_s, 26 regex_snippets, syms_re_s, table_span_re_s) 27from textile.utils import (decode_high, encode_high, encode_html, generate_tag, 28 has_raw_text, is_rel_url, is_valid_url, list_type, normalize_newlines, 29 parse_attributes, pba) 30from textile.objects import Block, Table 31 32try: 33 import regex as re 34except ImportError: 35 import re 36 37 38class Textile(object): 39 restricted_url_schemes = ('http', 'https', 'ftp', 'mailto') 40 unrestricted_url_schemes = restricted_url_schemes + ('file', 'tel', 41 'callto', 'sftp', 'data') 42 43 btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', r'fn\d+', 'p', '###') 44 btag_lite = ('bq', 'bc', 'p') 45 46 note_index = 1 47 48 doctype_whitelist = ['xhtml', 'html5'] 49 50 glyph_definitions = { 51 'quote_single_open': '‘', 52 'quote_single_close': '’', 53 'quote_double_open': '“', 54 'quote_double_close': '”', 55 'apostrophe': '’', 56 'prime': '′', 57 'prime_double': '″', 58 'ellipsis': '…', 59 'ampersand': '&', 60 'emdash': '—', 61 'endash': '–', 62 'dimension': '×', 63 'trademark': '™', 64 'registered': '®', 65 'copyright': '©', 66 'half': '½', 67 
        'quarter': '¼',
        'threequarters': '¾',
        'degrees': '°',
        'plusminus': '±',
    }

    def __init__(self, restricted=False, lite=False, noimage=False,
                 get_sizes=False, html_type='xhtml', rel='', block_tags=True):
        """Textile properties that are common to regular textile and
        textile_restricted"""
        self.restricted = restricted
        self.lite = lite
        self.noimage = noimage
        self.get_sizes = get_sizes
        self.fn = {}
        self.urlrefs = {}
        self.shelf = {}
        self.rel = rel
        self.html_type = html_type
        self.max_span_depth = 5
        self.span_depth = 0
        # Unique run identifier; used to build collision-free placeholder
        # tokens that are shelved during processing and retrieved at the end.
        uid = uuid.uuid4().hex
        self.uid = 'textileRef:{0}:'.format(uid)
        self.linkPrefix = '{0}-'.format(uid)
        self.linkIndex = 0
        self.refCache = {}
        self.refIndex = 0
        self.block_tags = block_tags

        cur = r''
        if regex_snippets['cur']:  # pragma: no branch
            cur = r'(?:[{0}]{1}*)?'.format(regex_snippets['cur'],
                    regex_snippets['space'])

        # We'll be searching for characters that need to be HTML-encoded to
        # produce properly valid html.  These are the defaults that work in
        # most cases.  Below, we'll copy this and modify the necessary pieces
        # to make it work for characters at the beginning of the string.
        self.glyph_search = [
            # apostrophe's
            re.compile(r"(^|{0}|\))'({0})".format(regex_snippets['wrd']),
                flags=re.U),
            # back in '88
            re.compile(r"({0})'(\d+{1}?)\b(?![.]?[{1}]*?')".format(
                regex_snippets['space'], regex_snippets['wrd']),
                flags=re.U),
            # single opening following an open bracket.
            re.compile(r"([([{])'(?=\S)", flags=re.U),
            # single closing
            re.compile(r"(^|\S)'(?={0}|{1}|<|$)".format(
                regex_snippets['space'], pnct_re_s), flags=re.U),
            # single opening
            re.compile(r"'", re.U),
            # double opening following an open bracket.
            # Allows things like Hello ["(Mum) & dad"]
            re.compile(r'([([{])"(?=\S)', flags=re.U),
            # double closing
            re.compile(r'(^|\S)"(?={0}|{1}|<|$)'.format(
                regex_snippets['space'], pnct_re_s), re.U),
            # double opening
            re.compile(r'"'),
            # ellipsis
            re.compile(r'([^.]?)\.{3}'),
            # ampersand
            re.compile(r'(\s?)&(\s)', re.U),
            # em dash
            re.compile(r'(\s?)--(\s?)'),
            # en dash
            re.compile(r' - '),
            # dimension sign (e.g. 2 x 4 becomes 2 × 4)
            re.compile(r'([0-9]+[\])]?[\'"]? ?)[x]( ?[\[(]?)'
                r'(?=[+-]?{0}[0-9]*\.?[0-9]+)'.format(cur), flags=re.I | re.U),
            # trademark
            re.compile(r'(\b ?|{0}|^)[([]TM[])]'.format(regex_snippets['space']
                ), flags=re.I | re.U),
            # registered
            re.compile(r'(\b ?|{0}|^)[([]R[])]'.format(regex_snippets['space']
                ), flags=re.I | re.U),
            # copyright
            re.compile(r'(\b ?|{0}|^)[([]C[])]'.format(regex_snippets['space']
                ), flags=re.I | re.U),
            # 1/2
            re.compile(r'[([]1\/2[])]'),
            # 1/4
            re.compile(r'[([]1\/4[])]'),
            # 3/4
            re.compile(r'[([]3\/4[])]'),
            # degrees
            re.compile(r'[([]o[])]'),
            # plus/minus
            re.compile(r'[([]\+\/-[])]'),
            # 3+ uppercase acronym
            re.compile(r'\b([{0}][{1}]{{2,}})\b(?:[(]([^)]*)[)])'.format(
                regex_snippets['abr'], regex_snippets['acr']), flags=re.U),
            # 3+ uppercase
            re.compile(r'({space}|^|[>(;-])([{abr}]{{3,}})([{nab}]*)'
                '(?={space}|{pnct}|<|$)(?=[^">]*?(<|$))'.format(**{ 'space':
                    regex_snippets['space'], 'abr': regex_snippets['abr'],
                    'nab': regex_snippets['nab'], 'pnct': pnct_re_s}), re.U),
        ]
        # These are the changes that need to be made for characters that occur
        # at the beginning of the string.
        self.glyph_search_initial = list(self.glyph_search)
        # apostrophe's
        self.glyph_search_initial[0] = re.compile(r"({0}|\))'({0})".format(
            regex_snippets['wrd']), flags=re.U)
        # single closing
        self.glyph_search_initial[3] = re.compile(r"(\S)'(?={0}|{1}|$)".format(
            regex_snippets['space'], pnct_re_s), re.U)
        # double closing
        self.glyph_search_initial[6] = re.compile(r'(\S)"(?={0}|{1}|<|$)'.format(
            regex_snippets['space'], pnct_re_s), re.U)

        # Replacement templates, index-aligned with glyph_search above.
        self.glyph_replace = [x.format(**self.glyph_definitions) for x in (
            r'\1{apostrophe}\2',            # apostrophe's
            r'\1{apostrophe}\2',            # back in '88
            r'\1{quote_single_open}',       # single opening after bracket
            r'\1{quote_single_close}',      # single closing
            r'{quote_single_open}',         # single opening
            r'\1{quote_double_open}',       # double opening after bracket
            r'\1{quote_double_close}',      # double closing
            r'{quote_double_open}',         # double opening
            r'\1{ellipsis}',                # ellipsis
            r'\1{ampersand}\2',             # ampersand
            r'\1{emdash}\2',                # em dash
            r' {endash} ',                  # en dash
            r'\1{dimension}\2',             # dimension sign
            r'\1{trademark}',               # trademark
            r'\1{registered}',              # registered
            r'\1{copyright}',               # copyright
            r'{half}',                      # 1/2
            r'{quarter}',                   # 1/4
            r'{threequarters}',             # 3/4
            r'{degrees}',                   # degrees
            r'{plusminus}',                 # plus/minus
            r'<acronym title="\2">\1</acronym>',  # 3+ uppercase acronym
            r'\1<span class="caps">{0}:glyph:\2'  # 3+ uppercase
            r'</span>\3'.format(self.uid),
        )]

        # html5 uses <abbr> instead of the deprecated <acronym>.
        if self.html_type == 'html5':
            self.glyph_replace[21] = r'<abbr title="\2">\1</abbr>'

        if self.restricted is True:
            self.url_schemes = self.restricted_url_schemes
        else:
            self.url_schemes = self.unrestricted_url_schemes

        all_schemes_re_s = '|'.join([
            '(?:{0})'.format(scheme)
            for scheme in self.url_schemes
        ])
        # Matches reference definitions of the form: [flag]scheme://url
        self.url_ref_regex = re.compile(
            r'(?:(?<=^)|(?<=\s))\[(.+)\]\s?((?:{0}:\/\/|\/)\S+)(?=\s|$)'.format(all_schemes_re_s),
            re.U
        )

    def parse(self, text, rel=None, sanitize=False):
        """Parse the input text as textile and return html output."""
        self.notes = OrderedDict()
        self.unreferencedNotes = OrderedDict()
        self.notelist_cache = OrderedDict()

        if text.strip() == '':
            return text

        if self.restricted:
            text = encode_html(text, quotes=False)

        text = normalize_newlines(text)
        # strip any pre-existing placeholder tokens so input can't spoof them
        text = text.replace(self.uid, '')

        if self.block_tags:
            if self.lite:
                self.blocktag_whitelist = ['bq', 'p']
                text = self.block(text)
            else:
                self.blocktag_whitelist = [ 'bq', 'p', 'bc', 'notextile',
                        'pre', 'h[1-6]',
                        'fn{0}+'.format(regex_snippets['digit']), '###']
                text = self.block(text)
                text = self.placeNoteLists(text)
        else:
            # Inline markup (em, strong, sup, sub, del etc).
            text = self.span(text)

            # Glyph level substitutions (mainly typographic -- " & ' => curly
            # quotes, -- => em-dash etc.
            text = self.glyphs(text)

        if rel:
            self.rel = ' rel="{0}"'.format(rel)

        text = self.getRefs(text)

        if not self.lite:
            text = self.placeNoteLists(text)
        text = self.retrieve(text)
        text = text.replace('{0}:glyph:'.format(self.uid), '')

        if sanitize:
            text = sanitizer.sanitize(text)

        text = self.retrieveURLs(text)

        # if the text contains a break tag (<br> or <br />) not followed by
        # a newline, replace it with a new style break tag and a newline.
        text = re.sub(r'<br( /)?>(?!\n)', '<br />\n', text)

        text = text.rstrip('\n')

        return text

    def table(self, text):
        """Convert a textile table block into an html table."""
        text = "{0}\n\n".format(text)
        pattern = re.compile(r'^(?:table(?P<tatts>_?{s}{a}{c})\.'
                r'(?P<summary>.*?)\n)?^(?P<rows>{a}{c}\.? ?\|.*\|)'
                r'[\s]*\n\n'.format(**{'s': table_span_re_s, 'a': align_re_s,
                    'c': cls_re_s}), flags=re.S | re.M | re.U)
        match = pattern.search(text)
        if match:
            table = Table(self, **match.groupdict())
            return table.process()
        return text

    def textileLists(self, text):
        """Find textile list blocks and hand them to fTextileList."""
        pattern = re.compile(r'^((?:[*;:]+|[*;:#]*#(?:_|\d+)?){0}[ .].*)$'
                r'(?![^#*;:])'.format(cls_re_s), re.U | re.M | re.S)
        return pattern.sub(self.fTextileList, text)

    def fTextileList(self, match):
        """Convert one matched textile list block into html list markup."""
        text = re.split(r'\n(?=[*#;:])', match.group(), flags=re.M)
        pt = ''
        result = []
        ls = OrderedDict()
        for i, line in enumerate(text):
            try:
                nextline = text[i + 1]
            except IndexError:
                nextline = ''

            m = re.search(r"^(?P<tl>[#*;:]+)(?P<st>_|\d+)?(?P<atts>{0})[ .]"
                    "(?P<content>.*)$".format(cls_re_s), line, re.S)
            if m:
                tl, start, atts, content = m.groups()
                content = content.strip()
            else:
                result.append(line)
                continue

            nl = ''
            ltype = list_type(tl)
            # definition-list markers map to dt/dd; everything else is li
            tl_tags = {';': 'dt', ':': 'dd'}
            litem = tl_tags.get(tl[0], 'li')

            showitem = len(content) > 0

            # handle list continuation/start attribute on ordered lists
            if ltype == 'o':
                if not hasattr(self, 'olstarts'):
                    self.olstarts = {tl: 1}

                # does the first line of this ol have a start attribute
                if len(tl) > len(pt):
                    # no, set it to 1.
                    if start is None:
                        self.olstarts[tl] = 1
                    # yes, set it to the given number
                    elif start != '_':
                        self.olstarts[tl] = int(start)
                    # we won't need to handle the '_' case, we'll just
                    # print out the number when it's needed

                # put together the start attribute if needed
                if len(tl) > len(pt) and start is not None:
                    start = ' start="{0}"'.format(self.olstarts[tl])

                # This will only increment the count for list items, not
                # definition items
                if showitem:
                    # Assume properly formatted input
                    try:
                        self.olstarts[tl] = self.olstarts[tl] + 1
                    # if we get here, we've got some poor textile formatting.
                    # add this type of list to olstarts and assume we'll start
                    # it at 1. expect screwy output.
                    except KeyError:
                        self.olstarts[tl] = 1

            # peek at the next line's list marker to know when to close tags
            nm = re.match(r"^(?P<nextlistitem>[#\*;:]+)(_|[\d]+)?{0}"
                    r"[ .].*".format(cls_re_s), nextline)
            if nm:
                nl = nm.group('nextlistitem')

            # We need to handle nested definition lists differently.  If
            # the next tag is a dt (';') of a lower nested level than the
            # current dd (':'),
            if ';' in pt and ':' in tl:
                ls[tl] = 2

            atts = pba(atts, restricted=self.restricted)
            tabs = '\t' * len(tl)
            # If start is still None, set it to '', else leave the value that
            # we've already formatted.
            start = start or ''
            # if this item tag isn't in the list, create a new list and
            # item, else just create the item
            if tl not in ls:
                ls[tl] = 1
                itemtag = ("\n{0}\t<{1}>{2}".format(tabs, litem, content) if
                        showitem else '')
                line = "<{0}l{1}{2}>{3}".format(ltype, atts, start, itemtag)
            else:
                line = ("\t<{0}{1}>{2}".format(litem, atts, content) if
                        showitem else '')
                line = '{0}{1}'.format(tabs, line)
            if len(nl) <= len(tl):
                if showitem:
                    line = "{0}</{1}>".format(line, litem)
                # work backward through the list closing nested lists/items
                for k, v in reversed(list(ls.items())):
                    if len(k) > len(nl):
                        if v != 2:
                            line = "{0}\n{1}</{2}l>".format(line, tabs,
                                    list_type(k))
                        if len(k) > 1 and v != 2:
                            line = "{0}</{1}>".format(line, litem)
                        del ls[k]
            # Remember the current Textile tag:
            pt = tl
            # This else exists in the original php version.  I'm not sure how
            # to come up with a case where the line would not match.  I think
            # it may have been necessary due to the way php returns matches.
400 # else: 401 #line = "{0}\n".format(line) 402 result.append(line) 403 return self.doTagBr(litem, "\n".join(result)) 404 405 def doTagBr(self, tag, input): 406 return re.compile(r'<({0})([^>]*?)>(.*)(</\1>)'.format(re.escape(tag)), 407 re.S).sub(self.doBr, input) 408 409 def doPBr(self, in_): 410 return re.compile(r'<(p)([^>]*?)>(.*)(</\1>)', re.S).sub(self.doBr, 411 in_) 412 413 def doBr(self, match): 414 content = re.sub(r'(.+)(?:(?<!<br>)|(?<!<br />))\n(?![#*;:\s|])', 415 r'\1<br />', match.group(3)) 416 return '<{0}{1}>{2}{3}'.format(match.group(1), match.group(2), content, 417 match.group(4)) 418 419 def block(self, text): 420 if not self.lite: 421 tre = '|'.join(self.btag) 422 else: 423 tre = '|'.join(self.btag_lite) 424 # split the text by two or more newlines, retaining the newlines in the 425 # split list 426 text = re.split(r'(\n{2,})', text) 427 428 # some blocks, when processed, will ask us to output nothing, if that's 429 # the case, we'd want to drop the whitespace which follows it. 430 eat_whitespace = False 431 432 # check to see if previous block has already been escaped 433 escaped = False 434 435 # check if multiline paragraph (p..) tags <p>..</p> are added to line 436 multiline_para = False 437 438 tag = 'p' 439 atts = cite = ext = '' 440 441 out = [] 442 443 for line in text: 444 # the line is just whitespace, add it to the output, and move on 445 if not line.strip(): 446 if not eat_whitespace: 447 out.append(line) 448 continue 449 450 eat_whitespace = False 451 452 pattern = (r'^(?P<tag>{0})(?P<atts>{1}{2})\.(?P<ext>\.?)' 453 r'(?::(?P<cite>\S+))? (?P<content>.*)$'.format(tre, 454 align_re_s, cls_re_s)) 455 match = re.search(pattern, line, flags=re.S | re.U) 456 # tag specified on this line. 
            if match:
                # if we had a previous extended tag but not this time, close up
                # the tag
                if ext and out:
                    # it's out[-2] because the last element in out is the
                    # whitespace that preceded this line
                    if not escaped:
                        content = encode_html(out[-2], quotes=True)
                        escaped = True
                    else:
                        content = out[-2]

                    if not multiline_para:
                        content = generate_tag(block.inner_tag, content,
                                block.inner_atts)
                    content = generate_tag(block.outer_tag, content,
                            block.outer_atts)
                    out[-2] = content
                tag, atts, ext, cite, content = match.groups()
                block = Block(self, **match.groupdict())
                inner_block = generate_tag(block.inner_tag, block.content,
                        block.inner_atts)
                # code tags and raw text won't be indented inside outer_tag.
                if block.inner_tag != 'code' and not has_raw_text(inner_block):
                    inner_block = "\n\t\t{0}\n\t".format(inner_block)
                if ext:
                    line = block.content
                else:
                    line = generate_tag(block.outer_tag, inner_block,
                            block.outer_atts)
                    # pre tags and raw text won't be indented.
                    if block.outer_tag != 'pre' and not has_raw_text(line):
                        line = "\t{0}".format(line)

                # set having paragraph tags to false
                if block.tag == 'p' and ext:
                    multiline_para = False
            # no tag specified
            else:
                # if we're inside an extended block, add the text from the
                # previous line to the front.
                if ext and out:
                    if block.tag == 'p':
                        line = generate_tag(block.tag, line, block.outer_atts)
                        multiline_para = True
                    line = '{0}{1}'.format(out.pop(), line)
                # the logic in the if statement below is a bit confusing in
                # php-textile.  I'm still not sure I understand what the php
                # code is doing.  Something tells me it's a phpsadness.
                # Anyway, this works, and is much easier to understand: if
                # we're not in an extension, and the line doesn't begin with a
                # space, treat it like a block to insert.
                # Lines that begin with a space are not processed as a block.
                if not ext and not line[0] == ' ':
                    block = Block(self, tag, atts, ext, cite, line)
                    # if the block contains html tags, generate_tag would
                    # mangle it, so process as is.
                    if block.tag == 'p' and not has_raw_text(block.content):
                        line = block.content
                    else:
                        line = generate_tag(block.outer_tag, block.content,
                                block.outer_atts)
                        line = "\t{0}".format(line)
                else:
                    if block.tag == 'pre' or block.inner_tag == 'code':
                        line = self.shelve(encode_html(line, quotes=True))
                    else:
                        line = self.graf(line)

                if block.tag == 'p':
                    escaped = True

                if block.tag == 'p' and ext and not multiline_para:
                    line = generate_tag(block.tag, line, block.outer_atts)
                    multiline_para = True
                else:
                    line = self.doPBr(line)
                if not block.tag == 'p':
                    multiline_para = False

                line = line.replace('<br>', '<br />')

            # if we're in an extended block, and we haven't specified a new
            # tag, join this line to the last item of the output
            if ext and not match:
                last_item = out.pop()
                out.append('{0}{1}'.format(last_item, line))
            elif not block.eat:
                # or if it's a type of block which indicates we shouldn't drop
                # it, add it to the output.
                out.append(line)

            if not ext:
                tag = 'p'
                atts = ''
                cite = ''

            # if it's a block we should drop, don't keep the whitespace which
            # will come after it.
            if block.eat:
                eat_whitespace = True

        # at this point, we've gone through all the lines.  if there's still an
        # extension in effect, we close it here
        if ext and out and not block.tag == 'p':
            block.content = out.pop()
            block.process()
            final = generate_tag(block.outer_tag, block.content,
                    block.outer_atts)
            out.append(final)
        return ''.join(out)

    def footnoteRef(self, text):
        """Replace footnote references like [1] with linked <sup> markers."""
        # somehow php-textile gets away with not capturing the space.
        return re.compile(r'(?<=\S)\[(?P<id>{0}+)(?P<nolink>!?)\]'
                r'(?P<space>{1}?)'.format(regex_snippets['digit'],
                    regex_snippets['space']), re.U).sub(self.footnoteID, text)

    def footnoteID(self, m):
        """Build the <sup><a>...</a></sup> marker for one footnote match."""
        fn_att = OrderedDict({'class': 'footnote'})
        if m.group('id') not in self.fn:
            # first reference to this footnote: assign it a link id and give
            # the marker a reverse anchor so the note can link back to it.
            self.fn[m.group('id')] = '{0}{1}'.format(self.linkPrefix,
                    self._increment_link_index())
            fnid = self.fn[m.group('id')]
            fn_att['id'] = 'fnrev{0}'.format(fnid)
        fnid = self.fn[m.group('id')]
        footref = generate_tag('a', m.group('id'), {'href': '#fn{0}'.format(
            fnid)})
        # a trailing '!' suppresses the hyperlink, leaving plain text
        if '!' == m.group('nolink'):
            footref = m.group('id')
        footref = generate_tag('sup', footref, fn_att)
        return '{0}{1}'.format(footref, m.group('space'))

    def glyphs(self, text):
        """
        Because of the split command, the regular expressions are different for
        when the text at the beginning and the rest of the text.
        for example:
        let's say the raw text provided is "*Here*'s some textile"
        before it gets to this glyphs method, the text has been converted to
        "<strong>Here</strong>'s some textile"
        When run through the split, we end up with ["<strong>", "Here",
        "</strong>", "'s some textile"].  The re.search that follows tells it
        not to ignore html tags.
        If the single quote is the first character on the line, it's an open
        single quote.  If it's the first character of one of those splits, it's
        an apostrophe or closed single quote, but the regex will bear that out.
        A similar situation occurs for double quotes as well.
        So, for the first pass, we use the glyph_search_initial set of
        regexes.
        For all remaining passes, we use glyph_search.
        """
        text = text.rstrip('\n')
        result = []
        searchlist = self.glyph_search_initial
        # split the text by any angle-bracketed tags
        for i, line in enumerate(re.compile(r'(<[\w\/!?].*?>)', re.U).split(
            text)):
            # even-indexed segments are text between tags; odd ones are the
            # tags themselves and are passed through untouched.
            if not i % 2:
                for s, r in zip(searchlist, self.glyph_replace):
                    line = s.sub(r, line)
            result.append(line)
            if i == 0:
                searchlist = self.glyph_search
        return ''.join(result)

    def getRefs(self, text):
        """Capture and store URL references in self.urlrefs."""
        return self.url_ref_regex.sub(self.refs, text)

    def refs(self, match):
        # store the reference and remove its definition from the text
        flag, url = match.groups()
        self.urlrefs[flag] = url
        return ''

    def relURL(self, url):
        """Return '#' for urls with a disallowed scheme, else the url."""
        scheme = urlparse(url)[0]
        if scheme and scheme not in self.url_schemes:
            return '#'
        return url

    def shelve(self, text):
        """Store text under a unique placeholder token and return the token."""
        self.refIndex = self.refIndex + 1
        itemID = '{0}{1}:shelve'.format(self.uid, self.refIndex)
        self.shelf[itemID] = text
        return itemID

    def retrieve(self, text):
        """Replace shelved placeholder tokens with their stored text,
        repeating until no replacement changes the text (shelved items may
        themselves contain tokens)."""
        while True:
            old = text
            for k, v in self.shelf.items():
                text = text.replace(k, v)
            if text == old:
                break
        return text

    def graf(self, text):
        """Run all inline (phrase-level) transformations on one paragraph."""
        if not self.lite:
            text = self.noTextile(text)
            text = self.code(text)

        text = self.getHTMLComments(text)

        text = self.getRefs(text)
        text = self.links(text)

        if not self.noimage:
            text = self.image(text)

        if not self.lite:
            text = self.table(text)
            text = self.redcloth_list(text)
            text = self.textileLists(text)

        text = self.span(text)
        text = self.footnoteRef(text)
        text = self.noteRef(text)
        text = self.glyphs(text)

        return text.rstrip('\n')

    def links(self, text):
        """For some reason, the part of the regex below that matches the url
        does not match a trailing parenthesis.
        It gets caught by tail, and we check later to see if it should be
        included as part of the url."""
        text = self.markStartOfLinks(text)

        return self.replaceLinks(text)

    def markStartOfLinks(self, text):
        """Finds and marks the start of well formed links in the input text."""
        # Slice text on '":<not space>' boundaries.  These always occur in
        # inline links between the link text and the url part and are much more
        # infrequent than '"' characters so we have less possible links to
        # process.
        slice_re = re.compile(r'":(?={0})'.format(regex_snippets['char']))
        slices = slice_re.split(text)
        output = []

        if len(slices) > 1:
            # There are never any start of links in the last slice, so pop it
            # off (we'll glue it back later).
            last_slice = slices.pop()

            for s in slices:
                # If there is no possible start quote then this slice is not
                # a link
                if '"' not in s:
                    output.append(s)
                    continue
                # Cut this slice into possible starting points wherever we find
                # a '"' character.  Any of these parts could represent the
                # start of the link text - we have to find which one.
                possible_start_quotes = s.split('"')

                # Start our search for the start of the link with the closest
                # prior quote mark.
                possibility = possible_start_quotes.pop()

                # Init the balanced count.  If this is still zero at the end of
                # our do loop we'll mark the " that caused it to balance as the
                # start of the link and move on to the next slice.
                balanced = 0
                linkparts = []
                i = 0

                while balanced != 0 or i == 0:  # pragma: no branch
                    # Starting at the end, pop off the previous part of the
                    # slice's fragments.

                    # Add this part to those parts that make up the link text.
                    linkparts.append(possibility)

                    if len(possibility) > 0:
                        # did this part inc or dec the balanced count?
                        if re.search(r'^\S|=$', possibility, flags=re.U):  # pragma: no branch
                            balanced = balanced - 1
                        if re.search(r'\S$', possibility, flags=re.U):  # pragma: no branch
                            balanced = balanced + 1
                        try:
                            possibility = possible_start_quotes.pop()
                        except IndexError:
                            break
                    else:
                        # If quotes occur next to each other, we get zero
                        # length strings.  eg. ...""Open the door,
                        # HAL!"":url...  In this case we count a zero length in
                        # the last position as a closing quote and others as
                        # opening quotes.
                        if i == 0:
                            balanced = balanced + 1
                        else:
                            balanced = balanced - 1
                        i = i + 1

                        try:
                            possibility = possible_start_quotes.pop()
                        except IndexError:  # pragma: no cover
                            # If out of possible starting segments we back the
                            # last one from the linkparts array
                            linkparts.pop()
                            break
                        # If the next possibility is empty or ends in a space
                        # we have a closing ".
                        if (possibility == '' or possibility.endswith(' ')):
                            # force search exit
                            balanced = 0;

                    if balanced <= 0:
                        possible_start_quotes.append(possibility)
                        break

                # Rebuild the link's text by reversing the parts and sticking
                # them back together with quotes.
                linkparts.reverse()
                link_content = '"'.join(linkparts)
                # Rebuild the remaining stuff that goes before the link but
                # that's already in order.
                pre_link = '"'.join(possible_start_quotes)
                # Re-assemble the link starts with a specific marker for the
                # next regex.
                o = '{0}{1}linkStartMarker:"{2}'.format(pre_link, self.uid,
                        link_content)
                output.append(o)

            # Add the last part back
            output.append(last_slice)
            # Re-assemble the full text with the start and end markers
            text = '":'.join(output)

        return text

    def replaceLinks(self, text):
        """Replaces links with tokens and stores them on the shelf."""
        stopchars = r"\s|^'\"*"
        pattern = r"""
            (?P<pre>\[)?          # Optionally open with a square bracket eg. Look ["here":url]
            {0}linkStartMarker:"  # marks start of the link
            (?P<inner>(?:.|\n)*?) # grab the content of the inner "..." part of the link, can be anything but
                                  # do not worry about matching class, id, lang or title yet
            ":                    # literal ": marks end of atts + text + title block
            (?P<urlx>[^{1}]*)     # url upto a stopchar
        """.format(self.uid, stopchars)
        text = re.compile(pattern, flags=re.X | re.U).sub(self.fLink, text)
        return text

    def fLink(self, m):
        """Build the html anchor for one marked link match and shelve it."""
        in_ = m.group()
        pre, inner, url = m.groups()
        pre = pre or ''

        if inner == '':
            return '{0}"{1}":{2}'.format(pre, inner, url)

        m = re.search(r'''^
            (?P<atts>{0})                # $atts (if any)
            {1}*                         # any optional spaces
            (?P<text>                    # $text is...
                (!.+!)                   # an image
            |                            # else...
                .+?                      # link text
            )                            # end of $text
            (?:\((?P<title>[^)]+?)\))?   # $title (if any)
            $'''.format(cls_re_s, regex_snippets['space']), inner,
            flags=re.X | re.U)

        atts = (m and m.group('atts')) or ''
        text = (m and m.group('text')) or inner
        title = (m and m.group('title')) or ''

        pop, tight = '', ''
        counts = { '[': None, ']': url.count(']'), '(': None, ')': None }

        # Look for footnotes or other square-bracket delimited stuff at the end
        # of the url...
        #
        # eg. "text":url][otherstuff... will have "[otherstuff" popped back
        # out.
        #
        # "text":url?q[]=x][123] will have "[123]" popped off the back, the
        # remaining closing square brackets will later be tested for balance
        if (counts[']']):
            m = re.search(r'(?P<url>^.*\])(?P<tight>\[.*?)$', url, flags=re.U)
            if m:
                url, tight = m.groups()

        # Split off any trailing text that isn't part of an array assignment.
        # eg. "text":...?q[]=value1&q[]=value2 ... is ok
        # "text":...?q[]=value1]following ...
        # ... would have "following" popped back out and the remaining square
        # bracket will later be tested for balance
        if (counts[']']):
            m = re.search(r'(?P<url>^.*\])(?!=)(?P<end>.*?)$', url, flags=re.U)
            url = m.group('url')
            tight = '{0}{1}'.format(m.group('end'), tight)

        # Now we have the array of all the multi-byte chars in the url we will
        # parse the uri backwards and pop off any chars that don't belong
        # there (like . or , or unmatched brackets of various kinds).
        first = True
        popped = True

        counts[']'] = url.count(']')
        url_chars = list(url)

        # Each handler below receives and returns the full working state
        # (pop, popped, url_chars, counts, pre) so the dispatch loop can
        # thread it through unchanged.
        def _endchar(c, pop, popped, url_chars, counts, pre):
            """Textile URL shouldn't end in these characters, we pop them off
            the end and push them out the back of the url again."""
            pop = '{0}{1}'.format(c, pop)
            url_chars.pop()
            popped = True
            return pop, popped, url_chars, counts, pre

        def _rightanglebracket(c, pop, popped, url_chars, counts, pre):
            # a '>' ending the url means a closing tag got glued on; strip the
            # whole '</tag>' back out of the url.
            url_chars.pop()
            urlLeft = ''.join(url_chars)

            m = re.search(r'(?P<url_chars>.*)(?P<tag><\/[a-z]+)$', urlLeft)
            url_chars = m.group('url_chars')
            pop = '{0}{1}{2}'.format(m.group('tag'), c, pop)
            popped = True
            return pop, popped, url_chars, counts, pre

        def _closingsquarebracket(c, pop, popped, url_chars, counts, pre):
            """If we find a closing square bracket we are going to see if it is
            balanced.
            If it is balanced with matching opening bracket then it is part of
            the URL else we spit it back out of the URL."""
            # If counts['['] is None, count the occurrences of '['
            counts['['] = counts['['] or url.count('[')

            if counts['['] == counts[']']:
                # It is balanced, so keep it
                url_chars.append(c)
            else:
                # In the case of un-matched closing square brackets we just eat
                # it
                popped = True
                url_chars.pop()
                counts[']'] = counts[']'] - 1;
                if first:  # pragma: no branch
                    pre = ''
            return pop, popped, url_chars, counts, pre

        def _closingparenthesis(c, pop, popped, url_chars, counts, pre):
            if counts[')'] is None:  # pragma: no branch
                counts['('] = url.count('(')
                counts[')'] = url.count(')')

            if counts['('] != counts[')']:
                # Unbalanced so spit it out the back end
                popped = True
                pop = '{0}{1}'.format(url_chars.pop(), pop)
                counts[')'] = counts[')'] - 1
            return pop, popped, url_chars, counts, pre

        def _casesdefault(c, pop, popped, url_chars, counts, pre):
            # any other character terminates the backward scan unchanged
            return pop, popped, url_chars, counts, pre

        cases = {
            '!': _endchar,
            '?': _endchar,
            ':': _endchar,
            ';': _endchar,
            '.': _endchar,
            ',': _endchar,
            '>': _rightanglebracket,
            ']': _closingsquarebracket,
            ')': _closingparenthesis,
        }
        # walk the url backwards, popping stray trailing punctuation until a
        # character is kept (popped stays False).
        for c in url_chars[-1::-1]:  # pragma: no branch
            popped = False
            pop, popped, url_chars, counts, pre = cases.get(c,
                    _casesdefault)(c, pop, popped, url_chars, counts, pre)
            first = False
            if popped is False:
                break

        url = ''.join(url_chars)
        uri_parts = urlsplit(url)

        scheme_in_list = uri_parts.scheme in self.url_schemes
        valid_scheme = (uri_parts.scheme and scheme_in_list)
        if not is_valid_url(url) and not valid_scheme:
            # not a link after all; strip the marker and leave the text alone
            return in_.replace('{0}linkStartMarker:'.format(self.uid), '')

        # '$' as link text means "use the url itself (minus the scheme)"
        if text == '$':
            text = url
            if "://" in text:
                text = text.split("://")[1]
            elif ":" in text:
                text = text.split(":")[1]

        text = text.strip()
        title = encode_html(title)

        if not self.noimage:  # pragma: no branch
            text = self.image(text)
        text = self.span(text)
        text = self.glyphs(text)
        url = self.shelveURL(self.encode_url(urlunsplit(uri_parts)))
        attributes = parse_attributes(atts, restricted=self.restricted)
        attributes['href'] = url
        if title:
            # if the title contains unicode data, it is annoying to get Python
            # 2.6 and all the latter versions working properly.  But shelving
            # the title is a quick and dirty solution.
            attributes['title'] = self.shelve(title)
        if self.rel:
            attributes['rel'] = self.rel
        a_text = generate_tag('a', text, attributes)
        a_shelf_id = self.shelve(a_text)

        out = '{0}{1}{2}{3}'.format(pre, a_shelf_id, pop, tight)

        return out

    def encode_url(self, url):
        """
        Converts a (unicode) URL to an ASCII URL, with the domain part
        IDNA-encoded and the path part %-encoded (as per RFC 3986).

        Fixed version of the following code fragment from Stack Overflow:
            http://stackoverflow.com/a/804380/72656
        """
        # parse it
        parsed = urlsplit(url)

        if parsed.netloc:
            # divide the netloc further
            netloc_pattern = re.compile(r"""
                (?:(?P<user>[^:@]+)(?::(?P<password>[^:@]+))?@)?
                (?P<host>[^:]+)
                (?::(?P<port>[0-9]+))?
            """, re.X | re.U)
            netloc_parsed = netloc_pattern.match(parsed.netloc).groupdict()
        else:
            netloc_parsed = {'user': '', 'password': '', 'host': '', 'port':
                    ''}

        # encode each component
        scheme = parsed.scheme
        user = netloc_parsed['user'] and quote(netloc_parsed['user'])
        password = (netloc_parsed['password'] and
                quote(netloc_parsed['password']))
        host = netloc_parsed['host']
        port = netloc_parsed['port'] and netloc_parsed['port']
        # the below splits the path portion of the url by slashes, translates
        # percent-encoded characters back into strings, then re-percent-encodes
        # what's necessary.  Sounds screwy, but the url could include encoded
        # slashes, and this is a way to clean that up.  It branches for PY2/3
        # because the quote and unquote functions expects different input
        # types: unicode strings for PY2 and str for PY3.
        path_parts = (quote(unquote(pce), b'') for pce in
                parsed.path.split('/'))
        path = '/'.join(path_parts)

        # put it back together
        netloc = ''
        if user:
            netloc = '{0}{1}'.format(netloc, user)
            if password:
                netloc = '{0}:{1}'.format(netloc, password)
            netloc = '{0}@'.format(netloc)
        netloc = '{0}{1}'.format(netloc, host)
        if port:
            netloc = '{0}:{1}'.format(netloc, port)
        return urlunsplit((scheme, netloc, path, parsed.query, parsed.fragment))

    def span(self, text):
        """Process inline span markup (*, **, _, __, ??, -, %, +, ~, ^),
        recursing up to max_span_depth levels deep."""
        qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__',
                r'_', r'%', r'\+', r'~', r'\^')
        pnct = r""".,"'?!;:‹›«»„“”‚‘’"""
        self.span_depth = self.span_depth + 1

        if self.span_depth <= self.max_span_depth:
            for tag in qtags:
                pattern = re.compile(r"""
                    (?P<pre>^|(?<=[\s>{pnct}\(])|[{{[])
                    (?P<tag>{tag})(?!{tag})
                    (?P<atts>{cls})
                    (?!{tag})
                    (?::(?P<cite>\S+[^{tag}]{space}))?
                    (?P<content>[^{space}{tag}]+|\S.*?[^\s{tag}\n])
                    (?P<end>[{pnct}]*)
                    {tag}
                    (?P<tail>$|[\[\]}}<]|(?=[{pnct}]{{1,2}}[^0-9]|\s|\)))
                """.format(**{'tag': tag, 'cls': cls_re_s, 'pnct': pnct,
                    'space': regex_snippets['space']}), flags=re.X | re.U)
                text = pattern.sub(self.fSpan, text)
        self.span_depth = self.span_depth - 1
        return text

    def fSpan(self, match):
        """Build the html element for one matched inline span."""
        pre, tag, atts, cite, content, end, tail = match.groups()

        # textile span marker -> html tag
        qtags = {
            '*': 'strong',
            '**': 'b',
            '??': 'cite',
            '_': 'em',
            '__': 'i',
            '-': 'del',
            '%': 'span',
            '+': 'ins',
            '~': 'sub',
            '^': 'sup'
        }

        tag = qtags[tag]
        atts = pba(atts, restricted=self.restricted)
        if cite:
            atts = '{0} cite="{1}"'.format(atts, cite.rstrip())

        # spans may nest, e.g. *bold with _italic_ inside*
        content = self.span(content)

        out = "<{0}{1}>{2}{3}</{4}>".format(tag, atts, content, end, tag)
        if pre and not tail or tail and not pre:
            out = '{0}{1}{2}'.format(pre, out, tail)
        return out

    def image(self, text):
        pattern = re.compile(r"""
            (?:[\[{{])?         # pre
            \!                  # opening !
            (\<|\=|\>)?         # optional alignment atts
            ({0})               # optional style,class atts
            (?:\.\s)?           # optional dot-space
            ([^\s(!]+)          # presume this is the src
            \s?                 # optional space
            (?:\(([^\)]+)\))?   # optional title
            \!                  # closing
            (?::(\S+))?
# optional href 1088 (?:[\]}}]|(?=\s|$)) # lookahead: space or end of string 1089 """.format(cls_re_s), re.U | re.X) 1090 return pattern.sub(self.fImage, text) 1091 1092 def fImage(self, match): 1093 # (None, '', '/imgs/myphoto.jpg', None, None) 1094 align, attributes, url, title, href = match.groups() 1095 atts = OrderedDict() 1096 size = None 1097 1098 alignments = {'<': 'left', '=': 'center', '>': 'right'} 1099 1100 if not title: 1101 title = '' 1102 1103 if not is_rel_url(url) and self.get_sizes: 1104 size = imagesize.getimagesize(url) 1105 1106 if href: 1107 href = self.shelveURL(href) 1108 1109 url = self.shelveURL(url) 1110 1111 if align: 1112 atts.update(align=alignments[align]) 1113 atts.update(alt=title) 1114 if size: 1115 atts.update(height="{0}".format(size[1])) 1116 atts.update(src=url) 1117 if attributes: 1118 atts.update(parse_attributes(attributes, restricted=self.restricted)) 1119 if title: 1120 atts.update(title=title) 1121 if size: 1122 atts.update(width="{0}".format(size[0])) 1123 img = generate_tag('img', ' /', atts) 1124 if href: 1125 a_atts = OrderedDict(href=href) 1126 if self.rel: 1127 a_atts.update(rel=self.rel) 1128 img = generate_tag('a', img, a_atts) 1129 return img 1130 1131 def code(self, text): 1132 text = self.doSpecial(text, '<code>', '</code>', self.fCode) 1133 text = self.doSpecial(text, '@', '@', self.fCode) 1134 text = self.doSpecial(text, '<pre>', '</pre>', self.fPre) 1135 return text 1136 1137 def fCode(self, match): 1138 before, text, after = match.groups() 1139 after = after or '' 1140 # text needs to be escaped 1141 text = encode_html(text, quotes=False) 1142 return ''.join([before, self.shelve('<code>{0}</code>'.format(text)), after]) 1143 1144 def fPre(self, match): 1145 before, text, after = match.groups() 1146 if after is None: 1147 after = '' 1148 # text needs to be escaped 1149 text = encode_html(text) 1150 return ''.join([before, '<pre>', self.shelve(text), '</pre>', after]) 1151 1152 def doSpecial(self, text, 
start, end, method): 1153 pattern = re.compile(r'(^|\s|[\[({{>|]){0}(.*?){1}($|[\])}}])?'.format( 1154 re.escape(start), re.escape(end)), re.M | re.S) 1155 return pattern.sub(method, text) 1156 1157 def noTextile(self, text): 1158 text = self.doSpecial(text, '<notextile>', '</notextile>', 1159 self.fTextile) 1160 return self.doSpecial(text, '==', '==', self.fTextile) 1161 1162 def fTextile(self, match): 1163 before, notextile, after = match.groups() 1164 if after is None: # pragma: no branch 1165 after = '' 1166 return ''.join([before, self.shelve(notextile), after]) 1167 1168 def getHTMLComments(self, text): 1169 """Search the string for HTML comments, e.g. <!-- comment text -->. We 1170 send the text that matches this to fParseHTMLComments.""" 1171 return self.doSpecial(text, '<!--', '-->', self.fParseHTMLComments) 1172 1173 def fParseHTMLComments(self, match): 1174 """If self.restricted is True, clean the matched contents of the HTML 1175 comment. Otherwise, return the comments unchanged. 1176 The original php had an if statement in here regarding restricted mode. 1177 nose reported that this line wasn't covered. It's correct. In 1178 restricted mode, the html comment tags have already been converted to 1179 <!*#8212; and —> so they don't match in getHTMLComments, 1180 and never arrive here. 
1181 """ 1182 before, commenttext, after = match.groups() 1183 commenttext = self.shelve(commenttext) 1184 return '{0}<!--{1}-->'.format(before, commenttext) 1185 1186 def redcloth_list(self, text): 1187 """Parse the text for definition lists and send them to be 1188 formatted.""" 1189 pattern = re.compile(r"^([-]+{0}[ .].*:=.*)$(?![^-])".format(cls_re_s), 1190 re.M | re.U | re.S) 1191 return pattern.sub(self.fRCList, text) 1192 1193 def fRCList(self, match): 1194 """Format a definition list.""" 1195 out = [] 1196 text = re.split(r'\n(?=[-])', match.group(), flags=re.M) 1197 for line in text: 1198 # parse the attributes and content 1199 m = re.match(r'^[-]+({0})[ .](.*)$'.format(cls_re_s), line, 1200 flags=re.M | re.S) 1201 if not m: 1202 continue 1203 1204 atts, content = m.groups() 1205 # cleanup 1206 content = content.strip() 1207 atts = pba(atts, restricted=self.restricted) 1208 1209 # split the content into the term and definition 1210 xm = re.match(r'^(.*?)[\s]*:=(.*?)[\s]*(=:|:=)?[\s]*$', content, 1211 re.S) 1212 term, definition, ending = xm.groups() 1213 # cleanup 1214 term = term.strip() 1215 definition = definition.strip(' ') 1216 1217 # if this is the first time through, out as a bool is False 1218 if not out: 1219 if definition == '': 1220 dltag = "<dl{0}>".format(atts) 1221 else: 1222 dltag = "<dl>" 1223 out.append(dltag) 1224 1225 if definition != '' and term != '': 1226 if definition.startswith('\n'): 1227 definition = '<p>{0}</p>'.format(definition.lstrip()) 1228 definition = definition.replace('\n', '<br />').strip() 1229 1230 term = self.graf(term) 1231 definition = self.graf(definition) 1232 1233 out.extend(['\t<dt{0}>{1}</dt>'.format(atts, term), 1234 '\t<dd>{0}</dd>'.format(definition)]) 1235 1236 out.append('</dl>') 1237 out = '\n'.join(out) 1238 return out 1239 1240 def placeNoteLists(self, text): 1241 """Parse the text for endnotes.""" 1242 if self.notes: 1243 o = OrderedDict() 1244 for label, info in self.notes.items(): 1245 if 'seq' in 
info: 1246 i = info['seq'] 1247 info['seq'] = label 1248 o[i] = info 1249 else: 1250 self.unreferencedNotes[label] = info 1251 1252 if o: # pragma: no branch 1253 # sort o by key 1254 o = OrderedDict(sorted(o.items(), key=lambda t: t[0])) 1255 self.notes = o 1256 text_re = re.compile(r'<p>notelist({0})(?:\:([\w|{1}]))?([\^!]?)(\+?)' 1257 r'\.?[\s]*</p>'.format(cls_re_s, syms_re_s), re.U) 1258 text = text_re.sub(self.fNoteLists, text) 1259 return text 1260 1261 def fNoteLists(self, match): 1262 """Given the text that matches as a note, format it into HTML.""" 1263 att, start_char, g_links, extras = match.groups() 1264 start_char = start_char or 'a' 1265 index = '{0}{1}{2}'.format(g_links, extras, start_char) 1266 result = '' 1267 1268 if index not in self.notelist_cache: # pragma: no branch 1269 o = [] 1270 if self.notes: # pragma: no branch 1271 for seq, info in self.notes.items(): 1272 links = self.makeBackrefLink(info, g_links, start_char) 1273 atts = '' 1274 if 'def' in info: 1275 infoid = info['id'] 1276 atts = info['def']['atts'] 1277 content = info['def']['content'] 1278 li = ('\t\t<li{0}>{1}<span id="note{2}"> ' 1279 '</span>{3}</li>').format(atts, links, infoid, 1280 content) 1281 else: 1282 li = ('\t\t<li{0}>{1} Undefined Note [#{2}].<li>' 1283 ).format(atts, links, info['seq']) 1284 o.append(li) 1285 if '+' == extras and self.unreferencedNotes: 1286 for seq, info in self.unreferencedNotes.items(): 1287 atts = info['def']['atts'] 1288 content = info['def']['content'] 1289 li = '\t\t<li{0}>{1}</li>'.format(atts, content) 1290 o.append(li) 1291 self.notelist_cache[index] = "\n".join(o) 1292 result = self.notelist_cache[index] 1293 list_atts = pba(att, restricted=self.restricted) 1294 result = '<ol{0}>\n{1}\n\t</ol>'.format(list_atts, result) 1295 return result 1296 1297 def makeBackrefLink(self, info, g_links, i): 1298 """Given the pieces of a back reference link, create an <a> tag.""" 1299 atts, content, infoid, link = '', '', '', '' 1300 if 'def' in info: 
1301 link = info['def']['link'] 1302 backlink_type = link or g_links 1303 i_ = encode_high(i) 1304 allow_inc = i not in syms_re_s 1305 i_ = int(i_) 1306 1307 if backlink_type == "!": 1308 return '' 1309 elif backlink_type == '^': 1310 return """<sup><a href="#noteref{0}">{1}</a></sup>""".format( 1311 info['refids'][0], i) 1312 else: 1313 result = [] 1314 for refid in info['refids']: 1315 i_entity = decode_high(i_) 1316 sup = """<sup><a href="#noteref{0}">{1}</a></sup>""".format( 1317 refid, i_entity) 1318 if allow_inc: 1319 i_ = i_ + 1 1320 result.append(sup) 1321 result = ' '.join(result) 1322 return result 1323 1324 def fParseNoteDefs(self, m): 1325 """Parse the note definitions and format them as HTML""" 1326 label = m.group('label') 1327 link = m.group('link') 1328 att = m.group('att') 1329 content = m.group('content') 1330 1331 # Assign an id if the note reference parse hasn't found the label yet. 1332 if label not in self.notes: 1333 self.notes[label] = {'id': '{0}{1}'.format(self.linkPrefix, 1334 self._increment_link_index())} 1335 1336 # Ignores subsequent defs using the same label 1337 if 'def' not in self.notes[label]: # pragma: no branch 1338 self.notes[label]['def'] = {'atts': pba(att, restricted=self.restricted), 'content': 1339 self.graf(content), 'link': link} 1340 return '' 1341 1342 def noteRef(self, text): 1343 """Search the text looking for note references.""" 1344 text_re = re.compile(r""" 1345 \[ # start 1346 ({0}) # !atts 1347 \# 1348 ([^\]!]+) # !label 1349 ([!]?) # !nolink 1350 \]""".format(cls_re_s), re.X) 1351 text = text_re.sub(self.fParseNoteRefs, text) 1352 return text 1353 1354 def fParseNoteRefs(self, match): 1355 """Parse and format the matched text into note references. 1356 By the time this function is called, all the defs will have been 1357 processed into the notes array. 
So now we can resolve the link numbers 1358 in the order we process the refs...""" 1359 atts, label, nolink = match.groups() 1360 atts = pba(atts, restricted=self.restricted) 1361 nolink = nolink == '!' 1362 1363 # Assign a sequence number to this reference if there isn't one already 1364 if label in self.notes: 1365 num = self.notes[label]['seq'] 1366 else: 1367 self.notes[label] = { 1368 'seq': self.note_index, 'refids': [], 'id': '' 1369 } 1370 num = self.note_index 1371 self.note_index = self.note_index + 1 1372 1373 # Make our anchor point and stash it for possible use in backlinks when 1374 # the note list is generated later... 1375 refid = '{0}{1}'.format(self.linkPrefix, self._increment_link_index()) 1376 self.notes[label]['refids'].append(refid) 1377 1378 # If we are referencing a note that hasn't had the definition parsed 1379 # yet, then assign it an ID... 1380 if not self.notes[label]['id']: 1381 self.notes[label]['id'] = '{0}{1}'.format(self.linkPrefix, 1382 self._increment_link_index()) 1383 labelid = self.notes[label]['id'] 1384 1385 # Build the link (if any)... 1386 result = '<span id="noteref{0}">{1}</span>'.format(refid, num) 1387 if not nolink: 1388 result = '<a href="#note{0}">{1}</a>'.format(labelid, result) 1389 1390 # Build the reference... 
1391 result = '<sup{0}>{1}</sup>'.format(atts, result) 1392 return result 1393 1394 def shelveURL(self, text): 1395 if text == '': 1396 return '' 1397 self.refIndex = self.refIndex + 1 1398 self.refCache[self.refIndex] = text 1399 output = '{0}{1}{2}'.format(self.uid, self.refIndex, ':url') 1400 return output 1401 1402 def retrieveURLs(self, text): 1403 return re.sub(r'{0}(?P<token>[0-9]+):url'.format(self.uid), self.retrieveURL, text) 1404 1405 def retrieveURL(self, match): 1406 url = self.refCache.get(int(match.group('token')), '') 1407 if url == '': 1408 return url 1409 1410 if url in self.urlrefs: 1411 url = self.urlrefs[url] 1412 1413 return url 1414 1415 def _increment_link_index(self): 1416 """The self.linkIndex property needs to be incremented in various 1417 places. Don't Repeat Yourself.""" 1418 self.linkIndex = self.linkIndex + 1 1419 return self.linkIndex 1420 1421 1422def textile(text, html_type='xhtml'): 1423 """ 1424 Apply Textile to a block of text. 1425 1426 This function takes the following additional parameters: 1427 1428 html_type - 'xhtml' or 'html5' style tags (default: 'xhtml') 1429 1430 """ 1431 return Textile(html_type=html_type).parse(text) 1432 1433 1434def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): 1435 """ 1436 Apply Textile to a block of text, with restrictions designed for weblog 1437 comments and other untrusted input. Raw HTML is escaped, style attributes 1438 are disabled, and rel='nofollow' is added to external links. 1439 1440 This function takes the following additional parameters: 1441 1442 html_type - 'xhtml' or 'html5' style tags (default: 'xhtml') 1443 lite - restrict block tags to p, bq, and bc, disable tables (default: True) 1444 noimage - disable image tags (default: True) 1445 1446 """ 1447 return Textile(restricted=True, lite=lite, noimage=noimage, 1448 html_type=html_type, rel='nofollow').parse( 1449 text) 1450