1# -*- coding: utf-8 -*-
3
4__copyright__ = """
5Copyright (c) 2009, Jason Samsa, http://jsamsa.com/
6Copyright (c) 2010, Kurt Raschke <kurt@kurtraschke.com>
7Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
8Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/
9
10Original PHP Version:
11Copyright (c) 2003-2004, Dean Allen <dean@textism.com>
12All rights reserved.
13
14Thanks to Carlo Zottmann <carlo@g-blog.net> for refactoring
15Textile's procedural code into a class framework
16
17Additions and fixes Copyright (c) 2006 Alex Shiels http://thresholdstate.com/
18
19"""
20import uuid
21from urllib.parse import urlparse, urlsplit, urlunsplit, quote, unquote
22from collections import OrderedDict
23
24from textile.tools import sanitizer, imagesize
25from textile.regex_strings import (align_re_s, cls_re_s, pnct_re_s,
26        regex_snippets, syms_re_s, table_span_re_s)
27from textile.utils import (decode_high, encode_high, encode_html, generate_tag,
28        has_raw_text, is_rel_url, is_valid_url, list_type, normalize_newlines,
29        parse_attributes, pba)
30from textile.objects import Block, Table
31
32try:
33    import regex as re
34except ImportError:
35    import re
36
37
38class Textile(object):
39    restricted_url_schemes = ('http', 'https', 'ftp', 'mailto')
40    unrestricted_url_schemes = restricted_url_schemes + ('file', 'tel',
41            'callto', 'sftp', 'data')
42
43    btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', r'fn\d+', 'p', '###')
44    btag_lite = ('bq', 'bc', 'p')
45
46    note_index = 1
47
48    doctype_whitelist = ['xhtml', 'html5']
49
50    glyph_definitions = {
51        'quote_single_open':  '&#8216;',
52        'quote_single_close': '&#8217;',
53        'quote_double_open':  '&#8220;',
54        'quote_double_close': '&#8221;',
55        'apostrophe':         '&#8217;',
56        'prime':              '&#8242;',
57        'prime_double':       '&#8243;',
58        'ellipsis':           '&#8230;',
59        'ampersand':          '&amp;',
60        'emdash':             '&#8212;',
61        'endash':             '&#8211;',
62        'dimension':          '&#215;',
63        'trademark':          '&#8482;',
64        'registered':         '&#174;',
65        'copyright':          '&#169;',
66        'half':               '&#189;',
67        'quarter':            '&#188;',
68        'threequarters':      '&#190;',
69        'degrees':            '&#176;',
70        'plusminus':          '&#177;',
71    }
72
73    def __init__(self, restricted=False, lite=False, noimage=False,
74            get_sizes=False, html_type='xhtml', rel='', block_tags=True):
75        """Textile properties that are common to regular textile and
76        textile_restricted"""
77        self.restricted = restricted
78        self.lite = lite
79        self.noimage = noimage
80        self.get_sizes = get_sizes
81        self.fn = {}
82        self.urlrefs = {}
83        self.shelf = {}
84        self.rel = rel
85        self.html_type = html_type
86        self.max_span_depth = 5
87        self.span_depth = 0
88        uid = uuid.uuid4().hex
89        self.uid = 'textileRef:{0}:'.format(uid)
90        self.linkPrefix = '{0}-'.format(uid)
91        self.linkIndex = 0
92        self.refCache = {}
93        self.refIndex = 0
94        self.block_tags = block_tags
95
96        cur = r''
97        if regex_snippets['cur']: # pragma: no branch
98            cur = r'(?:[{0}]{1}*)?'.format(regex_snippets['cur'],
99                    regex_snippets['space'])
100
101        # We'll be searching for characters that need to be HTML-encoded to
102        # produce properly valid html.  These are the defaults that work in
103        # most cases.  Below, we'll copy this and modify the necessary pieces
104        # to make it work for characters at the beginning of the string.
105        self.glyph_search = [
106            # apostrophe's
107            re.compile(r"(^|{0}|\))'({0})".format(regex_snippets['wrd']),
108                flags=re.U),
109            # back in '88
110            re.compile(r"({0})'(\d+{1}?)\b(?![.]?[{1}]*?')".format(
111                regex_snippets['space'], regex_snippets['wrd']),
112                flags=re.U),
113            # single opening following an open bracket.
114            re.compile(r"([([{])'(?=\S)", flags=re.U),
115            # single closing
116            re.compile(r"(^|\S)'(?={0}|{1}|<|$)".format(
117                regex_snippets['space'], pnct_re_s), flags=re.U),
118            # single opening
119            re.compile(r"'", re.U),
120            # double opening following an open bracket. Allows things like
121            # Hello ["(Mum) & dad"]
122            re.compile(r'([([{])"(?=\S)', flags=re.U),
123            # double closing
124            re.compile(r'(^|\S)"(?={0}|{1}|<|$)'.format(
125                regex_snippets['space'], pnct_re_s), re.U),
126            # double opening
127            re.compile(r'"'),
128            # ellipsis
129            re.compile(r'([^.]?)\.{3}'),
130            # ampersand
131            re.compile(r'(\s?)&(\s)', re.U),
132            # em dash
133            re.compile(r'(\s?)--(\s?)'),
134            # en dash
135            re.compile(r' - '),
136            # dimension sign
137            re.compile(r'([0-9]+[\])]?[\'"]? ?)[x]( ?[\[(]?)'
138                r'(?=[+-]?{0}[0-9]*\.?[0-9]+)'.format(cur), flags=re.I | re.U),
139            # trademark
140            re.compile(r'(\b ?|{0}|^)[([]TM[])]'.format(regex_snippets['space']
141                ), flags=re.I | re.U),
142            # registered
143            re.compile(r'(\b ?|{0}|^)[([]R[])]'.format(regex_snippets['space']
144                ), flags=re.I | re.U),
145            # copyright
146            re.compile(r'(\b ?|{0}|^)[([]C[])]'.format(regex_snippets['space']
147                ), flags=re.I | re.U),
148            # 1/2
149            re.compile(r'[([]1\/2[])]'),
150            # 1/4
151            re.compile(r'[([]1\/4[])]'),
152            # 3/4
153            re.compile(r'[([]3\/4[])]'),
154            # degrees
155            re.compile(r'[([]o[])]'),
156            # plus/minus
157            re.compile(r'[([]\+\/-[])]'),
158            # 3+ uppercase acronym
159            re.compile(r'\b([{0}][{1}]{{2,}})\b(?:[(]([^)]*)[)])'.format(
160                regex_snippets['abr'], regex_snippets['acr']), flags=re.U),
161            # 3+ uppercase
162            re.compile(r'({space}|^|[>(;-])([{abr}]{{3,}})([{nab}]*)'
163                '(?={space}|{pnct}|<|$)(?=[^">]*?(<|$))'.format(**{ 'space':
164                    regex_snippets['space'], 'abr': regex_snippets['abr'],
165                    'nab': regex_snippets['nab'], 'pnct': pnct_re_s}), re.U),
166        ]
167        # These are the changes that need to be made for characters that occur
168        # at the beginning of the string.
169        self.glyph_search_initial = list(self.glyph_search)
170        # apostrophe's
171        self.glyph_search_initial[0] = re.compile(r"({0}|\))'({0})".format(
172            regex_snippets['wrd']), flags=re.U)
173        # single closing
174        self.glyph_search_initial[3] = re.compile(r"(\S)'(?={0}|{1}|$)".format(
175                regex_snippets['space'], pnct_re_s), re.U)
176        # double closing
177        self.glyph_search_initial[6] = re.compile(r'(\S)"(?={0}|{1}|<|$)'.format(
178                regex_snippets['space'], pnct_re_s), re.U)
179
180        self.glyph_replace = [x.format(**self.glyph_definitions) for x in (
181            r'\1{apostrophe}\2',                  # apostrophe's
182            r'\1{apostrophe}\2',                  # back in '88
183            r'\1{quote_single_open}',             # single opening after bracket
184            r'\1{quote_single_close}',            # single closing
185            r'{quote_single_open}',               # single opening
186            r'\1{quote_double_open}',             # double opening after bracket
187            r'\1{quote_double_close}',            # double closing
188            r'{quote_double_open}',               # double opening
189            r'\1{ellipsis}',                      # ellipsis
190            r'\1{ampersand}\2',                   # ampersand
191            r'\1{emdash}\2',                      # em dash
192            r' {endash} ',                        # en dash
193            r'\1{dimension}\2',                   # dimension sign
194            r'\1{trademark}',                     # trademark
195            r'\1{registered}',                    # registered
196            r'\1{copyright}',                     # copyright
197            r'{half}',                            # 1/2
198            r'{quarter}',                         # 1/4
199            r'{threequarters}',                   # 3/4
200            r'{degrees}',                         # degrees
201            r'{plusminus}',                       # plus/minus
202            r'<acronym title="\2">\1</acronym>',  # 3+ uppercase acronym
203            r'\1<span class="caps">{0}:glyph:\2'  # 3+ uppercase
204              r'</span>\3'.format(self.uid),
205        )]
206
207        if self.html_type == 'html5':
208            self.glyph_replace[21] = r'<abbr title="\2">\1</abbr>'
209
210        if self.restricted is True:
211            self.url_schemes = self.restricted_url_schemes
212        else:
213            self.url_schemes = self.unrestricted_url_schemes
214
215        all_schemes_re_s = '|'.join([
216            '(?:{0})'.format(scheme)
217            for scheme in self.url_schemes
218        ])
219        self.url_ref_regex = re.compile(
220            r'(?:(?<=^)|(?<=\s))\[(.+)\]\s?((?:{0}:\/\/|\/)\S+)(?=\s|$)'.format(all_schemes_re_s),
221            re.U
222        )
223
224    def parse(self, text, rel=None, sanitize=False):
225        """Parse the input text as textile and return html output."""
226        self.notes = OrderedDict()
227        self.unreferencedNotes = OrderedDict()
228        self.notelist_cache = OrderedDict()
229
230        if text.strip() == '':
231            return text
232
233        if self.restricted:
234            text = encode_html(text, quotes=False)
235
236        text = normalize_newlines(text)
237        text = text.replace(self.uid, '')
238
239        if self.block_tags:
240            if self.lite:
241                self.blocktag_whitelist = ['bq', 'p']
242                text = self.block(text)
243            else:
244                self.blocktag_whitelist = [ 'bq', 'p', 'bc', 'notextile',
245                        'pre', 'h[1-6]',
246                        'fn{0}+'.format(regex_snippets['digit']), '###']
247                text = self.block(text)
248                text = self.placeNoteLists(text)
249        else:
250            # Inline markup (em, strong, sup, sub, del etc).
251            text = self.span(text)
252
            # Glyph level substitutions (mainly typographic -- " & ' => curly
            # quotes, -- => em-dash, etc.)
255            text = self.glyphs(text)
256
257        if rel:
258            self.rel = ' rel="{0}"'.format(rel)
259
260        text = self.getRefs(text)
261
262        if not self.lite:
263            text = self.placeNoteLists(text)
264        text = self.retrieve(text)
265        text = text.replace('{0}:glyph:'.format(self.uid), '')
266
267        if sanitize:
268            text = sanitizer.sanitize(text)
269
270        text = self.retrieveURLs(text)
271
272        # if the text contains a break tag (<br> or <br />) not followed by
273        # a newline, replace it with a new style break tag and a newline.
274        text = re.sub(r'<br( /)?>(?!\n)', '<br />\n', text)
275
276        text = text.rstrip('\n')
277
278        return text
279
280    def table(self, text):
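        # Sketch: a minimal row like '|a|b|' parses into a <table> with one
        # <tr> holding two <td> cells; textile.objects.Table builds the
        # actual markup.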
281        text = "{0}\n\n".format(text)
282        pattern = re.compile(r'^(?:table(?P<tatts>_?{s}{a}{c})\.'
283                r'(?P<summary>.*?)\n)?^(?P<rows>{a}{c}\.? ?\|.*\|)'
284                r'[\s]*\n\n'.format(**{'s': table_span_re_s, 'a': align_re_s,
285                    'c': cls_re_s}), flags=re.S | re.M | re.U)
286        match = pattern.search(text)
287        if match:
288            table = Table(self, **match.groupdict())
289            return table.process()
290        return text
291
292    def textileLists(self, text):
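        # Sketch: '* one\n* two' renders (via parse()) roughly as
        #   '\t<ul>\n\t\t<li>one</li>\n\t\t<li>two</li>\n\t</ul>'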
293        pattern = re.compile(r'^((?:[*;:]+|[*;:#]*#(?:_|\d+)?){0}[ .].*)$'
294                r'(?![^#*;:])'.format(cls_re_s), re.U | re.M | re.S)
295        return pattern.sub(self.fTextileList, text)
296
297    def fTextileList(self, match):
298        text = re.split(r'\n(?=[*#;:])', match.group(), flags=re.M)
299        pt = ''
300        result = []
301        ls = OrderedDict()
302        for i, line in enumerate(text):
303            try:
304                nextline = text[i + 1]
305            except IndexError:
306                nextline = ''
307
308            m = re.search(r"^(?P<tl>[#*;:]+)(?P<st>_|\d+)?(?P<atts>{0})[ .]"
309                    "(?P<content>.*)$".format(cls_re_s), line, re.S)
310            if m:
311                tl, start, atts, content = m.groups()
312                content = content.strip()
313            else:
314                result.append(line)
315                continue
316
317            nl = ''
318            ltype = list_type(tl)
319            tl_tags = {';': 'dt', ':': 'dd'}
320            litem = tl_tags.get(tl[0], 'li')
321
322            showitem = len(content) > 0
323
324            # handle list continuation/start attribute on ordered lists
325            if ltype == 'o':
326                if not hasattr(self, 'olstarts'):
327                    self.olstarts = {tl: 1}
328
329                # does the first line of this ol have a start attribute
330                if len(tl) > len(pt):
331                    # no, set it to 1.
332                    if start is None:
333                        self.olstarts[tl] = 1
334                    # yes, set it to the given number
335                    elif start != '_':
336                        self.olstarts[tl] = int(start)
337                    # we won't need to handle the '_' case, we'll just
338                    # print out the number when it's needed
339
340                # put together the start attribute if needed
341                if len(tl) > len(pt) and start is not None:
342                    start = ' start="{0}"'.format(self.olstarts[tl])
343
344                # This will only increment the count for list items, not
345                # definition items
346                if showitem:
347                    # Assume properly formatted input
348                    try:
349                        self.olstarts[tl] = self.olstarts[tl] + 1
350                    # if we get here, we've got some poor textile formatting.
351                    # add this type of list to olstarts and assume we'll start
352                    # it at 1. expect screwy output.
353                    except KeyError:
354                        self.olstarts[tl] = 1
355
356            nm = re.match(r"^(?P<nextlistitem>[#\*;:]+)(_|[\d]+)?{0}"
357                    r"[ .].*".format(cls_re_s), nextline)
358            if nm:
359                nl = nm.group('nextlistitem')
360
361            # We need to handle nested definition lists differently.  If
362            # the next tag is a dt (';') of a lower nested level than the
363            # current dd (':'),
364            if ';' in pt and ':' in tl:
365                ls[tl] = 2
366
367            atts = pba(atts, restricted=self.restricted)
368            tabs = '\t' * len(tl)
369            # If start is still None, set it to '', else leave the value that
370            # we've already formatted.
371            start = start or ''
372            # if this item tag isn't in the list, create a new list and
373            # item, else just create the item
374            if tl not in ls:
375                ls[tl] = 1
376                itemtag = ("\n{0}\t<{1}>{2}".format(tabs, litem, content) if
377                            showitem else '')
378                line = "<{0}l{1}{2}>{3}".format(ltype, atts, start, itemtag)
379            else:
380                line = ("\t<{0}{1}>{2}".format(litem, atts, content) if
381                        showitem else '')
382            line = '{0}{1}'.format(tabs, line)
383            if len(nl) <= len(tl):
384                if showitem:
385                    line = "{0}</{1}>".format(line, litem)
386            # work backward through the list closing nested lists/items
387            for k, v in reversed(list(ls.items())):
388                if len(k) > len(nl):
389                    if v != 2:
390                        line = "{0}\n{1}</{2}l>".format(line, tabs,
391                                list_type(k))
392                    if len(k) > 1 and v != 2:
393                        line = "{0}</{1}>".format(line, litem)
394                    del ls[k]
395            # Remember the current Textile tag:
396            pt = tl
397            # This else exists in the original php version.  I'm not sure how
398            # to come up with a case where the line would not match.  I think
399            # it may have been necessary due to the way php returns matches.
400            # else:
401                #line = "{0}\n".format(line)
402            result.append(line)
403        return self.doTagBr(litem, "\n".join(result))
404
405    def doTagBr(self, tag, input):
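        # Sketch: within a matched block, doBr (below) turns lone newlines
        # into breaks, e.g. '<p>one\ntwo</p>' -> '<p>one<br />two</p>'.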
406        return re.compile(r'<({0})([^>]*?)>(.*)(</\1>)'.format(re.escape(tag)),
407                          re.S).sub(self.doBr, input)
408
409    def doPBr(self, in_):
410        return re.compile(r'<(p)([^>]*?)>(.*)(</\1>)', re.S).sub(self.doBr,
411                                                                 in_)
412
    def doBr(self, match):
        # don't insert a break right after an existing <br> or <br />; the
        # two lookbehinds apply in sequence (an alternation of them would
        # always succeed, since no position can follow both at once).
        content = re.sub(r'(.+)(?<!<br>)(?<!<br />)\n(?![#*;:\s|])',
                         r'\1<br />', match.group(3))
416        return '<{0}{1}>{2}{3}'.format(match.group(1), match.group(2), content,
417                match.group(4))
418
419    def block(self, text):
420        if not self.lite:
421            tre = '|'.join(self.btag)
422        else:
423            tre = '|'.join(self.btag_lite)
424        # split the text by two or more newlines, retaining the newlines in the
425        # split list
426        text = re.split(r'(\n{2,})', text)
427
428        # some blocks, when processed, will ask us to output nothing, if that's
429        # the case, we'd want to drop the whitespace which follows it.
430        eat_whitespace = False
431
432        # check to see if previous block has already been escaped
433        escaped = False
434
        # track whether an extended (p..) paragraph has already had its
        # <p>..</p> tags added to the line
        multiline_para = False
437
438        tag = 'p'
439        atts = cite = ext = ''
440
441        out = []
442
443        for line in text:
444            # the line is just whitespace, add it to the output, and move on
445            if not line.strip():
446                if not eat_whitespace:
447                    out.append(line)
448                continue
449
450            eat_whitespace = False
451
452            pattern = (r'^(?P<tag>{0})(?P<atts>{1}{2})\.(?P<ext>\.?)'
453                    r'(?::(?P<cite>\S+))? (?P<content>.*)$'.format(tre,
454                        align_re_s, cls_re_s))
455            match = re.search(pattern, line, flags=re.S | re.U)
456            # tag specified on this line.
457            if match:
458                # if we had a previous extended tag but not this time, close up
459                # the tag
460                if ext and out:
461                    # it's out[-2] because the last element in out is the
462                    # whitespace that preceded this line
463                    if not escaped:
464                        content = encode_html(out[-2], quotes=True)
465                        escaped = True
466                    else:
467                        content = out[-2]
468
469                    if not multiline_para:
470                        content = generate_tag(block.inner_tag, content,
471                                block.inner_atts)
472                        content = generate_tag(block.outer_tag, content,
473                            block.outer_atts)
474                    out[-2] = content
475                tag, atts, ext, cite, content = match.groups()
476                block = Block(self, **match.groupdict())
477                inner_block = generate_tag(block.inner_tag, block.content,
478                        block.inner_atts)
479                # code tags and raw text won't be indented inside outer_tag.
480                if block.inner_tag != 'code' and not has_raw_text(inner_block):
481                    inner_block = "\n\t\t{0}\n\t".format(inner_block)
482                if ext:
483                    line = block.content
484                else:
485                    line = generate_tag(block.outer_tag, inner_block,
486                            block.outer_atts)
487                    # pre tags and raw text won't be indented.
488                    if block.outer_tag != 'pre' and not has_raw_text(line):
489                        line = "\t{0}".format(line)
490
                # reset the multiline paragraph flag
492                if block.tag == 'p' and ext:
493                    multiline_para = False
494            # no tag specified
495            else:
496                # if we're inside an extended block, add the text from the
497                # previous line to the front.
498                if ext and out:
499                    if block.tag == 'p':
500                        line = generate_tag(block.tag, line, block.outer_atts)
501                        multiline_para = True
502                    line = '{0}{1}'.format(out.pop(), line)
503                # the logic in the if statement below is a bit confusing in
504                # php-textile. I'm still not sure I understand what the php
505                # code is doing. Something tells me it's a phpsadness. Anyway,
506                # this works, and is much easier to understand: if we're not in
507                # an extension, and the line doesn't begin with a space, treat
508                # it like a block to insert. Lines that begin with a space are
509                # not processed as a block.
510                if not ext and not line[0] == ' ':
511                    block = Block(self, tag, atts, ext, cite, line)
512                    # if the block contains html tags, generate_tag would
513                    # mangle it, so process as is.
514                    if block.tag == 'p' and not has_raw_text(block.content):
515                        line = block.content
516                    else:
517                        line = generate_tag(block.outer_tag, block.content,
518                                block.outer_atts)
519                        line = "\t{0}".format(line)
520                else:
521                    if block.tag == 'pre' or block.inner_tag == 'code':
522                        line = self.shelve(encode_html(line, quotes=True))
523                    else:
524                        line = self.graf(line)
525
526                    if block.tag == 'p':
527                        escaped = True
528
529            if block.tag == 'p' and ext and not multiline_para:
530                line = generate_tag(block.tag, line, block.outer_atts)
531                multiline_para = True
532            else:
533                line = self.doPBr(line)
534            if not block.tag == 'p':
535                multiline_para = False
536
537            line = line.replace('<br>', '<br />')
538
539            # if we're in an extended block, and we haven't specified a new
540            # tag, join this line to the last item of the output
541            if ext and not match:
542                last_item = out.pop()
543                out.append('{0}{1}'.format(last_item, line))
544            elif not block.eat:
545                # or if it's a type of block which indicates we shouldn't drop
546                # it, add it to the output.
547                out.append(line)
548
549            if not ext:
550                tag = 'p'
551                atts = ''
552                cite = ''
553
554            # if it's a block we should drop, don't keep the whitespace which
555            # will come after it.
556            if block.eat:
557                eat_whitespace = True
558
559        # at this point, we've gone through all the lines. if there's still an
560        # extension in effect, we close it here
561        if ext and out and not block.tag == 'p':
562            block.content = out.pop()
563            block.process()
564            final = generate_tag(block.outer_tag, block.content,
565                                 block.outer_atts)
566            out.append(final)
567        return ''.join(out)
568
569    def footnoteRef(self, text):
570        # somehow php-textile gets away with not capturing the space.
571        return re.compile(r'(?<=\S)\[(?P<id>{0}+)(?P<nolink>!?)\]'
572                r'(?P<space>{1}?)'.format(regex_snippets['digit'],
573                    regex_snippets['space']), re.U).sub(self.footnoteID, text)
574
575    def footnoteID(self, m):
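        # Sketch: 'Once[1]' becomes roughly
        #   Once<sup class="footnote" id="fnrev..."><a href="#fn...">1</a></sup>
        # where the elided ids derive from linkPrefix plus a running index.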
576        fn_att = OrderedDict({'class': 'footnote'})
577        if m.group('id') not in self.fn:
578            self.fn[m.group('id')] = '{0}{1}'.format(self.linkPrefix,
579                    self._increment_link_index())
580            fnid = self.fn[m.group('id')]
581            fn_att['id'] = 'fnrev{0}'.format(fnid)
582        fnid = self.fn[m.group('id')]
583        footref = generate_tag('a', m.group('id'), {'href': '#fn{0}'.format(
584            fnid)})
585        if '!' == m.group('nolink'):
586            footref = m.group('id')
587        footref = generate_tag('sup', footref, fn_att)
588        return '{0}{1}'.format(footref, m.group('space'))
589
590    def glyphs(self, text):
591        """
        Because of the re.split call below, the regular expressions differ
        between the first chunk of text and all later chunks.
        For example, say the raw text provided is "*Here*'s some textile".
        Before it gets to this glyphs method, the text has been converted to
        "<strong>Here</strong>'s some textile".
        When run through the split, we end up with ["", "<strong>", "Here",
        "</strong>", "'s some textile"]; the odd-numbered indices are html
        tags and are passed through untouched.
        If the single quote is the first character of the whole input, it's
        an open single quote.  If it's the first character of one of the
        later chunks, it's an apostrophe or closed single quote, but the
        regex will bear that out.  A similar situation occurs for double
        quotes as well.
605        So, for the first pass, we use the glyph_search_initial set of
606        regexes.  For all remaining passes, we use glyph_search
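
        A rough illustration (entities per glyph_definitions):

        >>> Textile().glyphs("<strong>Here</strong>'s some textile")
        "<strong>Here</strong>&#8217;s some textile"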
607        """
608        text = text.rstrip('\n')
609        result = []
610        searchlist = self.glyph_search_initial
611        # split the text by any angle-bracketed tags
612        for i, line in enumerate(re.compile(r'(<[\w\/!?].*?>)', re.U).split(
613            text)):
614            if not i % 2:
615                for s, r in zip(searchlist, self.glyph_replace):
616                    line = s.sub(r, line)
617            result.append(line)
618            if i == 0:
619                searchlist = self.glyph_search
620        return ''.join(result)
621
622    def getRefs(self, text):
623        """Capture and store URL references in self.urlrefs."""
624        return self.url_ref_regex.sub(self.refs, text)
625
626    def refs(self, match):
627        flag, url = match.groups()
628        self.urlrefs[flag] = url
629        return ''
630
631    def relURL(self, url):
632        scheme = urlparse(url)[0]
633        if scheme and scheme not in self.url_schemes:
634            return '#'
635        return url
636
637    def shelve(self, text):
638        self.refIndex = self.refIndex + 1
639        itemID = '{0}{1}:shelve'.format(self.uid, self.refIndex)
640        self.shelf[itemID] = text
641        return itemID
642
643    def retrieve(self, text):
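        # Round-trip sketch: shelve() hands back an opaque token; this loop
        # swaps tokens for their stored text until nothing changes, so text
        # shelved inside other shelved text also resolves, e.g.
        #   tok = t.shelve('<b>raw</b>')
        #   t.retrieve('x {0} y'.format(tok))  # -> 'x <b>raw</b> y'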
644        while True:
645            old = text
646            for k, v in self.shelf.items():
647                text = text.replace(k, v)
648            if text == old:
649                break
650        return text
651
652    def graf(self, text):
653        if not self.lite:
654            text = self.noTextile(text)
655            text = self.code(text)
656
657        text = self.getHTMLComments(text)
658
659        text = self.getRefs(text)
660        text = self.links(text)
661
662        if not self.noimage:
663            text = self.image(text)
664
665        if not self.lite:
666            text = self.table(text)
667            text = self.redcloth_list(text)
668            text = self.textileLists(text)
669
670        text = self.span(text)
671        text = self.footnoteRef(text)
672        text = self.noteRef(text)
673        text = self.glyphs(text)
674
675        return text.rstrip('\n')
676
677    def links(self, text):
678        """For some reason, the part of the regex below that matches the url
679        does not match a trailing parenthesis.  It gets caught by tail, and
680        we check later to see if it should be included as part of the url."""
681        text = self.markStartOfLinks(text)
682
683        return self.replaceLinks(text)
684
685    def markStartOfLinks(self, text):
686        """Finds and marks the start of well formed links in the input text."""
687        # Slice text on '":<not space>' boundaries. These always occur in
688        # inline links between the link text and the url part and are much more
689        # infrequent than '"' characters so we have less possible links to
690        # process.
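        # e.g. 'see "Python":http://python.org now' gains a marker:
        #   'see {uid}linkStartMarker:"Python":http://python.org now'
        # (schematic; {uid} stands for this instance's uid).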
691        slice_re = re.compile(r'":(?={0})'.format(regex_snippets['char']))
692        slices = slice_re.split(text)
693        output = []
694
695        if len(slices) > 1:
696            # There are never any start of links in the last slice, so pop it
697            # off (we'll glue it back later).
698            last_slice = slices.pop()
699
700            for s in slices:
701                # If there is no possible start quote then this slice is not
702                # a link
703                if '"' not in s:
704                    output.append(s)
705                    continue
706                # Cut this slice into possible starting points wherever we find
707                # a '"' character. Any of these parts could represent the start
708                # of the link text - we have to find which one.
709                possible_start_quotes = s.split('"')
710
711                # Start our search for the start of the link with the closest
712                # prior quote mark.
713                possibility = possible_start_quotes.pop()
714
715                # Init the balanced count. If this is still zero at the end of
716                # our do loop we'll mark the " that caused it to balance as the
717                # start of the link and move on to the next slice.
718                balanced = 0
719                linkparts = []
720                i = 0
721
722                while balanced != 0 or i == 0: # pragma: no branch
723                    # Starting at the end, pop off the previous part of the
724                    # slice's fragments.
725
726                    # Add this part to those parts that make up the link text.
727                    linkparts.append(possibility)
728
729                    if len(possibility) > 0:
730                        # did this part inc or dec the balanced count?
731                        if re.search(r'^\S|=$', possibility, flags=re.U): # pragma: no branch
732                            balanced = balanced - 1
733                        if re.search(r'\S$', possibility, flags=re.U): # pragma: no branch
734                            balanced = balanced + 1
735                        try:
736                            possibility = possible_start_quotes.pop()
737                        except IndexError:
738                            break
739                    else:
740                        # If quotes occur next to each other, we get zero
741                        # length strings.  eg. ...""Open the door,
742                        # HAL!"":url...  In this case we count a zero length in
743                        # the last position as a closing quote and others as
744                        # opening quotes.
745                        if i == 0:
746                            balanced = balanced + 1
747                        else:
748                            balanced = balanced - 1
749                        i = i + 1
750
751                        try:
752                            possibility = possible_start_quotes.pop()
753                        except IndexError: # pragma: no cover
                            # If we run out of possible starting segments, we
                            # pop the last one back off the linkparts array
756                            linkparts.pop()
757                            break
758                        # If the next possibility is empty or ends in a space
759                        # we have a closing ".
760                        if (possibility == '' or possibility.endswith(' ')):
761                            # force search exit
                            balanced = 0
763
764                    if balanced <= 0:
765                        possible_start_quotes.append(possibility)
766                        break
767
768                # Rebuild the link's text by reversing the parts and sticking
769                # them back together with quotes.
770                linkparts.reverse()
771                link_content = '"'.join(linkparts)
772                # Rebuild the remaining stuff that goes before the link but
773                # that's already in order.
774                pre_link = '"'.join(possible_start_quotes)
775                # Re-assemble the link starts with a specific marker for the
776                # next regex.
777                o = '{0}{1}linkStartMarker:"{2}'.format(pre_link, self.uid,
778                        link_content)
779                output.append(o)
780
781            # Add the last part back
782            output.append(last_slice)
783            # Re-assemble the full text with the start and end markers
784            text = '":'.join(output)
785
786        return text
787
788    def replaceLinks(self, text):
789        """Replaces links with tokens and stores them on the shelf."""
790        stopchars = r"\s|^'\"*"
791        pattern = r"""
792            (?P<pre>\[)?           # Optionally open with a square bracket eg. Look ["here":url]
793            {0}linkStartMarker:"   # marks start of the link
794            (?P<inner>(?:.|\n)*?)  # grab the content of the inner "..." part of the link, can be anything but
795                                   # do not worry about matching class, id, lang or title yet
796            ":                     # literal ": marks end of atts + text + title block
797            (?P<urlx>[^{1}]*)      # url upto a stopchar
798        """.format(self.uid, stopchars)
799        text = re.compile(pattern, flags=re.X | re.U).sub(self.fLink, text)
800        return text
801
802    def fLink(self, m):
803        in_ = m.group()
804        pre, inner, url = m.groups()
805        pre = pre or ''
806
807        if inner == '':
808            return '{0}"{1}":{2}'.format(pre, inner, url)
809
810        m = re.search(r'''^
811            (?P<atts>{0})                # $atts (if any)
812            {1}*                         # any optional spaces
813            (?P<text>                    # $text is...
814                (!.+!)                   #     an image
815            |                            #   else...
816                .+?                      #     link text
817            )                            # end of $text
818            (?:\((?P<title>[^)]+?)\))?   # $title (if any)
819            $'''.format(cls_re_s, regex_snippets['space']), inner,
820                flags=re.X | re.U)
821
822        atts = (m and m.group('atts')) or ''
823        text = (m and m.group('text')) or inner
824        title = (m and m.group('title')) or ''
825
826        pop, tight = '', ''
827        counts = { '[': None, ']': url.count(']'), '(': None, ')': None }
828
829        # Look for footnotes or other square-bracket delimited stuff at the end
830        # of the url...
831        #
832        # eg. "text":url][otherstuff... will have "[otherstuff" popped back
833        # out.
834        #
835        # "text":url?q[]=x][123]    will have "[123]" popped off the back, the
836        # remaining closing square brackets will later be tested for balance
837        if (counts[']']):
838            m = re.search(r'(?P<url>^.*\])(?P<tight>\[.*?)$', url, flags=re.U)
839            if m:
840                url, tight = m.groups()
841
842        # Split off any trailing text that isn't part of an array assignment.
843        # eg. "text":...?q[]=value1&q[]=value2 ... is ok
844        # "text":...?q[]=value1]following  ... would have "following" popped
845        # back out and the remaining square bracket will later be tested for
846        # balance
        if (counts[']']):
            m = re.search(r'(?P<url>^.*\])(?!=)(?P<end>.*?)$', url, flags=re.U)
            # the search can fail when every ']' is part of an array
            # assignment; in that case leave the url alone.
            if m:
                url = m.group('url')
                tight = '{0}{1}'.format(m.group('end'), tight)
851
        # Now we have the list of all the chars in the url.  We will parse the
        # uri backwards and pop off any chars that don't belong at the end
        # (like . or , or unmatched brackets of various kinds).
855        first = True
856        popped = True
857
858        counts[']'] = url.count(']')
859        url_chars = list(url)
860
861        def _endchar(c, pop, popped, url_chars, counts, pre):
862            """Textile URL shouldn't end in these characters, we pop them off
863            the end and push them out the back of the url again."""
864            pop = '{0}{1}'.format(c, pop)
865            url_chars.pop()
866            popped = True
867            return pop, popped, url_chars, counts, pre
868
        def _rightanglebracket(c, pop, popped, url_chars, counts, pre):
            url_chars.pop()
            urlLeft = ''.join(url_chars)

            m = re.search(r'(?P<url_chars>.*)(?P<tag><\/[a-z]+)$', urlLeft)
            # keep url_chars a list so later iterations can still pop() it
            url_chars = list(m.group('url_chars'))
            pop = '{0}{1}{2}'.format(m.group('tag'), c, pop)
            popped = True
            return pop, popped, url_chars, counts, pre
878
879        def _closingsquarebracket(c, pop, popped, url_chars, counts, pre):
880            """If we find a closing square bracket we are going to see if it is
881            balanced.  If it is balanced with matching opening bracket then it
882            is part of the URL else we spit it back out of the URL."""
883            # If counts['['] is None, count the occurrences of '['
884            counts['['] = counts['['] or url.count('[')
885
886            if counts['['] == counts[']']:
887                # It is balanced, so keep it
888                url_chars.append(c)
889            else:
890                # In the case of un-matched closing square brackets we just eat
891                # it
892                popped = True
893                url_chars.pop()
                counts[']'] = counts[']'] - 1
895                if first: # pragma: no branch
896                    pre = ''
897            return pop, popped, url_chars, counts, pre
898
899        def _closingparenthesis(c, pop, popped, url_chars, counts, pre):
900            if counts[')'] is None: # pragma: no branch
901                counts['('] = url.count('(')
902                counts[')'] = url.count(')')
903
904            if counts['('] != counts[')']:
905                # Unbalanced so spit it out the back end
906                popped = True
907                pop = '{0}{1}'.format(url_chars.pop(), pop)
908                counts[')'] = counts[')'] - 1
909            return pop, popped, url_chars, counts, pre
910
911        def _casesdefault(c, pop, popped, url_chars, counts, pre):
912            return pop, popped, url_chars, counts, pre
913
914        cases = {
915                '!': _endchar,
916                '?': _endchar,
917                ':': _endchar,
918                ';': _endchar,
919                '.': _endchar,
920                ',': _endchar,
921                '>': _rightanglebracket,
922                ']': _closingsquarebracket,
923                ')': _closingparenthesis,
924                }
925        for c in url_chars[-1::-1]: # pragma: no branch
926            popped = False
927            pop, popped, url_chars, counts, pre = cases.get(c,
928                    _casesdefault)(c, pop, popped, url_chars, counts, pre)
929            first = False
930            if popped is False:
931                break
932
933        url = ''.join(url_chars)
934        uri_parts = urlsplit(url)
935
936        scheme_in_list = uri_parts.scheme in self.url_schemes
937        valid_scheme = (uri_parts.scheme and scheme_in_list)
938        if not is_valid_url(url) and not valid_scheme:
939            return in_.replace('{0}linkStartMarker:'.format(self.uid), '')
940
941        if text == '$':
942            text = url
943            if "://" in text:
944                text = text.split("://")[1]
945            elif ":" in text:
946                text = text.split(":")[1]
947
948        text = text.strip()
949        title = encode_html(title)
950
951        if not self.noimage: # pragma: no branch
952            text = self.image(text)
953        text = self.span(text)
954        text = self.glyphs(text)
955        url = self.shelveURL(self.encode_url(urlunsplit(uri_parts)))
956        attributes = parse_attributes(atts, restricted=self.restricted)
957        attributes['href'] = url
958        if title:
            # if the title contains unicode data, it is annoying to get Python
            # 2.6 and all later versions working properly.  But shelving the
            # title is a quick and dirty solution.
962            attributes['title'] = self.shelve(title)
963        if self.rel:
964            attributes['rel'] = self.rel
965        a_text = generate_tag('a', text, attributes)
966        a_shelf_id = self.shelve(a_text)
967
968        out = '{0}{1}{2}{3}'.format(pre, a_shelf_id, pop, tight)
969
970        return out
971
972    def encode_url(self, url):
973        """
        Converts a (unicode) URL to an ASCII URL, with the userinfo and path
        parts %-encoded (as per RFC 3986); the host is passed through as-is.
976
977        Fixed version of the following code fragment from Stack Overflow:
978            http://stackoverflow.com/a/804380/72656
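
        A quick, illustrative round trip:

        >>> Textile().encode_url('http://example.com/a b?q=1#frag')
        'http://example.com/a%20b?q=1#frag'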
979        """
980        # parse it
981        parsed = urlsplit(url)
982
983        if parsed.netloc:
984            # divide the netloc further
985            netloc_pattern = re.compile(r"""
986                (?:(?P<user>[^:@]+)(?::(?P<password>[^:@]+))?@)?
987                (?P<host>[^:]+)
988                (?::(?P<port>[0-9]+))?
989            """, re.X | re.U)
990            netloc_parsed = netloc_pattern.match(parsed.netloc).groupdict()
991        else:
992            netloc_parsed = {'user': '', 'password': '', 'host': '', 'port':
993                    ''}
994
995        # encode each component
996        scheme = parsed.scheme
997        user = netloc_parsed['user'] and quote(netloc_parsed['user'])
998        password = (netloc_parsed['password'] and
999                    quote(netloc_parsed['password']))
1000        host = netloc_parsed['host']
        port = netloc_parsed['port']
        # the below splits the path portion of the url by slashes, translates
        # percent-encoded characters back into strings, then re-percent-encodes
        # what's necessary. Sounds screwy, but the url could include encoded
        # slashes, and this is a way to clean that up.
1008        path_parts = (quote(unquote(pce), b'') for pce in
1009                parsed.path.split('/'))
1010        path = '/'.join(path_parts)
1011
1012        # put it back together
1013        netloc = ''
1014        if user:
1015            netloc = '{0}{1}'.format(netloc, user)
1016            if password:
1017                netloc = '{0}:{1}'.format(netloc, password)
1018            netloc = '{0}@'.format(netloc)
1019        netloc = '{0}{1}'.format(netloc, host)
1020        if port:
1021            netloc = '{0}:{1}'.format(netloc, port)
1022        return urlunsplit((scheme, netloc, path, parsed.query, parsed.fragment))
1023
1024    def span(self, text):
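        # Sketch (markers per the qtags map in fSpan below):
        #   'a *strong* and _em_ word'
        #     -> 'a <strong>strong</strong> and <em>em</em> word'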
1025        qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__',
1026                 r'_', r'%', r'\+', r'~', r'\^')
1027        pnct = r""".,"'?!;:‹›«»„“”‚‘’"""
1028        self.span_depth = self.span_depth + 1
1029
1030        if self.span_depth <= self.max_span_depth:
1031            for tag in qtags:
1032                pattern = re.compile(r"""
1033                    (?P<pre>^|(?<=[\s>{pnct}\(])|[{{[])
1034                    (?P<tag>{tag})(?!{tag})
1035                    (?P<atts>{cls})
1036                    (?!{tag})
1037                    (?::(?P<cite>\S+[^{tag}]{space}))?
1038                    (?P<content>[^{space}{tag}]+|\S.*?[^\s{tag}\n])
1039                    (?P<end>[{pnct}]*)
1040                    {tag}
1041                    (?P<tail>$|[\[\]}}<]|(?=[{pnct}]{{1,2}}[^0-9]|\s|\)))
1042                """.format(**{'tag': tag, 'cls': cls_re_s, 'pnct': pnct,
1043                    'space': regex_snippets['space']}), flags=re.X | re.U)
1044                text = pattern.sub(self.fSpan, text)
1045        self.span_depth = self.span_depth - 1
1046        return text
1047
1048    def fSpan(self, match):
1049        pre, tag, atts, cite, content, end, tail = match.groups()
1050
1051        qtags = {
1052            '*':  'strong',
1053            '**': 'b',
1054            '??': 'cite',
1055            '_':  'em',
1056            '__': 'i',
1057            '-':  'del',
1058            '%':  'span',
1059            '+':  'ins',
1060            '~':  'sub',
1061            '^':  'sup'
1062        }
1063
1064        tag = qtags[tag]
1065        atts = pba(atts, restricted=self.restricted)
1066        if cite:
1067            atts = '{0} cite="{1}"'.format(atts, cite.rstrip())
1068
1069        content = self.span(content)
1070
1071        out = "<{0}{1}>{2}{3}</{4}>".format(tag, atts, content, end, tag)
1072        if pre and not tail or tail and not pre:
1073            out = '{0}{1}{2}'.format(pre, out, tail)
1074        return out
1075
1076    def image(self, text):
1077        pattern = re.compile(r"""
1078            (?:[\[{{])?         # pre
1079            \!                  # opening !
1080            (\<|\=|\>)?         # optional alignment atts
1081            ({0})               # optional style,class atts
1082            (?:\.\s)?           # optional dot-space
1083            ([^\s(!]+)          # presume this is the src
1084            \s?                 # optional space
1085            (?:\(([^\)]+)\))?   # optional title
1086            \!                  # closing
1087            (?::(\S+))?         # optional href
1088            (?:[\]}}]|(?=\s|$)) # lookahead: space or end of string
1089        """.format(cls_re_s), re.U | re.X)
1090        return pattern.sub(self.fImage, text)
1091
1092    def fImage(self, match):
1093        # (None, '', '/imgs/myphoto.jpg', None, None)
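        # End-to-end sketch (via parse(), since the urls get shelved here):
        #   '!/imgs/myphoto.jpg(My photo)!' -> a <p> wrapping
        #   '<img alt="My photo" src="/imgs/myphoto.jpg" title="My photo" />'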
1094        align, attributes, url, title, href = match.groups()
1095        atts = OrderedDict()
1096        size = None
1097
1098        alignments = {'<': 'left', '=': 'center', '>': 'right'}
1099
1100        if not title:
1101            title = ''
1102
1103        if not is_rel_url(url) and self.get_sizes:
1104            size = imagesize.getimagesize(url)
1105
1106        if href:
1107            href = self.shelveURL(href)
1108
1109        url = self.shelveURL(url)
1110
1111        if align:
1112            atts.update(align=alignments[align])
1113        atts.update(alt=title)
1114        if size:
1115            atts.update(height="{0}".format(size[1]))
1116        atts.update(src=url)
1117        if attributes:
1118            atts.update(parse_attributes(attributes, restricted=self.restricted))
1119        if title:
1120            atts.update(title=title)
1121        if size:
1122            atts.update(width="{0}".format(size[0]))
1123        img = generate_tag('img', ' /', atts)
1124        if href:
1125            a_atts = OrderedDict(href=href)
1126            if self.rel:
1127                a_atts.update(rel=self.rel)
1128            img = generate_tag('a', img, a_atts)
1129        return img
1130
1131    def code(self, text):
1132        text = self.doSpecial(text, '<code>', '</code>', self.fCode)
1133        text = self.doSpecial(text, '@', '@', self.fCode)
1134        text = self.doSpecial(text, '<pre>', '</pre>', self.fPre)
1135        return text
1136
1137    def fCode(self, match):
1138        before, text, after = match.groups()
1139        after = after or ''
1140        # text needs to be escaped
1141        text = encode_html(text, quotes=False)
1142        return ''.join([before, self.shelve('<code>{0}</code>'.format(text)), after])
1143
1144    def fPre(self, match):
1145        before, text, after = match.groups()
1146        if after is None:
1147            after = ''
1148        # text needs to be escaped
1149        text = encode_html(text)
1150        return ''.join([before, '<pre>', self.shelve(text), '</pre>', after])
1151
1152    def doSpecial(self, text, start, end, method):
1153        pattern = re.compile(r'(^|\s|[\[({{>|]){0}(.*?){1}($|[\])}}])?'.format(
1154            re.escape(start), re.escape(end)), re.M | re.S)
1155        return pattern.sub(method, text)
1156
1157    def noTextile(self, text):
1158        text = self.doSpecial(text, '<notextile>', '</notextile>',
1159                              self.fTextile)
1160        return self.doSpecial(text, '==', '==', self.fTextile)
1161
1162    def fTextile(self, match):
1163        before, notextile, after = match.groups()
1164        if after is None: # pragma: no branch
1165            after = ''
1166        return ''.join([before, self.shelve(notextile), after])
1167
1168    def getHTMLComments(self, text):
1169        """Search the string for HTML comments, e.g. <!-- comment text -->.  We
1170        send the text that matches this to fParseHTMLComments."""
1171        return self.doSpecial(text, '<!--', '-->', self.fParseHTMLComments)
1172
1173    def fParseHTMLComments(self, match):
1174        """If self.restricted is True, clean the matched contents of the HTML
1175        comment.  Otherwise, return the comments unchanged.
1176        The original php had an if statement in here regarding restricted mode.
1177        nose reported that this line wasn't covered.  It's correct.  In
1178        restricted mode, the html comment tags have already been converted to
        &lt;!&#8212; and &#8212;&gt; so they don't match in getHTMLComments,
1180        and never arrive here.
1181        """
1182        before, commenttext, after = match.groups()
1183        commenttext = self.shelve(commenttext)
1184        return '{0}<!--{1}-->'.format(before, commenttext)
1185
1186    def redcloth_list(self, text):
1187        """Parse the text for definition lists and send them to be
1188        formatted."""
1189        pattern = re.compile(r"^([-]+{0}[ .].*:=.*)$(?![^-])".format(cls_re_s),
1190                re.M | re.U | re.S)
1191        return pattern.sub(self.fRCList, text)
1192
1193    def fRCList(self, match):
1194        """Format a definition list."""
1195        out = []
1196        text = re.split(r'\n(?=[-])', match.group(), flags=re.M)
1197        for line in text:
1198            # parse the attributes and content
1199            m = re.match(r'^[-]+({0})[ .](.*)$'.format(cls_re_s), line,
1200                    flags=re.M | re.S)
1201            if not m:
1202                continue
1203
1204            atts, content = m.groups()
1205            # cleanup
1206            content = content.strip()
1207            atts = pba(atts, restricted=self.restricted)
1208
1209            # split the content into the term and definition
1210            xm = re.match(r'^(.*?)[\s]*:=(.*?)[\s]*(=:|:=)?[\s]*$', content,
1211                          re.S)
1212            term, definition, ending = xm.groups()
1213            # cleanup
1214            term = term.strip()
1215            definition = definition.strip(' ')
1216
            # the first time through, out is still empty (falsy)
1218            if not out:
1219                if definition == '':
1220                    dltag = "<dl{0}>".format(atts)
1221                else:
1222                    dltag = "<dl>"
1223                out.append(dltag)
1224
1225            if definition != '' and term != '':
1226                if definition.startswith('\n'):
1227                    definition = '<p>{0}</p>'.format(definition.lstrip())
1228                definition = definition.replace('\n', '<br />').strip()
1229
1230                term = self.graf(term)
1231                definition = self.graf(definition)
1232
1233                out.extend(['\t<dt{0}>{1}</dt>'.format(atts, term),
1234                    '\t<dd>{0}</dd>'.format(definition)])
1235
1236        out.append('</dl>')
1237        out = '\n'.join(out)
1238        return out
1239
1240    def placeNoteLists(self, text):
1241        """Parse the text for endnotes."""
1242        if self.notes:
1243            o = OrderedDict()
1244            for label, info in self.notes.items():
1245                if 'seq' in info:
1246                    i = info['seq']
1247                    info['seq'] = label
1248                    o[i] = info
1249                else:
1250                    self.unreferencedNotes[label] = info
1251
1252            if o: # pragma: no branch
1253                # sort o by key
1254                o = OrderedDict(sorted(o.items(), key=lambda t: t[0]))
1255            self.notes = o
1256        text_re = re.compile(r'<p>notelist({0})(?:\:([\w|{1}]))?([\^!]?)(\+?)'
1257                r'\.?[\s]*</p>'.format(cls_re_s, syms_re_s), re.U)
1258        text = text_re.sub(self.fNoteLists, text)
1259        return text
1260
1261    def fNoteLists(self, match):
1262        """Given the text that matches as a note, format it into HTML."""
1263        att, start_char, g_links, extras = match.groups()
1264        start_char = start_char or 'a'
1265        index = '{0}{1}{2}'.format(g_links, extras, start_char)
1266        result = ''
1267
1268        if index not in self.notelist_cache: # pragma: no branch
1269            o = []
1270            if self.notes: # pragma: no branch
1271                for seq, info in self.notes.items():
1272                    links = self.makeBackrefLink(info, g_links, start_char)
1273                    atts = ''
1274                    if 'def' in info:
1275                        infoid = info['id']
1276                        atts = info['def']['atts']
1277                        content = info['def']['content']
1278                        li = ('\t\t<li{0}>{1}<span id="note{2}"> '
1279                                '</span>{3}</li>').format(atts, links, infoid,
1280                                        content)
1281                    else:
                        li = ('\t\t<li{0}>{1} Undefined Note [#{2}].</li>'
                                ).format(atts, links, info['seq'])
1284                    o.append(li)
1285            if '+' == extras and self.unreferencedNotes:
1286                for seq, info in self.unreferencedNotes.items():
1287                    atts = info['def']['atts']
1288                    content = info['def']['content']
1289                    li = '\t\t<li{0}>{1}</li>'.format(atts, content)
1290                    o.append(li)
1291            self.notelist_cache[index] = "\n".join(o)
1292            result = self.notelist_cache[index]
1293        list_atts = pba(att, restricted=self.restricted)
1294        result = '<ol{0}>\n{1}\n\t</ol>'.format(list_atts, result)
1295        return result
1296
1297    def makeBackrefLink(self, info, g_links, i):
1298        """Given the pieces of a back reference link, create an <a> tag."""
1299        atts, content, infoid, link = '', '', '', ''
1300        if 'def' in info:
1301            link = info['def']['link']
1302        backlink_type = link or g_links
1303        i_ = encode_high(i)
1304        allow_inc = i not in syms_re_s
1305        i_ = int(i_)
1306
1307        if backlink_type == "!":
1308            return ''
1309        elif backlink_type == '^':
1310            return """<sup><a href="#noteref{0}">{1}</a></sup>""".format(
1311                info['refids'][0], i)
1312        else:
1313            result = []
1314            for refid in info['refids']:
1315                i_entity = decode_high(i_)
1316                sup = """<sup><a href="#noteref{0}">{1}</a></sup>""".format(
1317                        refid, i_entity)
1318                if allow_inc:
1319                    i_ = i_ + 1
1320                result.append(sup)
1321            result = ' '.join(result)
1322            return result
1323
1324    def fParseNoteDefs(self, m):
1325        """Parse the note definitions and format them as HTML"""
1326        label = m.group('label')
1327        link = m.group('link')
1328        att = m.group('att')
1329        content = m.group('content')
1330
1331        # Assign an id if the note reference parse hasn't found the label yet.
1332        if label not in self.notes:
1333            self.notes[label] = {'id': '{0}{1}'.format(self.linkPrefix,
1334                self._increment_link_index())}
1335
1336        # Ignores subsequent defs using the same label
1337        if 'def' not in self.notes[label]: # pragma: no branch
1338            self.notes[label]['def'] = {'atts': pba(att, restricted=self.restricted), 'content':
1339                    self.graf(content), 'link': link}
1340        return ''
1341
1342    def noteRef(self, text):
1343        """Search the text looking for note references."""
1344        text_re = re.compile(r"""
1345        \[          # start
1346        ({0})       # !atts
1347        \#
1348        ([^\]!]+)   # !label
1349        ([!]?)      # !nolink
1350        \]""".format(cls_re_s), re.X)
1351        text = text_re.sub(self.fParseNoteRefs, text)
1352        return text
1353
1354    def fParseNoteRefs(self, match):
1355        """Parse and format the matched text into note references.
1356        By the time this function is called, all the defs will have been
1357        processed into the notes array. So now we can resolve the link numbers
1358        in the order we process the refs..."""
1359        atts, label, nolink = match.groups()
1360        atts = pba(atts, restricted=self.restricted)
1361        nolink = nolink == '!'
1362
1363        # Assign a sequence number to this reference if there isn't one already
1364        if label in self.notes:
1365            num = self.notes[label]['seq']
1366        else:
1367            self.notes[label] = {
1368                'seq': self.note_index, 'refids': [], 'id': ''
1369            }
1370            num = self.note_index
1371            self.note_index = self.note_index + 1
1372
1373        # Make our anchor point and stash it for possible use in backlinks when
1374        # the note list is generated later...
1375        refid = '{0}{1}'.format(self.linkPrefix, self._increment_link_index())
1376        self.notes[label]['refids'].append(refid)
1377
1378        # If we are referencing a note that hasn't had the definition parsed
1379        # yet, then assign it an ID...
1380        if not self.notes[label]['id']:
1381            self.notes[label]['id'] = '{0}{1}'.format(self.linkPrefix,
1382                    self._increment_link_index())
1383        labelid = self.notes[label]['id']
1384
1385        # Build the link (if any)...
1386        result = '<span id="noteref{0}">{1}</span>'.format(refid, num)
1387        if not nolink:
1388            result = '<a href="#note{0}">{1}</a>'.format(labelid, result)
1389
1390        # Build the reference...
1391        result = '<sup{0}>{1}</sup>'.format(atts, result)
1392        return result
1393
1394    def shelveURL(self, text):
1395        if text == '':
1396            return ''
1397        self.refIndex = self.refIndex + 1
1398        self.refCache[self.refIndex] = text
1399        output = '{0}{1}{2}'.format(self.uid, self.refIndex, ':url')
1400        return output
1401
1402    def retrieveURLs(self, text):
1403        return re.sub(r'{0}(?P<token>[0-9]+):url'.format(self.uid), self.retrieveURL, text)
1404
1405    def retrieveURL(self, match):
1406        url = self.refCache.get(int(match.group('token')), '')
1407        if url == '':
1408            return url
1409
1410        if url in self.urlrefs:
1411            url = self.urlrefs[url]
1412
1413        return url
1414
1415    def _increment_link_index(self):
1416        """The self.linkIndex property needs to be incremented in various
1417        places.  Don't Repeat Yourself."""
1418        self.linkIndex = self.linkIndex + 1
1419        return self.linkIndex
1420
1421
1422def textile(text, html_type='xhtml'):
1423    """
1424    Apply Textile to a block of text.
1425
1426    This function takes the following additional parameters:
1427
1428    html_type - 'xhtml' or 'html5' style tags (default: 'xhtml')
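
    A minimal, illustrative call:

    >>> textile('h1. Hello')
    '\t<h1>Hello</h1>'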
1429
1430    """
1431    return Textile(html_type=html_type).parse(text)
1432
1433
1434def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'):
1435    """
1436    Apply Textile to a block of text, with restrictions designed for weblog
1437    comments and other untrusted input.  Raw HTML is escaped, style attributes
1438    are disabled, and rel='nofollow' is added to external links.
1439
1440    This function takes the following additional parameters:
1441
1442    html_type - 'xhtml' or 'html5' style tags (default: 'xhtml')
1443    lite - restrict block tags to p, bq, and bc, disable tables (default: True)
1444    noimage - disable image tags (default: True)
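
    For instance, raw HTML is escaped rather than passed through
    (illustrative):

    >>> textile_restricted('<b>hi</b>')
    '\t<p>&lt;b&gt;hi&lt;/b&gt;</p>'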
1445
1446    """
1447    return Textile(restricted=True, lite=lite, noimage=noimage,
1448            html_type=html_type, rel='nofollow').parse(
1449                    text)
1450