1"""
2Python Markdown
3
4A Python implementation of John Gruber's Markdown.
5
6Documentation: https://python-markdown.github.io/
7GitHub: https://github.com/Python-Markdown/markdown/
8PyPI: https://pypi.org/project/Markdown/
9
10Started by Manfred Stienstra (http://www.dwerg.net/).
11Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
12Currently maintained by Waylan Limberg (https://github.com/waylan),
13Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14
15Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
16Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
17Copyright 2004 Manfred Stienstra (the original version)
18
19License: BSD (see LICENSE.md for details).
20
21CORE MARKDOWN BLOCKPARSER
22===========================================================================
23
24This parser handles basic parsing of Markdown blocks.  It doesn't concern
25itself with inline elements such as **bold** or *italics*, but rather just
26catches blocks, lists, quotes, etc.
27
28The BlockParser is made up of a bunch of BlockProcessors, each handling a
29different type of block. Extensions may add/replace/remove BlockProcessors
30as they need to alter how markdown blocks are parsed.
31"""
32
33import logging
34import re
35import xml.etree.ElementTree as etree
36from . import util
37from .blockparser import BlockParser
38
39logger = logging.getLogger('MARKDOWN')
40
41
42def build_block_parser(md, **kwargs):
43    """ Build the default block parser used by Markdown. """
44    parser = BlockParser(md)
45    parser.blockprocessors.register(EmptyBlockProcessor(parser), 'empty', 100)
46    parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 90)
47    parser.blockprocessors.register(CodeBlockProcessor(parser), 'code', 80)
48    parser.blockprocessors.register(HashHeaderProcessor(parser), 'hashheader', 70)
49    parser.blockprocessors.register(SetextHeaderProcessor(parser), 'setextheader', 60)
50    parser.blockprocessors.register(HRProcessor(parser), 'hr', 50)
51    parser.blockprocessors.register(OListProcessor(parser), 'olist', 40)
52    parser.blockprocessors.register(UListProcessor(parser), 'ulist', 30)
53    parser.blockprocessors.register(BlockQuoteProcessor(parser), 'quote', 20)
54    parser.blockprocessors.register(ReferenceProcessor(parser), 'reference', 15)
55    parser.blockprocessors.register(ParagraphProcessor(parser), 'paragraph', 10)
56    return parser
57
58
59class BlockProcessor:
60    """ Base class for block processors.
61
62    Each subclass will provide the methods below to work with the source and
63    tree. Each processor will need to define it's own ``test`` and ``run``
64    methods. The ``test`` method should return True or False, to indicate
65    whether the current block should be processed by this processor. If the
66    test passes, the parser will call the processors ``run`` method.
67
68    """
69
70    def __init__(self, parser):
71        self.parser = parser
72        self.tab_length = parser.md.tab_length
73
74    def lastChild(self, parent):
75        """ Return the last child of an etree element. """
76        if len(parent):
77            return parent[-1]
78        else:
79            return None
80
81    def detab(self, text, length=None):
82        """ Remove a tab from the front of each line of the given text. """
83        if length is None:
84            length = self.tab_length
85        newtext = []
86        lines = text.split('\n')
87        for line in lines:
88            if line.startswith(' ' * length):
89                newtext.append(line[length:])
90            elif not line.strip():
91                newtext.append('')
92            else:
93                break
94        return '\n'.join(newtext), '\n'.join(lines[len(newtext):])
95
96    def looseDetab(self, text, level=1):
97        """ Remove a tab from front of lines but allowing dedented lines. """
98        lines = text.split('\n')
99        for i in range(len(lines)):
100            if lines[i].startswith(' '*self.tab_length*level):
101                lines[i] = lines[i][self.tab_length*level:]
102        return '\n'.join(lines)
103
104    def test(self, parent, block):
105        """ Test for block type. Must be overridden by subclasses.
106
107        As the parser loops through processors, it will call the ``test``
108        method on each to determine if the given block of text is of that
109        type. This method must return a boolean ``True`` or ``False``. The
110        actual method of testing is left to the needs of that particular
111        block type. It could be as simple as ``block.startswith(some_string)``
112        or a complex regular expression. As the block type may be different
113        depending on the parent of the block (i.e. inside a list), the parent
114        etree element is also provided and may be used as part of the test.
115
116        Keywords:
117
118        * ``parent``: A etree element which will be the parent of the block.
119        * ``block``: A block of text from the source which has been split at
120            blank lines.
121        """
122        pass  # pragma: no cover
123
124    def run(self, parent, blocks):
125        """ Run processor. Must be overridden by subclasses.
126
127        When the parser determines the appropriate type of a block, the parser
128        will call the corresponding processor's ``run`` method. This method
129        should parse the individual lines of the block and append them to
130        the etree.
131
132        Note that both the ``parent`` and ``etree`` keywords are pointers
133        to instances of the objects which should be edited in place. Each
134        processor must make changes to the existing objects as there is no
135        mechanism to return new/different objects to replace them.
136
137        This means that this method should be adding SubElements or adding text
138        to the parent, and should remove (``pop``) or add (``insert``) items to
139        the list of blocks.
140
141        Keywords:
142
143        * ``parent``: A etree element which is the parent of the current block.
144        * ``blocks``: A list of all remaining blocks of the document.
145        """
146        pass  # pragma: no cover
147
148
149class ListIndentProcessor(BlockProcessor):
150    """ Process children of list items.
151
152    Example:
153        * a list item
154            process this part
155
156            or this part
157
158    """
159
160    ITEM_TYPES = ['li']
161    LIST_TYPES = ['ul', 'ol']
162
163    def __init__(self, *args):
164        super().__init__(*args)
165        self.INDENT_RE = re.compile(r'^(([ ]{%s})+)' % self.tab_length)
166
167    def test(self, parent, block):
168        return block.startswith(' '*self.tab_length) and \
169            not self.parser.state.isstate('detabbed') and \
170            (parent.tag in self.ITEM_TYPES or
171                (len(parent) and parent[-1] is not None and
172                    (parent[-1].tag in self.LIST_TYPES)))
173
174    def run(self, parent, blocks):
175        block = blocks.pop(0)
176        level, sibling = self.get_level(parent, block)
177        block = self.looseDetab(block, level)
178
179        self.parser.state.set('detabbed')
180        if parent.tag in self.ITEM_TYPES:
181            # It's possible that this parent has a 'ul' or 'ol' child list
182            # with a member.  If that is the case, then that should be the
183            # parent.  This is intended to catch the edge case of an indented
184            # list whose first member was parsed previous to this point
185            # see OListProcessor
186            if len(parent) and parent[-1].tag in self.LIST_TYPES:
187                self.parser.parseBlocks(parent[-1], [block])
188            else:
189                # The parent is already a li. Just parse the child block.
190                self.parser.parseBlocks(parent, [block])
191        elif sibling.tag in self.ITEM_TYPES:
192            # The sibling is a li. Use it as parent.
193            self.parser.parseBlocks(sibling, [block])
194        elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
195            # The parent is a list (``ol`` or ``ul``) which has children.
196            # Assume the last child li is the parent of this block.
197            if sibling[-1].text:
198                # If the parent li has text, that text needs to be moved to a p
199                # The p must be 'inserted' at beginning of list in the event
200                # that other children already exist i.e.; a nested sublist.
201                p = etree.Element('p')
202                p.text = sibling[-1].text
203                sibling[-1].text = ''
204                sibling[-1].insert(0, p)
205            self.parser.parseChunk(sibling[-1], block)
206        else:
207            self.create_item(sibling, block)
208        self.parser.state.reset()
209
210    def create_item(self, parent, block):
211        """ Create a new li and parse the block with it as the parent. """
212        li = etree.SubElement(parent, 'li')
213        self.parser.parseBlocks(li, [block])
214
215    def get_level(self, parent, block):
216        """ Get level of indent based on list level. """
217        # Get indent level
218        m = self.INDENT_RE.match(block)
219        if m:
220            indent_level = len(m.group(1))/self.tab_length
221        else:
222            indent_level = 0
223        if self.parser.state.isstate('list'):
224            # We're in a tightlist - so we already are at correct parent.
225            level = 1
226        else:
227            # We're in a looselist - so we need to find parent.
228            level = 0
229        # Step through children of tree to find matching indent level.
230        while indent_level > level:
231            child = self.lastChild(parent)
232            if (child is not None and
233               (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)):
234                if child.tag in self.LIST_TYPES:
235                    level += 1
236                parent = child
237            else:
238                # No more child levels. If we're short of indent_level,
239                # we have a code block. So we stop here.
240                break
241        return level, parent
242
243
244class CodeBlockProcessor(BlockProcessor):
245    """ Process code blocks. """
246
247    def test(self, parent, block):
248        return block.startswith(' '*self.tab_length)
249
250    def run(self, parent, blocks):
251        sibling = self.lastChild(parent)
252        block = blocks.pop(0)
253        theRest = ''
254        if (sibling is not None and sibling.tag == "pre" and
255           len(sibling) and sibling[0].tag == "code"):
256            # The previous block was a code block. As blank lines do not start
257            # new code blocks, append this block to the previous, adding back
258            # linebreaks removed from the split into a list.
259            code = sibling[0]
260            block, theRest = self.detab(block)
261            code.text = util.AtomicString(
262                '{}\n{}\n'.format(code.text, util.code_escape(block.rstrip()))
263            )
264        else:
265            # This is a new codeblock. Create the elements and insert text.
266            pre = etree.SubElement(parent, 'pre')
267            code = etree.SubElement(pre, 'code')
268            block, theRest = self.detab(block)
269            code.text = util.AtomicString('%s\n' % util.code_escape(block.rstrip()))
270        if theRest:
271            # This block contained unindented line(s) after the first indented
272            # line. Insert these lines as the first block of the master blocks
273            # list for future processing.
274            blocks.insert(0, theRest)
275
276
277class BlockQuoteProcessor(BlockProcessor):
278
279    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')
280
281    def test(self, parent, block):
282        return bool(self.RE.search(block)) and not util.nearing_recursion_limit()
283
284    def run(self, parent, blocks):
285        block = blocks.pop(0)
286        m = self.RE.search(block)
287        if m:
288            before = block[:m.start()]  # Lines before blockquote
289            # Pass lines before blockquote in recursively for parsing forst.
290            self.parser.parseBlocks(parent, [before])
291            # Remove ``> `` from beginning of each line.
292            block = '\n'.join(
293                [self.clean(line) for line in block[m.start():].split('\n')]
294            )
295        sibling = self.lastChild(parent)
296        if sibling is not None and sibling.tag == "blockquote":
297            # Previous block was a blockquote so set that as this blocks parent
298            quote = sibling
299        else:
300            # This is a new blockquote. Create a new parent element.
301            quote = etree.SubElement(parent, 'blockquote')
302        # Recursively parse block with blockquote as parent.
303        # change parser state so blockquotes embedded in lists use p tags
304        self.parser.state.set('blockquote')
305        self.parser.parseChunk(quote, block)
306        self.parser.state.reset()
307
308    def clean(self, line):
309        """ Remove ``>`` from beginning of a line. """
310        m = self.RE.match(line)
311        if line.strip() == ">":
312            return ""
313        elif m:
314            return m.group(2)
315        else:
316            return line
317
318
319class OListProcessor(BlockProcessor):
320    """ Process ordered list blocks. """
321
322    TAG = 'ol'
323    # The integer (python string) with which the lists starts (default=1)
324    # Eg: If list is intialized as)
325    #   3. Item
326    # The ol tag will get starts="3" attribute
327    STARTSWITH = '1'
328    # Lazy ol - ignore startswith
329    LAZY_OL = True
330    # List of allowed sibling tags.
331    SIBLING_TAGS = ['ol', 'ul']
332
333    def __init__(self, parser):
334        super().__init__(parser)
335        # Detect an item (``1. item``). ``group(1)`` contains contents of item.
336        self.RE = re.compile(r'^[ ]{0,%d}\d+\.[ ]+(.*)' % (self.tab_length - 1))
337        # Detect items on secondary lines. they can be of either list type.
338        self.CHILD_RE = re.compile(r'^[ ]{0,%d}((\d+\.)|[*+-])[ ]+(.*)' %
339                                   (self.tab_length - 1))
340        # Detect indented (nested) items of either type
341        self.INDENT_RE = re.compile(r'^[ ]{%d,%d}((\d+\.)|[*+-])[ ]+.*' %
342                                    (self.tab_length, self.tab_length * 2 - 1))
343
344    def test(self, parent, block):
345        return bool(self.RE.match(block))
346
347    def run(self, parent, blocks):
348        # Check fr multiple items in one block.
349        items = self.get_items(blocks.pop(0))
350        sibling = self.lastChild(parent)
351
352        if sibling is not None and sibling.tag in self.SIBLING_TAGS:
353            # Previous block was a list item, so set that as parent
354            lst = sibling
355            # make sure previous item is in a p- if the item has text,
356            # then it isn't in a p
357            if lst[-1].text:
358                # since it's possible there are other children for this
359                # sibling, we can't just SubElement the p, we need to
360                # insert it as the first item.
361                p = etree.Element('p')
362                p.text = lst[-1].text
363                lst[-1].text = ''
364                lst[-1].insert(0, p)
365            # if the last item has a tail, then the tail needs to be put in a p
366            # likely only when a header is not followed by a blank line
367            lch = self.lastChild(lst[-1])
368            if lch is not None and lch.tail:
369                p = etree.SubElement(lst[-1], 'p')
370                p.text = lch.tail.lstrip()
371                lch.tail = ''
372
373            # parse first block differently as it gets wrapped in a p.
374            li = etree.SubElement(lst, 'li')
375            self.parser.state.set('looselist')
376            firstitem = items.pop(0)
377            self.parser.parseBlocks(li, [firstitem])
378            self.parser.state.reset()
379        elif parent.tag in ['ol', 'ul']:
380            # this catches the edge case of a multi-item indented list whose
381            # first item is in a blank parent-list item:
382            # * * subitem1
383            #     * subitem2
384            # see also ListIndentProcessor
385            lst = parent
386        else:
387            # This is a new list so create parent with appropriate tag.
388            lst = etree.SubElement(parent, self.TAG)
389            # Check if a custom start integer is set
390            if not self.LAZY_OL and self.STARTSWITH != '1':
391                lst.attrib['start'] = self.STARTSWITH
392
393        self.parser.state.set('list')
394        # Loop through items in block, recursively parsing each with the
395        # appropriate parent.
396        for item in items:
397            if item.startswith(' '*self.tab_length):
398                # Item is indented. Parse with last item as parent
399                self.parser.parseBlocks(lst[-1], [item])
400            else:
401                # New item. Create li and parse with it as parent
402                li = etree.SubElement(lst, 'li')
403                self.parser.parseBlocks(li, [item])
404        self.parser.state.reset()
405
406    def get_items(self, block):
407        """ Break a block into list items. """
408        items = []
409        for line in block.split('\n'):
410            m = self.CHILD_RE.match(line)
411            if m:
412                # This is a new list item
413                # Check first item for the start index
414                if not items and self.TAG == 'ol':
415                    # Detect the integer value of first list item
416                    INTEGER_RE = re.compile(r'(\d+)')
417                    self.STARTSWITH = INTEGER_RE.match(m.group(1)).group()
418                # Append to the list
419                items.append(m.group(3))
420            elif self.INDENT_RE.match(line):
421                # This is an indented (possibly nested) item.
422                if items[-1].startswith(' '*self.tab_length):
423                    # Previous item was indented. Append to that item.
424                    items[-1] = '{}\n{}'.format(items[-1], line)
425                else:
426                    items.append(line)
427            else:
428                # This is another line of previous item. Append to that item.
429                items[-1] = '{}\n{}'.format(items[-1], line)
430        return items
431
432
433class UListProcessor(OListProcessor):
434    """ Process unordered list blocks. """
435
436    TAG = 'ul'
437
438    def __init__(self, parser):
439        super().__init__(parser)
440        # Detect an item (``1. item``). ``group(1)`` contains contents of item.
441        self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % (self.tab_length - 1))
442
443
444class HashHeaderProcessor(BlockProcessor):
445    """ Process Hash Headers. """
446
447    # Detect a header at start of any line in block
448    RE = re.compile(r'(?:^|\n)(?P<level>#{1,6})(?P<header>(?:\\.|[^\\])*?)#*(?:\n|$)')
449
450    def test(self, parent, block):
451        return bool(self.RE.search(block))
452
453    def run(self, parent, blocks):
454        block = blocks.pop(0)
455        m = self.RE.search(block)
456        if m:
457            before = block[:m.start()]  # All lines before header
458            after = block[m.end():]     # All lines after header
459            if before:
460                # As the header was not the first line of the block and the
461                # lines before the header must be parsed first,
462                # recursively parse this lines as a block.
463                self.parser.parseBlocks(parent, [before])
464            # Create header using named groups from RE
465            h = etree.SubElement(parent, 'h%d' % len(m.group('level')))
466            h.text = m.group('header').strip()
467            if after:
468                # Insert remaining lines as first block for future parsing.
469                blocks.insert(0, after)
470        else:  # pragma: no cover
471            # This should never happen, but just in case...
472            logger.warn("We've got a problem header: %r" % block)
473
474
475class SetextHeaderProcessor(BlockProcessor):
476    """ Process Setext-style Headers. """
477
478    # Detect Setext-style header. Must be first 2 lines of block.
479    RE = re.compile(r'^.*?\n[=-]+[ ]*(\n|$)', re.MULTILINE)
480
481    def test(self, parent, block):
482        return bool(self.RE.match(block))
483
484    def run(self, parent, blocks):
485        lines = blocks.pop(0).split('\n')
486        # Determine level. ``=`` is 1 and ``-`` is 2.
487        if lines[1].startswith('='):
488            level = 1
489        else:
490            level = 2
491        h = etree.SubElement(parent, 'h%d' % level)
492        h.text = lines[0].strip()
493        if len(lines) > 2:
494            # Block contains additional lines. Add to  master blocks for later.
495            blocks.insert(0, '\n'.join(lines[2:]))
496
497
498class HRProcessor(BlockProcessor):
499    """ Process Horizontal Rules. """
500
501    # Python's re module doesn't officially support atomic grouping. However you can fake it.
502    # See https://stackoverflow.com/a/13577411/866026
503    RE = r'^[ ]{0,3}(?=(?P<atomicgroup>(-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,}))(?P=atomicgroup)[ ]*$'
504    # Detect hr on any line of a block.
505    SEARCH_RE = re.compile(RE, re.MULTILINE)
506
507    def test(self, parent, block):
508        m = self.SEARCH_RE.search(block)
509        if m:
510            # Save match object on class instance so we can use it later.
511            self.match = m
512            return True
513        return False
514
515    def run(self, parent, blocks):
516        block = blocks.pop(0)
517        match = self.match
518        # Check for lines in block before hr.
519        prelines = block[:match.start()].rstrip('\n')
520        if prelines:
521            # Recursively parse lines before hr so they get parsed first.
522            self.parser.parseBlocks(parent, [prelines])
523        # create hr
524        etree.SubElement(parent, 'hr')
525        # check for lines in block after hr.
526        postlines = block[match.end():].lstrip('\n')
527        if postlines:
528            # Add lines after hr to master blocks for later parsing.
529            blocks.insert(0, postlines)
530
531
532class EmptyBlockProcessor(BlockProcessor):
533    """ Process blocks that are empty or start with an empty line. """
534
535    def test(self, parent, block):
536        return not block or block.startswith('\n')
537
538    def run(self, parent, blocks):
539        block = blocks.pop(0)
540        filler = '\n\n'
541        if block:
542            # Starts with empty line
543            # Only replace a single line.
544            filler = '\n'
545            # Save the rest for later.
546            theRest = block[1:]
547            if theRest:
548                # Add remaining lines to master blocks for later.
549                blocks.insert(0, theRest)
550        sibling = self.lastChild(parent)
551        if (sibling is not None and sibling.tag == 'pre' and
552           len(sibling) and sibling[0].tag == 'code'):
553            # Last block is a codeblock. Append to preserve whitespace.
554            sibling[0].text = util.AtomicString(
555                '{}{}'.format(sibling[0].text, filler)
556            )
557
558
559class ReferenceProcessor(BlockProcessor):
560    """ Process link references. """
561    RE = re.compile(
562        r'^[ ]{0,3}\[([^\]]*)\]:[ ]*\n?[ ]*([^\s]+)[ ]*\n?[ ]*((["\'])(.*)\4|\((.*)\))?[ ]*$', re.MULTILINE
563    )
564
565    def test(self, parent, block):
566        return True
567
568    def run(self, parent, blocks):
569        block = blocks.pop(0)
570        m = self.RE.search(block)
571        if m:
572            id = m.group(1).strip().lower()
573            link = m.group(2).lstrip('<').rstrip('>')
574            title = m.group(5) or m.group(6)
575            self.parser.md.references[id] = (link, title)
576            if block[m.end():].strip():
577                # Add any content after match back to blocks as separate block
578                blocks.insert(0, block[m.end():].lstrip('\n'))
579            if block[:m.start()].strip():
580                # Add any content before match back to blocks as separate block
581                blocks.insert(0, block[:m.start()].rstrip('\n'))
582            return True
583        # No match. Restore block.
584        blocks.insert(0, block)
585        return False
586
587
588class ParagraphProcessor(BlockProcessor):
589    """ Process Paragraph blocks. """
590
591    def test(self, parent, block):
592        return True
593
594    def run(self, parent, blocks):
595        block = blocks.pop(0)
596        if block.strip():
597            # Not a blank block. Add to parent, otherwise throw it away.
598            if self.parser.state.isstate('list'):
599                # The parent is a tight-list.
600                #
601                # Check for any children. This will likely only happen in a
602                # tight-list when a header isn't followed by a blank line.
603                # For example:
604                #
605                #     * # Header
606                #     Line 2 of list item - not part of header.
607                sibling = self.lastChild(parent)
608                if sibling is not None:
609                    # Insetrt after sibling.
610                    if sibling.tail:
611                        sibling.tail = '{}\n{}'.format(sibling.tail, block)
612                    else:
613                        sibling.tail = '\n%s' % block
614                else:
615                    # Append to parent.text
616                    if parent.text:
617                        parent.text = '{}\n{}'.format(parent.text, block)
618                    else:
619                        parent.text = block.lstrip()
620            else:
621                # Create a regular paragraph
622                p = etree.SubElement(parent, 'p')
623                p.text = block.lstrip()
624