1# -*- coding: iso-8859-1 -*-
2"""
3    Creole wiki markup parser
4
5    See http://wikicreole.org/ for latest specs.
6
7    Notes:
8    * No markup allowed in headings.
9      Creole 1.0 does not require us to support this.
10    * No markup allowed in table headings.
11      Creole 1.0 does not require us to support this.
12    * No (non-bracketed) generic url recognition: this is "mission impossible"
13      except if you want to risk lots of false positives. Only known protocols
14      are recognized.
15    * We do not allow ":" before "//" italic markup to avoid urls with
16      unrecognized schemes (like wtf://server/path) triggering italic rendering
17      for the rest of the paragraph.
18
19    @copyright: 2007 MoinMoin:RadomirDopieralski (creole 0.5 implementation),
20                2007 MoinMoin:ThomasWaldmann (updates)
21    @license: GNU GPL, see COPYING for details.
22    @license: BSD, see COPYING for details.
23"""
24
25import re
26import sys
27
28__version__ = '1.1'
29
30
class Rules:
    """Hold all the rules for generating regular expressions.

    The class attributes are regular expression fragments written for
    ``re.X`` (verbose) mode, so whitespace inside them is not significant.
    ``__init__`` joins them into the compiled patterns stored on the
    instance: ``pre_escape_re``, ``link_re``, ``item_re``, ``cell_re``,
    ``block_re`` and ``inline_re``.

    Every alternative is wrapped in a named group; the names are what the
    compiled patterns expose through ``match.groupdict()``.
    """

    # For the inline elements:
    proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc'
    link = r'''(?P<link>
            \[\[
            (?P<link_target>.+?) \s*
            ([|] \s* (?P<link_text>.+?) \s*)?
            ]]
        )'''
    image = r'''(?P<image>
            {{
            (?P<image_target>.+?) \s*
            ([|] \s* (?P<image_text>.+?) \s*)?
            }}
        )'''
    macro = r'''(?P<macro>
            <<
            (?P<macro_name> \w+)
            (\( (?P<macro_args> .*?) \))? \s*
            ([|] \s* (?P<macro_text> .+?) \s* )?
            >>
        )'''
    code = r'(?P<code> {{{ (?P<code_text>.*?) }}} )'
    emph = r'(?P<emph> (?<!:)// )' # there must be no : in front of the //
                                   # avoids italic rendering in urls with
                                   # unknown protocols
    strong = r'(?P<strong> \*\* )'
    linebreak = r'(?P<break> \\\\ )'
    escape = r'(?P<escape> ~ (?P<escaped_char>\S) )'
    char =  r'(?P<char> . )'

    # For the block elements:
    separator = r'(?P<separator> ^ \s* ---- \s* $ )' # horizontal line
    line = r'(?P<line> ^ \s* $ )' # empty line that separates paragraphs
    head = r'''(?P<head>
            ^ \s*
            (?P<head_head>=+) \s*
            (?P<head_text> .*? ) \s*
            (?P<head_tail>=*) \s*
            $
        )'''
    text = r'(?P<text> .+ )'
    list = r'''(?P<list>
            ^ [ \t]* ([*][^*\#]|[\#][^\#*]).* $
            ( \n[ \t]* [*\#]+.* $ )*
        )''' # Matches the whole list, separate items are parsed later. The
             # list *must* start with a single bullet.
    item = r'''(?P<item>
            ^ \s*
            (?P<item_head> [\#*]+) \s*
            (?P<item_text> .*?)
            $
        )''' # Matches single list items
    pre = r'''(?P<pre>
            ^{{{ \s* $
            (\n)?
            (?P<pre_text>
                ([\#]!(?P<pre_kind>\w*?)(\s+.*)?$)?
                (.|\n)+?
            )
            (\n)?
            ^}}} \s*$
        )'''
    pre_escape = r' ^(?P<indent>\s*) ~ (?P<rest> \}\}\} \s*) $'
    table = r'''(?P<table>
            ^ \s*
            [|].*? \s*
            [|]? \s*
            $
        )'''

    # For splitting table cells:
    cell = r'''
            \| \s*
            (
                (?P<head> [=][^|]+ ) |
                (?P<cell> (  %s | [^|])+ )
            ) \s*
        ''' % '|'.join([link, macro, image, code])

    def __init__(self, bloglike_lines=False, url_protocols=None,
                 wiki_words=False):
        """Compile the patterns, optionally enabling extra features.

        bloglike_lines -- if true, the block-level ``text`` rule also
            captures an optional ``break`` group for a line end that is not
            preceded by a backslash and not followed by a blank line.
        url_protocols -- optional iterable of protocol names that replaces
            the default ``proto`` list for raw url recognition; each name
            is passed through ``re.escape``.
        wiki_words -- if true, add a ``wiki`` rule to the inline patterns
            that matches words containing two segments that each start
            with an uppercase letter (unicode category ``Lu``).
        """
        c = re.compile
        # For pre escaping, in creole 1.0 done with ~:
        self.pre_escape_re = c(self.pre_escape, re.M | re.X)
        # for link descriptions
        self.link_re = c('|'.join([self.image, self.linebreak,
                                   self.char]), re.X | re.U)
        # for list items
        self.item_re = c(self.item, re.X | re.U | re.M)
        # for table cells
        self.cell_re = c(self.cell, re.X | re.U)

        # For block elements:
        if bloglike_lines:
            # Shadows the class-level ``text`` rule on this instance only.
            self.text = r'(?P<text> .+ ) (?P<break> (?<!\\)$\n(?!\s*$) )?'
        self.block_re = c('|'.join([self.line, self.head, self.separator,
                                    self.pre, self.list, self.table,
                                    self.text]), re.X | re.U | re.M)

        # For inline elements:
        if url_protocols is not None:
            self.proto = '|'.join(re.escape(p) for p in url_protocols)
        self.url =  r'''(?P<url>
            (^ | (?<=\s | [.,:;!?()/=]))
            (?P<escaped_url>~)?
            (?P<url_target> (?P<url_proto> %s ):\S+? )
            ($ | (?=\s | [,.:;!?()] (\s | $))))''' % self.proto
        inline_elements = [self.link, self.url, self.macro,
                           self.code, self.image, self.strong,
                           self.emph, self.linebreak,
                           self.escape, self.char]
        if wiki_words:
            import unicodedata
            # ``unichr`` and ``xrange`` only exist on Python 2; fall back to
            # the Python 3 equivalents so the same code runs on both.
            try:
                to_char, code_points = unichr, xrange
            except NameError:
                to_char, code_points = chr, range
            all_chars = (to_char(i) for i in code_points(sys.maxunicode + 1))
            up_case = u''.join(ch for ch in all_chars
                               if unicodedata.category(ch) == 'Lu')
            # Same pattern text as a raw literal would give; spelled with
            # escaped backslashes because the ``ur`` string prefix is a
            # syntax error on Python 3.
            self.wiki = u'(?P<wiki>[%s]\\w+[%s]\\w+)' % (up_case, up_case)
            # Inserted before ``code`` so wiki words are tried right after
            # links, urls and macros.
            inline_elements.insert(3, self.wiki)
        self.inline_re = c('|'.join(inline_elements), re.X | re.U)
152
class Parser:
    """
    Parse the raw text and create a document object
    that can be converted into output using Emitter.

    A separate instance should be created for parsing a new document.
    The first parameter is the raw text to be parsed. An optional second
    argument is the Rules object to use. You can customize the parsing
    rules to enable optional features or extend the parser.
    """

    def __init__(self, raw, rules=None):
        """Store the raw text and rules and create the empty document root."""
        self.rules = rules or Rules()
        self.raw = raw
        self.root = DocNode('document', None)
        self.cur = self.root        # The most recent document node
        self.text = None            # The node to add inline characters to

    def _upto(self, node, kinds):
        """
        Look up the tree to the first occurrence
        of one of the listed kinds of nodes or root.
        Start at the node node.
        """
        while node.parent is not None and node.kind not in kinds:
            node = node.parent
        return node

    # The _*_repl methods called for matches in regexps. Sometimes the
    # same method needs several names, because of group names in regexps.

    def _url_repl(self, groups):
        """Handle raw urls in text."""

        if not groups.get('escaped_url'):
            # this url is NOT escaped
            target = groups.get('url_target', '')
            node = DocNode('link', self.cur)
            node.content = target
            DocNode('text', node, node.content)
            self.text = None
        else:
            # this url is escaped, we render it as text
            if self.text is None:
                self.text = DocNode('text', self.cur, u'')
            self.text.content += groups.get('url_target')
    _url_target_repl = _url_repl
    _url_proto_repl = _url_repl
    # Handlers are looked up as '_<group>_repl', so the alias for the
    # 'escaped_url' group needs the suffix; the unsuffixed name is kept
    # only so existing references to it keep working.
    _escaped_url_repl = _url_repl
    _escaped_url = _url_repl

    def _link_repl(self, groups):
        """Handle all kinds of links."""

        target = groups.get('link_target', '')
        text = (groups.get('link_text', '') or '').strip()
        parent = self.cur
        self.cur = DocNode('link', self.cur)
        self.cur.content = target
        self.text = None
        # The link description is parsed with the reduced link_re rules
        # (images, line breaks and plain characters only).
        re.sub(self.rules.link_re, self._replace, text)
        self.cur = parent
        self.text = None
    _link_target_repl = _link_repl
    _link_text_repl = _link_repl

    def _wiki_repl(self, groups):
        """Handle WikiWord links, if enabled."""

        text = groups.get('wiki', '')
        node = DocNode('link', self.cur)
        node.content = text
        DocNode('text', node, node.content)
        self.text = None

    def _macro_repl(self, groups):
        """Handles macros using the placeholder syntax."""

        name = groups.get('macro_name', '')
        text = (groups.get('macro_text', '') or '').strip()
        node = DocNode('macro', self.cur, name)
        node.args = groups.get('macro_args', '') or ''
        # The macro name doubles as the text when no text was given.
        DocNode('text', node, text or name)
        self.text = None
    _macro_name_repl = _macro_repl
    _macro_args_repl = _macro_repl
    _macro_text_repl = _macro_repl

    def _image_repl(self, groups):
        """Handles images and attachments included in the page."""

        target = groups.get('image_target', '').strip()
        text = (groups.get('image_text', '') or '').strip()
        node = DocNode("image", self.cur, target)
        # The target doubles as the text when no text was given.
        DocNode('text', node, text or node.content)
        self.text = None
    _image_target_repl = _image_repl
    _image_text_repl = _image_repl

    def _separator_repl(self, groups):
        """Add a separator node to the enclosing container node."""
        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
        DocNode('separator', self.cur)

    def _item_repl(self, groups):
        """Add one list item, opening a new list node where needed.

        The last character of the bullet selects the list kind and the
        bullet length is the nesting level.
        """
        bullet = groups.get('item_head', u'')
        text = groups.get('item_text', u'')
        if bullet[-1] == '#':
            kind = 'number_list'
        else:
            kind = 'bullet_list'
        level = len(bullet)
        lst = self.cur
        # Find a list of the same kind and level up the tree
        while (lst and
                   not (lst.kind in ('number_list', 'bullet_list') and
                        lst.level == level) and
                    lst.kind not in ('document', 'section', 'blockquote')):
            lst = lst.parent
        if lst and lst.kind == kind:
            self.cur = lst
        else:
            # Create a new level of list
            self.cur = self._upto(self.cur,
                ('list_item', 'document', 'section', 'blockquote'))
            self.cur = DocNode(kind, self.cur)
            self.cur.level = level
        self.cur = DocNode('list_item', self.cur)
        self.parse_inline(text)
        self.text = None
    _item_text_repl = _item_repl
    _item_head_repl = _item_repl

    def _list_repl(self, groups):
        """Split a whole matched list into items and handle each of them."""
        text = groups.get('list', u'')
        self.rules.item_re.sub(self._replace, text)

    def _head_repl(self, groups):
        """Add a header node; its level is the number of leading '='."""
        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
        node = DocNode('header', self.cur, groups.get('head_text', '').strip())
        node.level = len(groups.get('head_head', ' '))
    _head_head_repl = _head_repl
    _head_text_repl = _head_repl

    def _text_repl(self, groups):
        """Handle a line of plain text at block level.

        A new paragraph is opened when the current node is a container;
        otherwise the line continues the current node and is joined to the
        preceding text with a single space.
        """
        text = groups.get('text', '')
        if self.cur.kind in ('table', 'table_row', 'bullet_list',
            'number_list'):
            self.cur = self._upto(self.cur,
                ('document', 'section', 'blockquote'))
        if self.cur.kind in ('document', 'section', 'blockquote'):
            self.cur = DocNode('paragraph', self.cur)
        else:
            text = u' ' + text
        self.parse_inline(text)
        # The 'break' group is only part of the text rule when the rules
        # were built with bloglike_lines.
        if groups.get('break') and self.cur.kind in ('paragraph',
            'emphasis', 'strong', 'code'):
            DocNode('break', self.cur, '')
        self.text = None

    def _table_repl(self, groups):
        """Add one table row, opening a new table node where needed."""
        row = groups.get('table', '|').strip()
        self.cur = self._upto(self.cur, (
            'table', 'document', 'section', 'blockquote'))
        if self.cur.kind != 'table':
            self.cur = DocNode('table', self.cur)
        tb = self.cur
        tr = DocNode('table_row', tb)

        for m in self.rules.cell_re.finditer(row):
            cell = m.group('cell')
            if cell:
                self.cur = DocNode('table_cell', tr)
                self.text = None
                self.parse_inline(cell)
            else:
                # Heading cells get no inline parsing; only the '='
                # markers are stripped from the cell text.
                cell = m.group('head')
                self.cur = DocNode('table_head', tr)
                self.text = DocNode('text', self.cur, u'')
                self.text.content = cell.strip('=')
        self.cur = tb
        self.text = None

    def _pre_repl(self, groups):
        """Add a preformatted node holding the text of a {{{ ... }}} block."""
        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
        kind = groups.get('pre_kind', None)
        text = groups.get('pre_text', u'')
        def remove_tilde(m):
            # Drop the '~' that escaped a closing '}}}' line.
            return m.group('indent') + m.group('rest')
        text = self.rules.pre_escape_re.sub(remove_tilde, text)
        node = DocNode('preformatted', self.cur, text)
        node.sect = kind or ''
        self.text = None
    _pre_text_repl = _pre_repl
    _pre_head_repl = _pre_repl
    _pre_kind_repl = _pre_repl

    def _line_repl(self, groups):
        """Handle an empty line: go back up to the enclosing container."""
        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))

    def _code_repl(self, groups):
        """Add an inline code node with the stripped code text."""
        DocNode('code', self.cur, groups.get('code_text', u'').strip())
        self.text = None
    _code_text_repl = _code_repl
    _code_head_repl = _code_repl

    def _emph_repl(self, groups):
        """Toggle emphasis: open a node, or leave the current one."""
        if self.cur.kind != 'emphasis':
            self.cur = DocNode('emphasis', self.cur)
        else:
            self.cur = self._upto(self.cur, ('emphasis', )).parent
        self.text = None

    def _strong_repl(self, groups):
        """Toggle strong: open a node, or leave the current one."""
        if self.cur.kind != 'strong':
            self.cur = DocNode('strong', self.cur)
        else:
            self.cur = self._upto(self.cur, ('strong', )).parent
        self.text = None

    def _break_repl(self, groups):
        """Handle a forced line break inside inline text."""
        DocNode('break', self.cur, None)
        self.text = None

    def _escape_repl(self, groups):
        """Append the escaped character to the current text node."""
        if self.text is None:
            self.text = DocNode('text', self.cur, u'')
        self.text.content += groups.get('escaped_char', u'')

    def _char_repl(self, groups):
        """Append a plain character to the current text node."""
        if self.text is None:
            self.text = DocNode('text', self.cur, u'')
        self.text.content += groups.get('char', u'')

    def _replace(self, match):
        """Invoke the appropriate _*_repl method for a regexp match.

        The handler is chosen by the matched named group with the lowest
        group index. Named groups nested inside a rule always have a higher
        index than the rule's own group, so the choice does not depend on
        dictionary ordering and nested helper groups need no handler of
        their own. Nothing is called when no named group took part in the
        match.
        """

        groups = match.groupdict()
        matched = [name for name in groups if groups[name] is not None]
        if not matched:
            return
        index_of = match.re.groupindex
        name = min(matched, key=lambda group_name: index_of[group_name])
        replace = getattr(self, '_%s_repl' % name)
        replace(groups)

    def parse_inline(self, raw):
        """Recognize inline elements inside blocks."""

        # sub() is used only to visit every match; the result is discarded.
        re.sub(self.rules.inline_re, self._replace, raw)

    def parse_block(self, raw):
        """Recognize block elements."""

        # sub() is used only to visit every match; the result is discarded.
        re.sub(self.rules.block_re, self._replace, raw)

    def parse(self):
        """Parse the text given as self.raw and return DOM tree."""

        self.parse_block(self.raw)
        return self.root
412
413#################### Helper classes
414
415### The document model
416
class DocNode:
    """
    A node in the document tree.

    Each node records its kind, its content, its parent node and the list
    of its children. A node created with a parent appends itself to that
    parent's children.
    """

    def __init__(self, kind='', parent=None, content=None):
        self.kind = kind
        self.content = content
        self.parent = parent
        self.children = []
        if parent is None:
            # A root node has nothing to register with.
            return
        parent.children.append(self)
429
430
431