# -*- coding: iso-8859-1 -*-
"""
    Creole wiki markup parser

    See http://wikicreole.org/ for latest specs.

    Notes:
    * No markup allowed in headings.
      Creole 1.0 does not require us to support this.
    * No markup allowed in table headings.
      Creole 1.0 does not require us to support this.
    * No (non-bracketed) generic url recognition: this is "mission impossible"
      except if you want to risk lots of false positives. Only known protocols
      are recognized.
    * We do not allow ":" before "//" italic markup to avoid urls with
      unrecognized schemes (like wtf://server/path) triggering italic rendering
      for the rest of the paragraph.

    @copyright: 2007 MoinMoin:RadomirDopieralski (creole 0.5 implementation),
                2007 MoinMoin:ThomasWaldmann (updates)
    @license: GNU GPL, see COPYING for details.
    @license: BSD, see COPYING for details.
"""

import re
import sys

__version__ = '1.1'

# Python 2 / Python 3 compatibility for building the WikiWord character class.
if sys.version_info[0] >= 3:
    _unichr, _xrange = chr, range
else:
    _unichr, _xrange = unichr, xrange  # noqa: F821 (Python 2 builtins)


class Rules:
    """
    Hold all the rules for generating regular expressions.

    The class attributes are the pattern fragments (written for re.X verbose
    mode); __init__ combines them into the compiled expressions used by
    Parser: pre_escape_re, link_re, item_re, cell_re, block_re and inline_re.
    """

    # For the inline elements:
    proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc'
    link = r'''(?P<link>
            \[\[
            (?P<link_target>.+?) \s*
            ([|] \s* (?P<link_text>.+?) \s*)?
            ]]
        )'''
    image = r'''(?P<image>
            {{
            (?P<image_target>.+?) \s*
            ([|] \s* (?P<image_text>.+?) \s*)?
            }}
        )'''
    macro = r'''(?P<macro>
            <<
            (?P<macro_name> \w+)
            (\( (?P<macro_args> .*?) \))? \s*
            ([|] \s* (?P<macro_text> .+?) \s* )?
            >>
        )'''
    code = r'(?P<code> {{{ (?P<code_text>.*?) }}} )'
    emph = r'(?P<emph> (?<!:)// )'  # there must be no : in front of the //
                                    # avoids italic rendering in urls with
                                    # unknown protocols
    strong = r'(?P<strong> \*\* )'
    linebreak = r'(?P<break> \\\\ )'
    escape = r'(?P<escape> ~ (?P<escaped_char>\S) )'
    char = r'(?P<char> . )'

    # For the block elements:
    separator = r'(?P<separator> ^ \s* ---- \s* $ )'  # horizontal line
    line = r'(?P<line> ^ \s* $ )'  # empty line that separates paragraphs
    head = r'''(?P<head>
            ^ \s*
            (?P<head_head>=+) \s*
            (?P<head_text> .*? ) \s*
            (?P<head_tail>=*) \s*
            $
        )'''
    text = r'(?P<text> .+ )'
    list = r'''(?P<list>
            ^ [ \t]* ([*][^*\#]|[\#][^\#*]).* $
            ( \n[ \t]* [*\#]+.* $ )*
        )'''  # Matches the whole list, separate items are parsed later. The
              # list *must* start with a single bullet.
    item = r'''(?P<item>
            ^ \s*
            (?P<item_head> [\#*]+) \s*
            (?P<item_text> .*?)
            $
        )'''  # Matches single list items
    pre = r'''(?P<pre>
            ^{{{ \s* $
            (\n)?
            (?P<pre_text>
                ([\#]!(?P<pre_kind>\w*?)(\s+.*)?$)?
                (.|\n)+?
            )
            (\n)?
            ^}}} \s*$
        )'''
    pre_escape = r' ^(?P<indent>\s*) ~ (?P<rest> \}\}\} \s*) $'
    table = r'''(?P<table>
            ^ \s*
            [|].*? \s*
            [|]? \s*
            $
        )'''

    # For splitting table cells:
    cell = r'''
            \| \s*
            (
                (?P<head> [=][^|]+ ) |
                (?P<cell> (  %s | [^|])+ )
            ) \s*
        ''' % '|'.join([link, macro, image, code])

    def __init__(self, bloglike_lines=False, url_protocols=None,
                 wiki_words=False):
        """
        Compile the regular expressions used by the parser.

        bloglike_lines -- if true, the block-level text rule also captures a
                          line end (group "break") that is followed by a
                          non-blank line, so that the parser can emit a break
                          node for it.
        url_protocols  -- optional iterable of protocol names that replaces
                          the default list in ``proto``; every name is
                          regex-escaped.
        wiki_words     -- if true, a WikiWord rule (group "wiki") is added to
                          the inline rules.
        """
        c = re.compile
        # For pre escaping, in creole 1.0 done with ~:
        self.pre_escape_re = c(self.pre_escape, re.M | re.X)
        # for link descriptions
        self.link_re = c('|'.join([self.image, self.linebreak,
                                   self.char]), re.X | re.U)
        # for list items
        self.item_re = c(self.item, re.X | re.U | re.M)
        # for table cells
        self.cell_re = c(self.cell, re.X | re.U)

        # For block elements:
        if bloglike_lines:
            self.text = r'(?P<text> .+ ) (?P<break> (?<!\\)$\n(?!\s*$) )?'
        self.block_re = c('|'.join([self.line, self.head, self.separator,
                                    self.pre, self.list, self.table,
                                    self.text]), re.X | re.U | re.M)

        # For inline elements:
        if url_protocols is not None:
            self.proto = '|'.join(re.escape(p) for p in url_protocols)
        self.url = r'''(?P<url>
            (^ | (?<=\s | [.,:;!?()/=]))
            (?P<escaped_url>~)?
            (?P<url_target> (?P<url_proto> %s ):\S+? )
            ($ | (?=\s | [,.:;!?()] (\s | $))))''' % self.proto
        inline_elements = [self.link, self.url, self.macro,
                           self.code, self.image, self.strong,
                           self.emph, self.linebreak,
                           self.escape, self.char]
        if wiki_words:
            import unicodedata
            # All code points of category "Lu" (uppercase letter); none of
            # them needs escaping inside a character class.
            up_case = u''.join(_unichr(i) for i in _xrange(sys.maxunicode)
                               if unicodedata.category(_unichr(i)) == 'Lu')
            self.wiki = u'(?P<wiki>[%s]\\w+[%s]\\w+)' % (up_case, up_case)
            # Inserted after link, url and macro, before code.
            inline_elements.insert(3, self.wiki)
        self.inline_re = c('|'.join(inline_elements), re.X | re.U)


class Parser:
    """
    Parse the raw text and create a document object
    that can be converted into output using Emitter.

    A separate instance should be created for parsing a new document.
    The first parameter is the raw text to be parsed. An optional second
    argument is the Rules object to use. You can customize the parsing
    rules to enable optional features or extend the parser.
    """

    def __init__(self, raw, rules=None):
        """Remember the raw text and rules and create the empty document."""
        self.rules = rules or Rules()
        self.raw = raw
        self.root = DocNode('document', None)
        self.cur = self.root        # The most recent document node
        self.text = None            # The node to add inline characters to

    def _upto(self, node, kinds):
        """
        Look up the tree to the first occurrence
        of one of the listed kinds of nodes or root.
        Start at the node node.
        """
        while node.parent is not None and node.kind not in kinds:
            node = node.parent
        return node

    # The _*_repl methods called for matches in regexps. Sometimes the
    # same method needs several names, because of group names in regexps.

    def _url_repl(self, groups):
        """Handle raw urls in text."""

        if not groups.get('escaped_url'):
            # this url is NOT escaped
            target = groups.get('url_target', '')
            node = DocNode('link', self.cur)
            node.content = target
            DocNode('text', node, node.content)
            self.text = None
        else:
            # this url is escaped, we render it as text
            if self.text is None:
                self.text = DocNode('text', self.cur, u'')
            self.text.content += groups.get('url_target')
    _url_target_repl = _url_repl
    _url_proto_repl = _url_repl
    _escaped_url_repl = _url_repl
    # Historic, misspelled alias (lacks the "_repl" suffix the dispatcher
    # looks for); kept only for backward compatibility.
    _escaped_url = _url_repl

    def _link_repl(self, groups):
        """Handle all kinds of links."""

        target = groups.get('link_target', '')
        text = (groups.get('link_text', '') or '').strip()
        parent = self.cur
        self.cur = DocNode('link', self.cur)
        self.cur.content = target
        self.text = None
        # The link description may only contain images, breaks and text.
        re.sub(self.rules.link_re, self._replace, text)
        self.cur = parent
        self.text = None
    _link_target_repl = _link_repl
    _link_text_repl = _link_repl

    def _wiki_repl(self, groups):
        """Handle WikiWord links, if enabled."""

        text = groups.get('wiki', '')
        node = DocNode('link', self.cur)
        node.content = text
        DocNode('text', node, node.content)
        self.text = None

    def _macro_repl(self, groups):
        """Handles macros using the placeholder syntax."""

        name = groups.get('macro_name', '')
        text = (groups.get('macro_text', '') or '').strip()
        node = DocNode('macro', self.cur, name)
        node.args = groups.get('macro_args', '') or ''
        DocNode('text', node, text or name)
        self.text = None
    _macro_name_repl = _macro_repl
    _macro_args_repl = _macro_repl
    _macro_text_repl = _macro_repl

    def _image_repl(self, groups):
        """Handles images and attachments included in the page."""

        target = groups.get('image_target', '').strip()
        text = (groups.get('image_text', '') or '').strip()
        node = DocNode("image", self.cur, target)
        DocNode('text', node, text or node.content)
        self.text = None
    _image_target_repl = _image_repl
    _image_text_repl = _image_repl

    def _separator_repl(self, groups):
        """Add a separator (horizontal line) node at block level."""
        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
        DocNode('separator', self.cur)

    def _item_repl(self, groups):
        """Add one list item, opening a new (nested) list if needed."""
        bullet = groups.get('item_head', u'')
        text = groups.get('item_text', u'')
        # The last bullet character decides the kind of the list.
        if bullet[-1] == '#':
            kind = 'number_list'
        else:
            kind = 'bullet_list'
        level = len(bullet)
        lst = self.cur
        # Find a list of the same kind and level up the tree
        while (lst and
               not (lst.kind in ('number_list', 'bullet_list') and
                    lst.level == level) and
               lst.kind not in ('document', 'section', 'blockquote')):
            lst = lst.parent
        if lst and lst.kind == kind:
            self.cur = lst
        else:
            # Create a new level of list
            self.cur = self._upto(self.cur,
                ('list_item', 'document', 'section', 'blockquote'))
            self.cur = DocNode(kind, self.cur)
            self.cur.level = level
        self.cur = DocNode('list_item', self.cur)
        self.parse_inline(text)
        self.text = None
    _item_text_repl = _item_repl
    _item_head_repl = _item_repl

    def _list_repl(self, groups):
        """Split a whole list into its items and handle each of them."""
        text = groups.get('list', u'')
        self.rules.item_re.sub(self._replace, text)

    def _head_repl(self, groups):
        """Add a header node; its level is the number of leading "="."""
        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
        node = DocNode('header', self.cur, groups.get('head_text', '').strip())
        node.level = len(groups.get('head_head', ' '))
    _head_head_repl = _head_repl
    _head_text_repl = _head_repl
    _head_tail_repl = _head_repl

    def _text_repl(self, groups):
        """
        Handle a line of paragraph text.

        A new paragraph is opened when we are at block level, otherwise the
        line is appended (separated by a space) to the current node. With
        bloglike line breaks enabled, a captured "break" group adds a break
        node after the text.
        """
        text = groups.get('text', '')
        if self.cur.kind in ('table', 'table_row', 'bullet_list',
                             'number_list'):
            self.cur = self._upto(self.cur,
                ('document', 'section', 'blockquote'))
        if self.cur.kind in ('document', 'section', 'blockquote'):
            self.cur = DocNode('paragraph', self.cur)
        else:
            text = u' ' + text
        self.parse_inline(text)
        if groups.get('break') and self.cur.kind in ('paragraph',
                'emphasis', 'strong', 'code'):
            DocNode('break', self.cur, '')
        self.text = None
    # NOTE: no "_break_repl = _text_repl" alias here. The inline linebreak
    # rule uses the same group name and needs its own _break_repl (below);
    # _replace always dispatches on the outermost group, which for a block
    # level match is "text".

    def _table_repl(self, groups):
        """Add one table row, opening a new table if needed."""
        row = groups.get('table', '|').strip()
        self.cur = self._upto(self.cur, (
            'table', 'document', 'section', 'blockquote'))
        if self.cur.kind != 'table':
            self.cur = DocNode('table', self.cur)
        tb = self.cur
        tr = DocNode('table_row', tb)

        for m in self.rules.cell_re.finditer(row):
            cell = m.group('cell')
            if cell:
                # Normal cell: inline markup is allowed.
                self.cur = DocNode('table_cell', tr)
                self.text = None
                self.parse_inline(cell)
            else:
                # Heading cell: plain text only, without the "=" markers.
                cell = m.group('head')
                self.cur = DocNode('table_head', tr)
                self.text = DocNode('text', self.cur, u'')
                self.text.content = cell.strip('=')
        self.cur = tb
        self.text = None

    def _pre_repl(self, groups):
        """Add a preformatted block; "~}}}" lines are unescaped to "}}}"."""
        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
        kind = groups.get('pre_kind', None)
        text = groups.get('pre_text', u'')

        def remove_tilde(m):
            return m.group('indent') + m.group('rest')
        text = self.rules.pre_escape_re.sub(remove_tilde, text)
        node = DocNode('preformatted', self.cur, text)
        node.sect = kind or ''
        self.text = None
    _pre_text_repl = _pre_repl
    _pre_head_repl = _pre_repl
    _pre_kind_repl = _pre_repl

    def _line_repl(self, groups):
        """An empty line closes the current paragraph, list or table."""
        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))

    def _code_repl(self, groups):
        """Add an inline code node."""
        DocNode('code', self.cur, groups.get('code_text', u'').strip())
        self.text = None
    _code_text_repl = _code_repl
    _code_head_repl = _code_repl

    def _emph_repl(self, groups):
        """Open an emphasis node, or close the current one."""
        if self.cur.kind != 'emphasis':
            self.cur = DocNode('emphasis', self.cur)
        else:
            self.cur = self._upto(self.cur, ('emphasis', )).parent
        self.text = None

    def _strong_repl(self, groups):
        """Open a strong node, or close the current one."""
        if self.cur.kind != 'strong':
            self.cur = DocNode('strong', self.cur)
        else:
            self.cur = self._upto(self.cur, ('strong', )).parent
        self.text = None

    def _break_repl(self, groups):
        """Add a forced line break (inline "\\\\" markup)."""
        DocNode('break', self.cur, None)
        self.text = None

    def _escape_repl(self, groups):
        """Add the character following "~" as plain text."""
        if self.text is None:
            self.text = DocNode('text', self.cur, u'')
        self.text.content += groups.get('escaped_char', u'')
    _escaped_char_repl = _escape_repl

    def _char_repl(self, groups):
        """Add a single ordinary character to the current text node."""
        if self.text is None:
            self.text = DocNode('text', self.cur, u'')
        self.text.content += groups.get('char', u'')

    def _replace(self, match):
        """
        Invoke the appropriate _*_repl method for a regexp match.

        The handler is chosen by the named group with the lowest group
        number among those that took part in the match, i.e. the outermost
        group of the alternative that matched. This does not depend on the
        iteration order of the group dictionary. The handler receives the
        complete group dictionary. Nothing is returned, so re.sub() treats
        the replacement as empty.
        """
        groups = match.groupdict()
        matched = [(index, name)
                   for name, index in match.re.groupindex.items()
                   if groups[name] is not None]
        if matched:
            handler = getattr(self, '_%s_repl' % min(matched)[1])
            handler(groups)

    def parse_inline(self, raw):
        """Recognize inline elements inside blocks."""

        re.sub(self.rules.inline_re, self._replace, raw)

    def parse_block(self, raw):
        """Recognize block elements."""

        re.sub(self.rules.block_re, self._replace, raw)

    def parse(self):
        """Parse the text given as self.raw and return DOM tree."""

        self.parse_block(self.raw)
        return self.root


#################### Helper classes

### The document model

class DocNode:
    """
    A node in the document.

    Every node has a kind, a parent (None for the root), a content and a
    list of children. The parser sets additional attributes on some nodes:
    "level" on lists and headers, "args" on macros and "sect" on
    preformatted blocks.
    """

    def __init__(self, kind='', parent=None, content=None):
        """Create the node and append it to the children of its parent."""
        self.children = []
        self.parent = parent
        self.kind = kind
        self.content = content
        if self.parent is not None:
            self.parent.children.append(self)