# Copyright 2016-2017 Jaap Karssenberg <jaap.karssenberg@gmail.com>

# Tokens come in 3 variants
#   tuple((tag, attrib))   e.g. (HEADING, {'level': 3})
#   tuple((TEXT, string))  e.g. (TEXT, 'Some heading ...')
#   tuple((END, tag))      e.g. (END, HEADING)
#
# Extra constraint is parsing must be per line, therefore a TEXT
# item cannot contain newline other than at the end of the string


from zim.parser import Builder
from zim.formats import NUMBEREDLIST, BULLETLIST, LISTITEM, PARAGRAPH

TEXT = 'T'
END = '/'


class EndOfTokenListError(AssertionError):
    """Raised when a token iterator is exhausted before the expected
    closing tag was seen.
    """


def collect_untill_end_token(token_iter, end_token):
    """Consume and return the tokens up to the closing tag of C{end_token}.

    Nested open / close pairs of the same tag are tracked, so the loop
    stops at the first unbalanced C{(END, end_token)}. That closing
    token is consumed from C{token_iter} but is not part of the result.

    @param token_iter: an iterator of tokens, positioned after the
    opening token
    @param end_token: the tag whose closing token ends the collection
    @returns: list of tokens seen before the closing token
    @raises EndOfTokenListError: if the iterator runs out first
    """
    nesting = 0
    tokens = []
    for t in token_iter:
        if t[0] == end_token:
            nesting += 1
        elif t == (END, end_token):
            nesting -= 1
            if nesting < 0:
                # This is the close tag that matches our own open tag
                break

        tokens.append(t)
    else:
        raise EndOfTokenListError('Did not find "%s" closing tag' % end_token)

    return tokens


def tokens_to_text(token_iter):
    """Return the concatenated content of all TEXT tokens in C{token_iter},
    all other tokens are ignored.
    """
    return ''.join(t[1] for t in token_iter if t[0] == TEXT)


class TokenBuilder(Builder):
    """Builder that records all calls as a flat list of tokens"""

    def __init__(self):
        self._tokens = []

    @property
    def tokens(self):
        """The recorded tokens after processing by L{topLevelLists()}"""
        return topLevelLists(self._tokens)

    @staticmethod
    def _split_lines(text):
        # Yield one string per TEXT token: text is only split when it
        # contains a newline, otherwise it is passed on as-is
        if '\n' in text:
            return text.splitlines(True)
        else:
            return [text]

    def start(self, tag, attrib=None):
        """Record an opening token for C{tag}"""
        self._tokens.append((tag, attrib))

    def text(self, text):
        """Record C{text} as one TEXT token per line"""
        self._tokens.extend((TEXT, line) for line in self._split_lines(text))

    def end(self, tag):
        """Record a closing token for C{tag}"""
        self._tokens.append((END, tag))

    def append(self, tag, attrib=None, text=None):
        """Record a complete element: opening token, optional TEXT
        tokens (one per line) and closing token.
        """
        self._tokens.append((tag, attrib))
        if text:
            self._tokens.extend(
                (TEXT, line) for line in self._split_lines(text))
        self._tokens.append((END, tag))


class TokenParser(object):
    """Replays a token list as calls on a builder object"""

    def __init__(self, builder):
        self.builder = builder

    def parse(self, tokens):
        """Feed C{tokens} to the builder after processing them with
        L{reverseTopLevelLists()}.
        """
        for t in reverseTopLevelLists(tokens):
            if t[0] == END:
                self.builder.end(t[1])
            elif t[0] == TEXT:
                self.builder.text(t[1])
            else:
                self.builder.start(*t)


class TokenVisitor(object):
    # Adaptor for the visit interface

    def __init__(self, tokens):
        self.tokens = tokens

    def visit(self, builder):
        """Replay the wrapped tokens on C{builder}"""
        # The parser drives the builder; the builder interface used in
        # this module is start / text / end / append and has no parse()
        TokenParser(builder).parse(self.tokens)


def skip_to_end_token(token_iter, end_token):
    """Consume tokens up to and including the closing tag of C{end_token}.

    Nested open / close pairs of the same tag are tracked. Unlike
    L{collect_untill_end_token()} no error is raised when the iterator
    is exhausted before the closing tag is found.

    @param token_iter: an iterator of tokens, positioned after the
    opening token
    @param end_token: the tag whose closing token ends the skipping
    @returns: the number of newlines in the TEXT tokens that were skipped
    """
    eol = 0
    nesting = 0
    for t in token_iter:
        if t[0] == end_token:
            nesting += 1
        elif t == (END, end_token):
            nesting -= 1
            if nesting < 0:
                break
        elif t[0] == TEXT:
            eol += t[1].count('\n')

    return eol


def topLevelLists(tokens):
    # Make tree more HTML-like:
    # - Move UL / OL to top level, outside P
    # - Put sub-UL / sub-OL inside LI element
    # - Make indent blocks their own para
    #
    # <p><ul>...</ul></p>   -->  <ul>...</ul>
    # <p><ul>...</ul>..     -->  <ul>...</ul><p>..
    # ..<ul>...</ul>..      -->  ..</p><ul>...</ul><p>..
    # ..<ul>...</ul></p>    -->  ..</p><ul>...</ul>
    #
    # NOTE: a list token is expected to be preceded by at least one
    # other token, "newtokens[-1]" fails on a stream that starts with
    # a list

    tokeniter = iter(tokens)
    newtokens = []
    for t in tokeniter:
        if t[0] in (NUMBEREDLIST, BULLETLIST):
            if newtokens[-1][0] == PARAGRAPH:
                # List is first item in the paragraph: drop the <p>
                newtokens.pop()
            else:
                # Close the running paragraph before the list
                newtokens.append((END, PARAGRAPH))

            newtokens.append(t)
            newtokens.extend(_changeList(tokeniter))

            nexttoken = next(tokeniter)
            while nexttoken[0] in (BULLETLIST, NUMBEREDLIST):
                # edge case due to messed up indenting: jumping back to
                # lower level than start of list will cause new list
                newtokens.append(nexttoken)
                newtokens.extend(_changeList(tokeniter))
                nexttoken = next(tokeniter)

            assert not (nexttoken[0] == END and nexttoken[1] in (BULLETLIST, NUMBEREDLIST))

            if nexttoken != (END, PARAGRAPH):
                # More content follows: re-open the paragraph for it.
                # Otherwise the </p> is dropped because the paragraph
                # was already closed before the list.
                newtokens.append((PARAGRAPH, None))
                newtokens.append(nexttoken)
        else:
            newtokens.append(t)

    return newtokens


def _changeList(tokeniter):
    # </li><ul>...</ul> --> <ul>...</ul></li>
    #
    # Consumes tokens up to and including the end token of the current
    # list and returns them with sub-lists moved inside the preceding
    # list item.
    newtokens = []
    for t in tokeniter:
        if t[0] in (NUMBEREDLIST, BULLETLIST):
            if newtokens:
                listend = newtokens.pop()
                if not listend == (END, LISTITEM):
                    raise AssertionError
                newtokens.append(t)
                newtokens.extend(_changeList(tokeniter))  # recurs
                newtokens.append(listend)
            else:
                # edge case, list skipped a level without LISTITEM -- remove
                # one nesting level by recursing while dropping start and end
                newtokens.extend(_changeList(tokeniter))  # recurs
                if not newtokens.pop() == (END, t[0]):
                    raise AssertionError
        else:
            newtokens.append(t)

        if t[0] == END and t[1] in (NUMBEREDLIST, BULLETLIST):
            break

    return newtokens


def reverseTopLevelLists(tokens):
    # Undo effect of topLevelLists()
    #
    # <br><ul>...</ul><br>      -->  <p><ul>...</ul></p>
    # <br><ul>...</ul><p>..     -->  <p><ul>...</ul>..
    # ..</p><ul>...</ul><p>..   -->  ..<ul>...</ul>..
    # ..</p><ul>...</ul><br>    -->  ..<ul>...</ul></p>
    #

    def isbr(token):
        # Whitespace-only TEXT token that contains a newline.
        # Not used by the loop below.
        return token[0] == TEXT and token[1].isspace() and '\n' in token[1]

    tokeniter = iter(tokens)
    newtokens = []
    for t in tokeniter:
        if t[0] in (NUMBEREDLIST, BULLETLIST):
            if newtokens and newtokens[-1] == (END, PARAGRAPH):
                # Merge list into the paragraph that precedes it
                newtokens.pop()
            else:
                newtokens.append((PARAGRAPH, None))

            newtokens.append(t)
            newtokens.extend(_reverseChangeList(tokeniter))

            nexttoken = next(tokeniter)
            if nexttoken[0] in (BULLETLIST, NUMBEREDLIST) \
            or nexttoken[0] == END and nexttoken[1] in (BULLETLIST, NUMBEREDLIST):
                raise AssertionError

            if nexttoken[0] != PARAGRAPH:
                # No paragraph follows to merge with: close ours here.
                # Otherwise the <p> is dropped and its content
                # continues the paragraph that holds the list.
                newtokens.append((END, PARAGRAPH))
                newtokens.append(nexttoken)
        else:
            newtokens.append(t)

    return newtokens


def _reverseChangeList(tokeniter):
    # <ul>...</ul></li> --> </li><ul>...</ul>
    #
    # Consumes tokens up to and including the end token of the current
    # list and returns them with sub-lists moved behind the end of the
    # list item that contained them.
    newtokens = []
    for t in tokeniter:
        if t[0] in (NUMBEREDLIST, BULLETLIST):
            listtokens = _reverseChangeList(tokeniter)  # recurs
            liend = next(tokeniter)
            if not liend == (END, LISTITEM):
                raise AssertionError
            newtokens.append(liend)
            newtokens.append(t)
            newtokens.extend(listtokens)
        else:
            newtokens.append(t)

        if t[0] == END and t[1] in (NUMBEREDLIST, BULLETLIST):
            break

    return newtokens


def testTokenStream(token_iter):
    """Check C{token_iter} for well-formedness using assertions.

    Checks that each token is a 2-tuple, that tags are closed in the
    order they were opened, that TEXT tokens are strings without a
    newline before the last character, that attributes are C{None} or
    a dict, that lists are not nested in a paragraph and that
    paragraphs have exactly one open parent tag.

    @raises AssertionError: on the first violation found
    """
    nesting = []
    for t in token_iter:
        assert isinstance(t, tuple) and len(t) == 2, 'Malformed token'
        if t[0] == END:
            # Guard first, else "nesting[-1]" gives IndexError instead
            # of a proper assertion failure for a stray end token
            assert nesting, 'Got /%s, but no tag is open' % t[1]
            assert nesting[-1] == t[1], 'Got /%s, expected /%s' % (t[1], nesting[-1])
            nesting.pop()
        elif t[0] == TEXT:
            assert isinstance(t[1], str), 'Wrong type for text'
            assert not '\n' in t[1][:-1], 'Text token should not cross line break: %r' % (t,)
        else:
            assert t[1] is None or isinstance(t[1], dict), 'Wrong type for attributes'

            if t[0] in (BULLETLIST, NUMBEREDLIST):
                assert PARAGRAPH not in nesting, 'Lists should not appear inside paragraphs'
            elif t[0] == PARAGRAPH:
                assert len(nesting) == 1, 'Paragraphs should only appear in top level - got %r' % nesting
            # TODO more semantic rules

            nesting.append(t[0])

    assert len(nesting) == 0, 'Open tags: %r' % nesting