1
2# Copyright 2016-2017 Jaap Karssenberg <jaap.karssenberg@gmail.com>
3
4# Tokens come in 3 variants
5#   tuple((tag, attrib))  e.g. (HEADING, {'level': 3})
6#   tuple((TEXT, string))   e.g. (TEXT, 'Some heading ...')
7#   tuple((END, tag))     e.g. (END, HEADING)
8#
9# Extra constraint is parsing must be per line, therefore a TEXT
10# item cannot contain newline other than at the end of the string
11
12
13from zim.parser import Builder
14from zim.formats import NUMBEREDLIST, BULLETLIST, LISTITEM, PARAGRAPH
15
# Marker used as first element of a text token: (TEXT, string)
TEXT = 'T'
# Marker used as first element of a closing token: (END, tag)
END = '/'
18
19
class EndOfTokenListError(AssertionError):
	'''Raised when a token stream runs out before the expected closing tag is found'''
22
23
def collect_untill_end_token(token_iter, end_token):
	'''Collect tokens up to the closing token for C{end_token}

	Nested start / end pairs of the same tag are collected as well; only
	the closing token at the outermost level stops the collection. That
	closing token is consumed from the iterator but not returned.

	@param token_iter: iterator yielding tokens, it is advanced in place
	@param end_token: the tag to look for
	@returns: list of tokens preceding the matching closing token
	@raises EndOfTokenListError: when the iterator is exhausted before
	the matching closing token is seen
	'''
	closing = (END, end_token)
	depth = 0
	collected = []
	for token in token_iter:
		if token[0] == end_token:
			# nested start of the same tag
			depth += 1
		elif token == closing:
			if depth == 0:
				return collected
			depth -= 1

		collected.append(token)

	raise EndOfTokenListError('Did not find "%s" closing tag' % end_token)
40
41
def tokens_to_text(token_iter):
	'''Return the concatenated content of all TEXT tokens

	Tokens of any other kind are ignored.

	@param token_iter: iterable of tokens
	@returns: a single string
	'''
	return ''.join(token[1] for token in token_iter if token[0] == TEXT)
48
49
class TokenBuilder(Builder):
	'''Builder that records the calls it receives as a flat list of tokens'''

	def __init__(self):
		self._tokens = []

	@staticmethod
	def _text_tokens(text):
		# A TEXT token may only contain a newline at its very end, so text
		# with newlines is cut into one token per line (line ends are kept)
		if '\n' in text:
			return [(TEXT, line) for line in text.splitlines(True)]
		return [(TEXT, text)]

	@property
	def tokens(self):
		# The recorded tokens, post-processed by topLevelLists()
		return topLevelLists(self._tokens)

	def start(self, tag, attrib=None):
		self._tokens.append((tag, attrib))

	def text(self, text):
		self._tokens.extend(self._text_tokens(text))

	def end(self, tag):
		self._tokens.append((END, tag))

	def append(self, tag, attrib=None, text=None):
		# Shorthand for start, optional text, end - empty text adds no
		# TEXT token at all
		content = self._text_tokens(text) if text else []
		self._tokens.extend([(tag, attrib)] + content + [(END, tag)])
90
91
class TokenParser(object):
	'''Replays a list of tokens as calls on a builder object'''

	def __init__(self, builder):
		self.builder = builder

	def parse(self, tokens):
		'''Feed C{tokens} to the builder

		The tokens are first passed through C{reverseTopLevelLists()},
		then each one is dispatched to C{text()}, C{end()} or C{start()}
		on the builder depending on its first element.
		'''
		builder = self.builder
		for token in reverseTopLevelLists(tokens):
			kind = token[0]
			if kind == END:
				builder.end(token[1])
			elif kind == TEXT:
				builder.text(token[1])
			else:
				# start token is (tag, attrib)
				builder.start(*token)
105
106
class TokenVisitor(object):
	'''Adaptor for the visit interface: holds a list of tokens and
	replays them into a builder on request
	'''

	def __init__(self, tokens):
		self.tokens = tokens

	def visit(self, builder):
		'''Replay the stored tokens into C{builder}

		@param builder: object receiving the C{start()}, C{text()} and
		C{end()} calls made by L{TokenParser}
		'''
		# The parser is what translates tokens into builder calls, so
		# parse() must be invoked on the parser, not on the builder
		parser = TokenParser(builder)
		parser.parse(self.tokens)
116
117
def skip_to_end_token(token_iter, end_token):
	'''Advance the iterator past the closing token for C{end_token}

	Nested start / end pairs of the same tag are skipped as well. When
	the iterator runs out before the closing token is found no error is
	raised, the count so far is returned.

	@param token_iter: iterator yielding tokens, it is advanced in place
	@param end_token: the tag to look for
	@returns: the number of newlines seen in the skipped TEXT tokens
	'''
	depth = 0
	newlines = 0
	for token in token_iter:
		if token[0] == end_token:
			# nested start of the same tag
			depth += 1
		elif token == (END, end_token):
			if depth == 0:
				break
			depth -= 1
		elif token[0] == TEXT:
			newlines += token[1].count('\n')

	return newlines
132
133
def topLevelLists(tokens):
	'''Return a new token list in which lists live outside paragraphs

	@param tokens: iterable of tokens, lists are expected inside paragraphs
	@returns: new list of tokens
	'''
	# Make tree more HTML-like:
	# - Move UL / OL to top level, outside P
	# - Put sub-UL / sub-OL inside LI element (done by _changeList())
	#
	# <p><ul>...</ul></p> --> <ul>...</ul>
	# <p><ul>...</ul>.. --> <ul>...</ul><p>..
	# ..<ul>...</ul>.. --> ..</p><ul>...</ul><p>..
	# ..<ul>...</ul></p> --> ..</p><ul>...</ul>
	#
	list_tags = (NUMBEREDLIST, BULLETLIST)
	stream = iter(tokens)
	result = []
	for token in stream:
		if token[0] not in list_tags:
			result.append(token)
			continue

		# Either drop the paragraph start directly in front of the list
		# or close the paragraph that is interrupted by it
		if result[-1][0] == PARAGRAPH:
			result.pop()
		else:
			result.append((END, PARAGRAPH))

		# Copy the list itself; the loop repeats for an edge case due to
		# messed up indenting: jumping back to lower level than start of
		# list will cause new list
		follower = token
		while follower[0] in list_tags:
			result.append(follower)
			result.extend(_changeList(stream))
			follower = next(stream)

		assert not (follower[0] == END and follower[1] in list_tags)

		# Unless the paragraph ends right here, re-open it for what follows
		if follower != (END, PARAGRAPH):
			result.append((PARAGRAPH, None))
			result.append(follower)

	return result
177
def _changeList(tokeniter):
	'''Collect the remainder of a list, moving sub-lists into list items

	Called after the start token of a list was consumed; reads tokens up
	to and including the first list end token at this level.

	@param tokeniter: iterator yielding tokens, it is advanced in place
	@returns: list of collected tokens
	@raises AssertionError: when a sub-list is not preceded by the end of
	a list item, or a level-skipping list is not properly closed
	'''
	# </li><ul>...</ul> --> <ul>...</ul></li>
	list_tags = (NUMBEREDLIST, BULLETLIST)
	result = []
	for token in tokeniter:
		if token[0] not in list_tags:
			result.append(token)
		elif result:
			# Sub-list: take back the end of the preceding list item and
			# put it after the sub-list instead
			item_end = result.pop()
			if item_end != (END, LISTITEM):
				raise AssertionError
			result.append(token)
			result.extend(_changeList(tokeniter)) # recurs
			result.append(item_end)
		else:
			# edge case, list skipped a level without LISTITEM -- remove
			# one nesting level by recursing while dropping start and end
			result.extend(_changeList(tokeniter)) # recurs
			if result.pop() != (END, token[0]):
				raise AssertionError

		if token[0] == END and token[1] in list_tags:
			break

	return result
203
204
def reverseTopLevelLists(tokens):
	'''Undo the effect of C{topLevelLists()}

	@param tokens: iterable of tokens with lists outside paragraphs
	@returns: new list of tokens with each run of lists wrapped in, or
	merged into, a paragraph
	@raises AssertionError: when a list end token directly follows a
	completed list
	'''
	# With "x" being any token that is not a paragraph boundary:
	#
	# x<ul>...</ul>x --> x<p><ul>...</ul></p>x
	# x<ul>...</ul><p>.. --> x<p><ul>...</ul>..
	# ..</p><ul>...</ul><p>.. --> ..<ul>...</ul>..
	# ..</p><ul>...</ul>x --> ..<ul>...</ul></p>x
	#
	list_tags = (NUMBEREDLIST, BULLETLIST)
	tokeniter = iter(tokens)
	newtokens = []
	for t in tokeniter:
		if t[0] not in list_tags:
			newtokens.append(t)
			continue

		# Merge with the paragraph that ended just before the list, or
		# else open a new paragraph to hold it
		if newtokens and newtokens[-1] == (END, PARAGRAPH):
			newtokens.pop()
		else:
			newtokens.append((PARAGRAPH, None))

		# topLevelLists() can emit several lists in a row (edge case due
		# to messed up indenting), so accept consecutive lists here too
		# and keep them in the same paragraph
		nexttoken = t
		while nexttoken[0] in list_tags:
			newtokens.append(nexttoken)
			newtokens.extend(_reverseChangeList(tokeniter))
			nexttoken = next(tokeniter)

		if nexttoken[0] == END and nexttoken[1] in list_tags:
			raise AssertionError('Unexpected list end: %r' % (nexttoken,))

		# A paragraph starting right after the list is merged by dropping
		# its start token, otherwise the paragraph is closed here
		if nexttoken[0] != PARAGRAPH:
			newtokens.append((END, PARAGRAPH))
			newtokens.append(nexttoken)

	return newtokens
243
244
def _reverseChangeList(tokeniter):
	'''Collect the remainder of a list, moving sub-lists out of list items

	Called after the start token of a list was consumed; reads tokens up
	to and including the first list end token at this level.

	@param tokeniter: iterator yielding tokens, it is advanced in place
	@returns: list of collected tokens
	@raises AssertionError: when a sub-list is not directly followed by
	the end of a list item
	'''
	# <ul>...</ul></li> --> </li><ul>...</ul>
	list_tags = (NUMBEREDLIST, BULLETLIST)
	result = []
	for token in tokeniter:
		if token[0] in list_tags:
			sublist = _reverseChangeList(tokeniter) # recurs
			item_end = next(tokeniter)
			if item_end != (END, LISTITEM):
				raise AssertionError
			# Close the list item first, then emit the sub-list after it
			result += [item_end, token]
			result += sublist
		else:
			result.append(token)

		if token[0] == END and token[1] in list_tags:
			break

	return result
264
265
def testTokenStream(token_iter):
	'''Check a token stream for consistency

	Verifies that each token is a 2-tuple, that start and end tokens are
	properly nested, that a TEXT token has a newline only at its end,
	that attributes are C{None} or a dict, that lists do not appear
	inside paragraphs and that paragraphs have exactly one open parent.

	@param token_iter: iterable of tokens
	@raises AssertionError: on the first violation found
	'''
	nesting = []
	for t in token_iter:
		assert isinstance(t, tuple) and len(t) == 2, 'Malformed token'
		if t[0] == END:
			# Check for an open tag first, otherwise a stray end token
			# would surface as an IndexError instead of an assertion
			assert nesting, 'Got /%s, but no tag is open' % (t[1],)
			assert nesting[-1] == t[1], 'Got /%s, expected /%s' % (t[1], nesting[-1])
			nesting.pop()
		elif t[0] == TEXT:
			assert isinstance(t[1], str), 'Wrong type for text'
			assert '\n' not in t[1][:-1], 'Text token should not cross line break: %r' % (t,)
		else:
			assert t[1] is None or isinstance(t[1], dict), 'Wrong type for attributes'

			if t[0] in (BULLETLIST, NUMBEREDLIST):
				assert PARAGRAPH not in nesting, 'Lists should not appear inside paragraphs'
			elif t[0] == PARAGRAPH:
				assert len(nesting) == 1, 'Paragraphs should only appear in top level - got %r' % nesting
			# TODO more semantic rules

			nesting.append(t[0])

	assert len(nesting) == 0, 'Open tags: %r' % nesting
288