1from __future__ import absolute_import, division, unicode_literals
2
3from . import _base
4
5
class Filter(_base.Filter):
    """Token-stream filter that drops start and end tags HTML allows omitting.

    Iterating the filter walks ``self.source`` and re-yields every token
    except the "StartTag" and "EndTag" tokens that ``is_optional_start`` and
    ``is_optional_end`` report as omissible in their context.  Tokens are
    read as mappings with a "type" key and, for tags, "name" and "data" keys.
    """

    def slider(self):
        """Yield ``(previous, current, next)`` triples over ``self.source``.

        ``previous`` is None for the first token and ``next`` is None for the
        last one.  Nothing is yielded when the source is empty.
        """
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        # Flush the final token.  With an empty source there is no current
        # token at all, so yielding here would hand ``__iter__`` a None token.
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        """Yield the source tokens, skipping omissible start and end tags."""
        for previous, token, following in self.slider():
            token_type = token["type"]
            if token_type == "StartTag":
                # A start tag with truthy "data" (presumably its attributes)
                # is always kept; otherwise it is kept only when required.
                if (token["data"] or
                        not self.is_optional_start(token["name"], previous,
                                                   following)):
                    yield token
            elif token_type == "EndTag":
                if not self.is_optional_end(token["name"], following):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        """Return True if the start tag of ``tagname`` may be omitted.

        ``previous`` and ``next`` are the tokens surrounding the start tag in
        the source stream; either may be None at the ends of the stream.
        """
        next_type = next["type"] if next else None
        # Equality, not substring membership: only the html element itself
        # qualifies, not any tag whose name happens to be a slice of "html".
        if tagname == 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            # XXX: we also omit the start tag if the head element is empty
            if next_type in ("StartTag", "EmptyTag"):
                return True
            elif next_type == "EndTag":
                return next["name"] == "head"
            return False
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if next_type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if next_type == "StartTag":
                # is_optional_end may omit the end tag of a tbody, thead or
                # tfoot that is followed by a tbody, so this start tag is
                # kept whenever the previous token is one of those end tags.
                if previous and previous['type'] == 'EndTag' and \
                        previous['name'] in ('tbody', 'thead', 'tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        """Return True if the end tag of ``tagname`` may be omitted.

        ``next`` is the token following the end tag in the source stream, or
        None when the end tag is the last token.
        """
        next_type = next["type"] if next else None
        # "No more content in the parent element" is detected below as the
        # next token being an end tag or the stream having ended.
        parent_ends = next_type == "EndTag" or next_type is None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            # The same test is applied here to head and body.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] == tagname
            else:
                return parent_ends
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return parent_ends
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, article, aside,
            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
            # nav, ol, p, pre, section, table, or ul, element, or if
            # there is no more content in the parent element.
            if next_type in ("StartTag", "EmptyTag"):
                return next["name"] in ('address', 'article', 'aside',
                                        'blockquote', 'datagrid', 'dialog',
                                        'dir', 'div', 'dl', 'fieldset', 'footer',
                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                        'header', 'hr', 'menu', 'nav', 'ol',
                                        'p', 'pre', 'section', 'table', 'ul')
            else:
                return parent_ends
        elif tagname == 'option':
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if it is immediately followed by an <code>optgroup</code>
            # element, or if there is no more content in the parent
            # element.
            if next_type == "StartTag":
                return next["name"] in ('option', 'optgroup')
            else:
                return parent_ends
        elif tagname in ('rt', 'rp'):
            # An rt element's end tag may be omitted if the rt element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            # An rp element's end tag may be omitted if the rp element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('rt', 'rp')
            else:
                return parent_ends
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # When the following element is a tbody, is_optional_start keeps
            # that tbody's start tag because it sees this end tag as the
            # previous token.
            if next_type == "StartTag":
                return next["name"] in ['tbody', 'tfoot']
            elif tagname == 'tbody':
                return parent_ends
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # When the following element is a tbody, is_optional_start keeps
            # that tbody's start tag because it sees this end tag as the
            # previous token.
            if next_type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return parent_ends
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return parent_ends
        return False
206