from __future__ import absolute_import, division, unicode_literals

from . import _base


class Filter(_base.Filter):
    """Token-stream filter that drops start/end tags HTML lets you omit.

    Iterating the filter walks ``self.source`` with one token of look-behind
    and one token of look-ahead, and suppresses every ``StartTag`` /
    ``EndTag`` token for which ``is_optional_start`` / ``is_optional_end``
    returns true.  All other tokens are passed through unchanged.
    """

    # Elements whose start tag, when it immediately follows a <p>, makes the
    # </p> end tag optional.
    _P_END_FOLLOWERS = frozenset((
        'address', 'article', 'aside', 'blockquote', 'datagrid', 'dialog',
        'dir', 'div', 'dl', 'fieldset', 'footer', 'form',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'header', 'hr', 'menu', 'nav', 'ol',
        'p', 'pre', 'section', 'table', 'ul',
    ))

    def slider(self):
        """Yield ``(previous, current, next)`` triples over ``self.source``.

        ``previous`` is ``None`` for the first token and ``next`` is ``None``
        for the last one.  Nothing is yielded when the source is empty.
        """
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        # Only flush the trailing token if there was one; an empty source
        # must not produce a (None, None, None) triple, which would make
        # __iter__ fail on token["type"].
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        """Yield the tokens of ``self.source`` minus omissible tags."""
        for previous, token, following in self.slider():
            token_type = token["type"]
            if token_type == "StartTag":
                # A start tag carrying attributes (truthy "data") is
                # always kept.
                if (token["data"] or
                        not self.is_optional_start(token["name"],
                                                   previous, following)):
                    yield token
            elif token_type == "EndTag":
                if not self.is_optional_end(token["name"], following):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        """Return True if the start tag of ``tagname`` may be omitted.

        ``previous`` and ``next`` are the tokens immediately before and after
        the start tag in the stream, or ``None`` at the stream boundaries.
        """
        next_type = next["type"] if next else None
        # NOTE: this must be an equality test.  ``tagname in 'html'`` is a
        # substring check and would also match 'h', 'm', 'ml', '' ...
        if tagname == 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            # XXX: we also omit the start tag if the head element is empty
            if next_type in ("StartTag", "EmptyTag"):
                return True
            elif next_type == "EndTag":
                return next["name"] == "head"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if next_type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we
                # never omit the colgroup element's end tag when it is
                # immediately followed by another colgroup element.
                # See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if next_type == "StartTag":
                # is_optional_end may drop the end tag of a thead, tbody or
                # tfoot that is directly followed by a tbody, so the start
                # tag is kept whenever the preceding token is such an end tag.
                if previous and previous['type'] == 'EndTag' and \
                        previous['name'] in ('tbody', 'thead', 'tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        """Return True if the end tag of ``tagname`` may be omitted.

        ``next`` is the token immediately following the end tag, or ``None``
        when the end tag is the last token of the stream.
        """
        next_type = next["type"] if next else None
        # "There is no more content in the parent element": the next token
        # closes the parent, or the stream is exhausted.
        at_parent_end = next_type == "EndTag" or next_type is None

        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            # The same test is applied to head and body here.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] == tagname
            else:
                return at_parent_end
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return at_parent_end
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, article, aside,
            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
            # nav, ol, p, pre, section, table, or ul, element, or if
            # there is no more content in the parent element.
            if next_type in ("StartTag", "EmptyTag"):
                return next["name"] in self._P_END_FOLLOWERS
            else:
                return at_parent_end
        elif tagname == 'option':
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if it is immediately followed by an optgroup element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('option', 'optgroup')
            else:
                return at_parent_end
        elif tagname in ('rt', 'rp'):
            # An rt element's end tag may be omitted if the rt element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            # An rp element's end tag may be omitted if the rp element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('rt', 'rp')
            else:
                return at_parent_end
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # When the following element is a tbody, is_optional_start keeps
            # that tbody's start tag so the boundary stays unambiguous.
            if next_type == "StartTag":
                return next["name"] in ('tbody', 'tfoot')
            elif tagname == 'tbody':
                return at_parent_end
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # When the following element is a tbody, is_optional_start keeps
            # that tbody's start tag so the boundary stays unambiguous.
            if next_type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return at_parent_end
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return at_parent_end
        return False