"""Shared support for scanning document type declarations in HTML and XHTML.

Backported for python-future from Python 3.3. Reason: ParserBase is an
old-style class in the Python 2.7 source of markupbase.py, which I suspect
might be the cause of sporadic unit-test failures on travis-ci.org with
test_htmlparser.py.  The test failures look like this:

    ======================================================================

ERROR: test_attr_entity_replacement (future.tests.test_htmlparser.AttributesStrictTestCase)

----------------------------------------------------------------------

Traceback (most recent call last):
  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 661, in test_attr_entity_replacement
    [("starttag", "a", [("b", "&><\"'")])])
  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 93, in _run_check
    collector = self.get_collector()
  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 617, in get_collector
    return EventCollector(strict=True)
  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 27, in __init__
    html.parser.HTMLParser.__init__(self, *args, **kw)
  File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 135, in __init__
    self.reset()
  File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 143, in reset
    _markupbase.ParserBase.reset(self)

TypeError: unbound method reset() must be called with ParserBase instance as first argument (got EventCollector instance instead)

This module is used as a foundation for the html.parser module.  It has no
documented public API and should not be used directly.

"""

import re

_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re

# Keyword tables, built once at import time rather than on every call.
# frozenset([...]) is used instead of a set literal so the module stays
# importable on every interpreter the backport targets.

# Declaration types for which a '[' group is recognised but not supported.
_UNSUPPORTED_BRACKET_DECLS = frozenset(
    ["attlist", "linktype", "link", "element"])
# Marked-section status keywords closed by "]]>".
_STANDARD_SECTION_KEYWORDS = frozenset(
    ["temp", "cdata", "ignore", "include", "rcdata"])
# MS Office marked-section keywords closed by "]>".
_MS_SECTION_KEYWORDS = frozenset(["if", "else", "endif"])
# Declarations accepted inside a DOCTYPE internal subset; each one is
# dispatched to the method named "_parse_doctype_<name>".
_INTERNAL_SUBSET_DECLS = frozenset(
    ["attlist", "element", "entity", "notation"])


class ParserBase(object):
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods read the input buffer from ``self.rawdata`` and report
    results through ``self.handle_decl``, ``self.handle_comment`` and
    ``self.unknown_decl``.  Only ``unknown_decl`` is defined here; the
    other attributes must be supplied by the subclass.

    Scanning methods return an index into ``rawdata`` on success and a
    negative value (-1) when the buffer ends before the construct is
    complete.  Malformed input is reported through ``self.error()``; the
    code after most ``error()`` calls assumes that it raises.
    """

    def __init__(self):
        """Refuse direct instantiation; this class must be subclassed."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this method."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the tracked position to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters tolerated between tokens of a declaration; reset
    # to '' by parse_declaration() whenever a DOCTYPE is seen.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." declaration starting at rawdata[i].

        A DOCTYPE declaration is passed to ``self.handle_decl`` and any
        other named declaration to ``self.unknown_decl``; in both cases
        the text between "<!" and ">" is the argument.

        Returns the index just past the closing ">", or -1 if the
        declaration is incomplete.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        # NOTE(review): the '--' branch below cannot be reached, because
        # any declaration whose first character is '-' has already
        # returned -1 above.  Kept as-is to preserve existing behaviour.
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in _UNSUPPORTED_BRACKET_DECLS:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section starting with "<![" at rawdata[i].

        When ``report`` is true, the text between "<![" and the closing
        delimiter is passed to ``self.unknown_decl``.

        Returns the index just past the closing delimiter, or -1 if the
        section is not terminated within the buffer.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in _STANDARD_SECTION_KEYWORDS:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in _MS_SECTION_KEYWORDS:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            # error() is expected to raise; if it returned, "match"
            # would be unbound below.
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment
    def parse_comment(self, i, report=1):
        """Parse the comment starting with "<!--" at rawdata[i].

        When ``report`` is true, the text between "<!--" and the closing
        "--" is passed to ``self.handle_comment``.

        Returns the index just past the closing ">", or -1 if the
        comment is not terminated within the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset beginning at rawdata[i].

        ``declstartpos`` is the start of the enclosing declaration and is
        only used for position/error reporting.

        Returns the index of the ">" that follows the closing "]" (and
        any whitespace after it), or -1 if the buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                if (j + 4) > n:
                    # end of buffer; incomplete (fewer than four
                    # characters left, so "<!--" cannot be ruled out)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in _INTERNAL_SUBSET_DECLS:
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the next ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        # (a single find() instead of copying the buffer tail to test
        # for '>' and then searching again)
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            # end of buffer inside the element name
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if not rawdata[j:j+1]:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    # end of buffer inside the attribute type
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if not rawdata[j+1:j+2]:
                    # end of buffer right after '#'
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading "%" followed by optional whitespace is skipped before
        the entity name.  Returns the index just past the closing ">",
        or -1 if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1    # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token
    def _scan_name(self, i, declstartpos):
        """Scan the name token (plus trailing whitespace) at rawdata[i].

        Returns ``(name, end)`` where ``name`` is the lower-cased token
        and ``end`` is the index just past the token and its trailing
        whitespace.  Returns ``(None, -1)`` when ``i`` is at the end of
        the buffer or the token runs up to the end of the buffer, since
        more input could still extend it.

        If no name starts at ``i`` the position is updated and
        ``self.error()`` is called; error() is expected to raise,
        otherwise this method falls through and returns None.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle an unrecognised declaration; does nothing by default."""
        pass