# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'HTMLParserTreeBuilder',
]

from html.parser import HTMLParser

try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder.
    class HTMLParseError(Exception):
        pass

import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4


from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'


class BeautifulSoupHTMLParser(HTMLParser):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.

    The object that receives those calls is read from ``self.soup``.
    The constructor does not set that attribute;
    `HTMLParserTreeBuilder.feed` assigns it before feeding any markup.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.

        All other positional and keyword arguments are passed through
        to the HTMLParser constructor.
        """
        # Pop our own option first; HTMLParser.__init__ would reject it.
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.

        :param msg: The parser's description of the problem; it is
            reported as a warning instead of being raised.
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as a sequence of
            (name, value) pairs.
        """
        # handle_empty_element=False tells handle_starttag not to close
        # the tag just because its name matches a known empty-element
        # tag. We know that this is an empty-element tag and we want to
        # call handle_endtag ourselves.
        self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as a sequence of
            (name, value) pairs. A value of None is converted to the
            empty string.
        :param handle_empty_element: If True (the default) and the
            newly created tag reports itself as an empty-element tag,
            this method also fires the matching end-tag event and
            remembers the tag name so that a later explicit closing
            tag is ignored. handle_startendtag() passes False because
            it fires the end-tag event itself.
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    # Keep the earliest value.
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    # Anything else is treated as a user-supplied callable.
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value

        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: If True (the default), first check
            whether this name is recorded as an empty-element tag that
            was already closed implicitly; if so, the event is
            swallowed and one record is removed. If False, the end-tag
            event is always forwarded to the soup object.
        """
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal (in
            which case it starts with 'x' or 'X').
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            #
            # Note that the loop does not stop at the first success:
            # when both encodings can decode the byte, the
            # windows-1252 result is the one that is kept.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError):
                # Not a valid code point; fall through to the
                # replacement character below.
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        # Flush any pending text, then emit the comment text as a
        # Comment object.
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration. The first
            len("DOCTYPE ") characters are dropped unconditionally.
        """
        self.soup.endData()
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.

        Neither `parser_args` nor `parser_kwargs` is modified; the
        builder works on its own copies.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)

        # Copy the caller's containers before adding our own entries,
        # so that an object passed in (and possibly reused for another
        # builder) is never mutated behind the caller's back.
        parser_args = list(parser_args or [])
        parser_kwargs = dict(parser_kwargs or {})
        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            # We convert character references ourselves, in
            # handle_charref() and handle_entityref().
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):

        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]

        dammit = UnicodeDammit(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=True,
            exclude_encodings=exclude_encodings
        )
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        :param markup: The markup to parse.
        :raise HTMLParseError: Re-raised, after a RuntimeWarning has
            been issued, if the underlying parser gives up.
        """
        args, kwargs = self.parser_args
        # A fresh parser is created for every call.
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise keeps the original traceback intact.
            raise
        parser.already_closed_empty_element = []

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    # This branch only runs on a 3.2 interpreter for which
    # CONSTRUCTOR_TAKES_STRICT is false. It installs replacement regular
    # expressions and replacement parse_starttag()/set_cdata_mode()
    # methods on BeautifulSoupHTMLParser.
    import re

    # Tolerant attribute matcher: group 1 is the attribute name, group 2
    # the optional "= value" part, group 3 the (possibly quoted) value.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this is stored on the tree builder class, not on the
    # parser class; parse_starttag() below reads the module-level name,
    # so the class attribute does not appear to be used here -- confirm.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag that begins at index `i` of self.rawdata.

        Installed on BeautifulSoupHTMLParser as a replacement method.

        :param i: Index into self.rawdata at which the tag starts.
        :return: The value of check_for_whole_start_tag(i): returned
            as-is when negative, otherwise the index just past the tag
            after the appropriate handler has been called.
        """
        # NOTE(review): this function is defined outside any class body,
        # so `self.__starttag_text` is presumably NOT name-mangled the
        # way it would be inside HTMLParser itself -- confirm whether
        # anything reads the mangled attribute.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        # Consume attributes one at a time until none matches or the end
        # of the tag is reached.
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "= value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip matching surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the closing bracket is left over.
            # lineno/offset are computed here but not used afterwards
            # in this function.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Pass the whole would-be tag through as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Record `elem` (lowercased) as the current CDATA element and set
        self.interesting to a case-insensitive pattern matching that
        element's closing tag.

        :param elem: Name of the element.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patch installed, flag the constructor as accepting
    # 'strict'.
    CONSTRUCTOR_TAKES_STRICT = True