#!/usr/bin/env python
# vim:fileencoding=utf-8
# Copyright: 2017, Kovid Goyal

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import re
import string

from ._entities import html5_entities
from .polyglot import codepoint_to_chr

# Single-byte tokens used by the scanner below. Everything is kept as
# length-1 ``bytes`` objects so the same comparisons work on Python 2 and 3.
space_chars = frozenset(("\t", "\n", "\u000C", " ", "\r"))
space_chars_bytes = frozenset(item.encode("ascii") for item in space_chars)
ascii_letters_bytes = frozenset(
    item.encode("ascii") for item in string.ascii_letters)
spaces_angle_brackets = space_chars_bytes | frozenset((b">", b"<"))
skip1 = space_chars_bytes | frozenset((b"/", ))
head_elems = frozenset((
    b"html", b"head", b"title", b"base", b"script",
    b"style", b"meta", b"link", b"object"))


def my_unichr(num):
    """Return the character for code point ``num``, or ``'?'`` when
    ``codepoint_to_chr`` rejects it with ValueError/OverflowError."""
    try:
        return codepoint_to_chr(num)
    except (ValueError, OverflowError):
        return '?'


def replace_entity(match):
    """Return the replacement text for one matched character reference.

    ``match`` is a regex match object whose group 1 holds the reference
    text; presumably that is the part between ``&`` and ``;``, since that is
    how unresolved references are re-assembled below (verify against the
    caller's pattern). The text is lower-cased before lookup.

    * ``apos``/``squot`` become ``'`` and ``hellips`` is treated as
      ``hellip``.
    * ``#NNN`` / ``#xHHH`` are numeric references. Values above 255 are
      mapped straight to that code point; values up to 255 are decoded as a
      single cp1252 byte, falling back to the raw code point for bytes that
      cp1252 leaves undefined.
    * Anything else is looked up in ``html5_entities``.

    References that cannot be resolved are returned as ``&<ent>;`` using the
    lower-cased text.
    """
    ent = match.group(1).lower()
    if ent in {'apos', 'squot'}:
        # squot is generated by some broken CMS software
        return "'"
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (ValueError, IndexError):
            # Not a number, or a bare '#': leave the reference untouched
            return '&' + ent + ';'
        if num > 255:
            return my_unichr(num)
        try:
            # chr(num).decode() only exists on Python 2; build a real byte
            # string so the cp1252 lookup works on Python 3 as well.
            return bytes(bytearray((num,))).decode('cp1252')
        except (ValueError, UnicodeDecodeError):
            # Negative number, or a byte cp1252 does not define
            return my_unichr(num)
    try:
        return html5_entities[ent]
    except KeyError:
        pass
    return '&' + ent + ';'


class Bytes(bytes):
    """String-like object with an associated position and various extra
    methods. If the position is ever greater than the string length then an
    exception is raised.

    The position starts at -1 (before the first byte) and is advanced by
    iterating. While it is still -1 the ``position`` property yields None.
    """

    def __init__(self, value):
        # The byte content itself is set by bytes.__new__
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        """Advance by one and return the byte there as a length-1 bytes
        object. Raises StopIteration at the end of the data."""
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        """Step back by one and return the byte at the new position.

        Raises StopIteration if the position is already at/after the end and
        TypeError if iteration has not started."""
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    @property
    def position(self):
        """Current index. Raises StopIteration once the end has been reached
        and evaluates to None while iteration has not started."""
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position

    @position.setter
    def position(self, position):
        # The check is on the *current* position, not the new value
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    @property
    def current_byte(self):
        """The byte at the current position as a length-1 bytes object."""
        p = self.position
        return self[p:p + 1]

    def skip(self, chars=space_chars_bytes):
        """Skip past a list of characters.

        Returns the first byte not in ``chars`` and leaves the position on
        it, or returns None with the position at the end of the data."""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return

    def skip_until(self, chars):
        """Advance to the first byte that is in ``chars``.

        Returns ``(skipped, c)`` where ``skipped`` is the data between the
        starting position and the byte ``c`` that stopped the scan. If no
        such byte exists the position moves to the end of the data and
        ``(b'', b'')`` is returned."""
        p = pos = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return self[pos:p], c
            p += 1
        self._position = p
        return b'', b''

    def match_bytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the
        bytes are found return True and advance the position to the byte
        after the match. Otherwise return False and leave the position
        alone"""
        p = self.position
        rv = self.startswith(bytes, p)
        if rv:
            self.position = p + len(bytes)
        return rv

    def match_bytes_pat(self, pat):
        """Like match_bytes() but ``pat`` is a compiled bytes regex that is
        matched at the current position. On success the position moves to
        the byte after the matched text."""
        p = self.position
        m = pat.match(self, p)
        if m is None:
            return False
        self.position = p + len(m.group())
        return True

    def jump_to(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match
        and return True, otherwise raise StopIteration."""
        start = self.position  # raises StopIteration once the end is reached
        if start is None:
            # Iteration has not started yet, search from the very beginning
            start = 0
        found = self.find(bytes, start)
        if found < 0:
            raise StopIteration
        self._position = found + len(bytes) - 1
        return True


class HTTPEquivParser(object):
    """Mini parser for detecting http-equiv headers from meta tags """

    def __init__(self, data):
        """string - the data to work on """
        self.data = Bytes(data)
        self.headers = []

    def __call__(self):
        # NOTE(review): this method is damaged in the source this file was
        # recovered from. Only the assignment below survived; the next
        # statement is cut off in the middle of its first entry:
        #     dispatch = ( (mb, b"
        # and everything after it is missing. Restore the body from version
        # control -- nothing has been reconstructed by guesswork here.
        mb, mbp = self.data.match_bytes, self.data.match_bytes_pat  # noqa
        raise NotImplementedError(
            'HTTPEquivParser.__call__ was truncated in the source file')

    def handle_meta(self):
        # NOTE(review): this method is damaged as well. The surviving text is
        #     if self.data.current_byte not in space_chars_bytes:
        #         # if we have
        # The rest of the comment, the body of that branch and the remainder
        # of the method are missing. Restore from version control.
        raise NotImplementedError(
            'HTTPEquivParser.handle_meta was truncated in the source file')

    # NOTE(review): stray '")' residue follows both truncation points in the
    # damaged source, so further methods were probably defined between the
    # ones above and get_attribute(). Check against version control.

    def get_attribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None.

        Both items are bytes; the value is ``b""`` for an attribute without
        one. StopIteration from the underlying Bytes object propagates when
        the data ends in the middle of an attribute."""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(skip1)
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attr_name = []
        attr_value = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attr_name:
                break
            elif c in space_chars_bytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attr_name), b""
            elif c is None:
                return None
            else:
                attr_name.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            # Un-read the byte so the caller sees it again
            data.previous()
            return b"".join(attr_name), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quote_char = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quote_char:
                    next(data)
                    return b"".join(attr_name), b"".join(attr_value)
                # 10.4
                else:
                    attr_value.append(c)
        elif c == b">":
            return b"".join(attr_name), b""
        elif c is None:
            return None
        else:
            attr_value.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spaces_angle_brackets:
                return b"".join(attr_name), b"".join(attr_value)
            elif c is None:
                return None
            else:
                attr_value.append(c)