"""Locate the Yadis ``<meta http-equiv>`` tag in the head of an HTML document.

The public entry point is `findHTMLMeta`, which reads a stream in chunks
and returns the value of the ``content`` attribute of the first matching
``<meta>`` tag found inside ``<head>``, or raises `MetaNotFound`.
"""

__all__ = ['findHTMLMeta', 'MetaNotFound']

from html.parser import HTMLParser
import html.entities
import re
import sys

from openid.yadis.constants import YADIS_HEADER_NAME

# Size of the chunks to search at a time (also the amount that gets
# read at a time)
CHUNK_SIZE = 1024 * 16  # 16 KB


class ParseDone(Exception):
    """Exception to hold the URI that was located when the parse is
    finished. If the parse finishes without finding the URI, set it to
    None."""


class MetaNotFound(Exception):
    """Exception to hold the content of the page if we did not find
    the appropriate <meta> tag"""


re_flags = re.IGNORECASE | re.UNICODE | re.VERBOSE
ent_pat = r'''
&

(?: \#x (?P<hex> [a-f0-9]+ )
|   \#  (?P<dec> \d+ )
|       (?P<word> \w+ )
)

;'''

ent_re = re.compile(ent_pat, re_flags)


def substituteMO(mo):
    """Return the replacement text for one entity match of ``ent_re``.

    Hexadecimal (``&#x..;``) and decimal (``&#..;``) references are
    converted to the character with that code point; named references
    are looked up in ``html.entities.name2codepoint``.

    References that cannot be resolved -- an unknown name, or a number
    that is not a valid code point -- are returned unchanged instead of
    raising.
    """
    original = mo.group()
    try:
        if mo.lastgroup == 'hex':
            codepoint = int(mo.group('hex'), 16)
        elif mo.lastgroup == 'dec':
            codepoint = int(mo.group('dec'))
        else:
            # The pattern has exactly three alternatives, so this is
            # the named-entity case.
            codepoint = html.entities.name2codepoint.get(mo.group('word'))

        if codepoint is None:
            return original

        # chr() rejects values outside the Unicode range; int() may
        # reject extremely long digit strings.  Either way the
        # reference is not decodable, so leave it as written.
        return chr(codepoint)
    except (ValueError, OverflowError):
        return original


def substituteEntities(s):
    """Return ``s`` with every resolvable entity reference replaced by
    the character it denotes (see `substituteMO`)."""
    return ent_re.sub(substituteMO, s)


class YadisHTMLParser(HTMLParser):
    """Parser that finds a meta http-equiv tag in the head of a html
    document.

    When feeding in data, if the tag is matched or it will never be
    found, the parser will raise ParseDone with the uri as the first
    attribute.

    Parsing state diagram
    =====================

    Any unlisted input does not affect the state::

                1, 2, 5                       8
               +--------------------------+  +-+
               |                          |  | |
            4  |    3       1, 2, 5, 7    v  | v
        TOP -> HTML -> HEAD ----------> TERMINATED
        | |            ^  |               ^  ^
        | | 3          |  |               |  |
        | +------------+  +-> FOUND ------+  |
        |                  6         8       |
        | 1, 2                               |
        +------------------------------------+

      1. any of </body>, </html>, </head> -> TERMINATE
      2. <body> -> TERMINATE
      3. <head> -> HEAD
      4. <html> -> HTML
      5. <html> -> TERMINATE
      6. <meta http-equiv='X-XRDS-Location'> -> FOUND
      7. <head> -> TERMINATE
      8. Any input -> TERMINATE
    """
    TOP = 0
    HTML = 1
    HEAD = 2
    FOUND = 3
    TERMINATED = 4

    def __init__(self):
        """Create a parser positioned in the TOP state."""
        if sys.version_info[:2] < (3, 3):
            # Python 3.2 and below actually require the `strict` argument
            # to `html.parser.HTMLParser` -- otherwise it's deprecated and
            # we don't want to pass it.  Compare (major, minor) together
            # so that a later major version with a small minor number is
            # not mistaken for an old interpreter.
            super(YadisHTMLParser, self).__init__(strict=False)
        else:
            super(YadisHTMLParser, self).__init__()
        self.phase = self.TOP

    def _terminate(self):
        """Enter the TERMINATED state and signal that no URI was found.

        @raises ParseDone: always, with None as the first argument.
        """
        self.phase = self.TERMINATED
        raise ParseDone(None)

    def handle_endtag(self, tag):
        """Stop the parse on any closing head, body or html tag."""
        # If we ever see an end of head, body, or html, bail out right away.
        # [1]
        if tag in ('head', 'body', 'html'):
            self._terminate()

    def handle_starttag(self, tag, attrs):
        """Advance the state machine for an opening tag.

        @raises ParseDone: with the located URI when the Yadis meta tag
            is found in the head, or with None when the tag can no
            longer appear.
        """
        # if we ever see a start body tag, bail out right away, since
        # we want to prevent the meta tag from appearing in the body
        # [2]
        if tag == 'body':
            self._terminate()

        if self.phase == self.TOP:
            # At the top level, allow a html tag or a head tag to move
            # to the head or html phase
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [4]
                self.phase = self.HTML

        elif self.phase == self.HTML:
            # if we are in the html tag, allow a head tag to move to
            # the HEAD phase. If we get another html tag, then bail
            # out
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [5]
                self._terminate()

        elif self.phase == self.HEAD:
            # If we are in the head phase, look for the appropriate
            # meta tag. If we get a head or html tag, bail out.
            if tag == 'meta':
                attrs_d = dict(attrs)
                # A valueless attribute (<meta http-equiv>) is reported
                # with the value None, so fall back to '' before
                # lowercasing.
                http_equiv = (attrs_d.get('http-equiv') or '').lower()
                if http_equiv == YADIS_HEADER_NAME.lower():
                    raw_attr = attrs_d.get('content')
                    # A matching tag without a usable content attribute
                    # carries no location; ignore it and keep scanning
                    # the head rather than failing on None.
                    if raw_attr is not None:
                        # NOTE(review): html.parser documents that it
                        # already replaces entity references in
                        # attribute values, so this pass may decode a
                        # second time.  Kept for backward
                        # compatibility -- confirm before removing.
                        yadis_loc = substituteEntities(raw_attr)
                        # [6]
                        self.phase = self.FOUND
                        raise ParseDone(yadis_loc)

            elif tag in ('head', 'html'):
                # [5], [7]
                self._terminate()

    def feed(self, chars):
        """Feed more text to the parser.

        @raises ParseDone: with None if the parser had already found
            the tag or terminated before this call; otherwise whatever
            the tag handlers raise while processing ``chars``.
        """
        # [8]
        if self.phase in (self.TERMINATED, self.FOUND):
            self._terminate()

        return super(YadisHTMLParser, self).feed(chars)


def findHTMLMeta(stream):
    """Look for a meta http-equiv tag with the YADIS header name.

    @param stream: Source of the html text
    @type stream: Object that implements a read() method that works
        like file.read

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was
        searched as the first parameter.
    """
    parser = YadisHTMLParser()
    chunks = []

    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            # End of file
            break

        chunks.append(chunk)
        try:
            parser.feed(chunk)
        except ParseDone as why:
            uri = why.args[0]
            if uri is not None:
                return uri

            # Parse finished without a URI, but the exception we raise
            # below carries the whole document, so read the rest of it.
            chunks.append(stream.read())
            break

    content = ''.join(chunks)
    raise MetaNotFound(content)