# Names exported by a star-import of this module: the lookup function and
# the exception it raises when no suitable <meta> tag is found.
__all__ = ['findHTMLMeta', 'MetaNotFound']
2
3from html.parser import HTMLParser
4import html.entities
5import re
6import sys
7
8from openid.yadis.constants import YADIS_HEADER_NAME
9
# How much is requested from the stream per read() call. Each chunk is
# handed to the parser as soon as it has been read, so this is also the
# granularity at which the document is searched.
CHUNK_SIZE = 16 * 1024  # 16 KB
13
14
class ParseDone(Exception):
    """Raised by the parser to signal that parsing is finished.

    The first exception argument (``args[0]``) carries the URI that was
    located. It is None when the parse finished without finding the
    URI."""
19
20
class MetaNotFound(Exception):
    """Raised when the appropriate <meta> tag was not found.

    The first exception argument holds the content of the page that was
    searched."""
24
25
# Flags for the entity pattern below:
#   IGNORECASE - the "x" marker and the hex digits match in either case
#   UNICODE    - \d and \w use Unicode character classes
#   VERBOSE    - whitespace inside the pattern is ignored, which is why
#                the literal "#" characters are backslash-escaped
re_flags = re.IGNORECASE | re.UNICODE | re.VERBOSE

# Matches one HTML character/entity reference: "&", then exactly one of
#   hex  - "#x" followed by hexadecimal digits   (e.g. &#x41;)
#   dec  - "#" followed by decimal digits        (e.g. &#65;)
#   word - a run of word characters              (e.g. &amp;)
# and finally the terminating ";". Only the alternative that matched
# has its named group set.
ent_pat = r'''
&

(?: \#x (?P<hex> [a-f0-9]+ )
|   \# (?P<dec> \d+ )
|   (?P<word> \w+ )
)

;'''

# Compiled once at import time; used for every substitution.
ent_re = re.compile(ent_pat, re_flags)
38
39
def substituteMO(mo):
    """Return the replacement text for one matched entity reference.

    Intended as the replacement callback for a pattern whose match has
    exactly one of the named groups ``hex``, ``dec`` or ``word`` set.

    @param mo: match object for a single ``&...;`` reference

    @return: the character the reference denotes, or the matched text
        unchanged when the reference cannot be decoded (unknown entity
        name, or a numeric value that is not a valid code point)
    @rtype: str
    """
    try:
        if mo.lastgroup == 'hex':
            codepoint = int(mo.group('hex'), 16)
        elif mo.lastgroup == 'dec':
            codepoint = int(mo.group('dec'))
        else:
            assert mo.lastgroup == 'word'
            # Unknown names yield None and are passed through below.
            codepoint = html.entities.name2codepoint.get(mo.group('word'))

        if codepoint is None:
            return mo.group()

        return chr(codepoint)
    except (ValueError, OverflowError):
        # The input is arbitrary page content. chr() rejects values
        # outside the Unicode range, and int() may refuse extremely long
        # digit strings. Leave such references untouched instead of
        # aborting the whole substitution.
        return mo.group()
53
54
def substituteEntities(s):
    """Decode the HTML character and entity references found in a string.

    Every match of the module's entity pattern is passed through
    substituteMO and replaced by whatever that callback returns.

    @param s: text that may contain ``&...;`` references
    @type s: str

    @return: the text with the references replaced
    @rtype: str
    """
    decoded = ent_re.sub(substituteMO, s)
    return decoded
57
58
class YadisHTMLParser(HTMLParser):
    """Parser that finds a meta http-equiv tag in the head of a html
    document.

    When feeding in data, if the tag is matched or it will never be
    found, the parser will raise ParseDone with the uri as the first
    attribute.

    A matching <meta> tag that carries no ``content`` attribute is
    ignored, and scanning of the head continues.

    Parsing state diagram
    =====================

    Any unlisted input does not affect the state::

                1, 2, 5                       8
               +--------------------------+  +-+
               |                          |  | |
            4  |    3       1, 2, 5, 7    v  | v
        TOP -> HTML -> HEAD ----------> TERMINATED
        | |            ^  |               ^  ^
        | | 3          |  |               |  |
        | +------------+  +-> FOUND ------+  |
        |                  6         8       |
        | 1, 2                               |
        +------------------------------------+

      1. any of </body>, </html>, </head> -> TERMINATE
      2. <body> -> TERMINATE
      3. <head> -> HEAD
      4. <html> -> HTML
      5. <html> -> TERMINATE
      6. <meta http-equiv='X-XRDS-Location'> -> FOUND
      7. <head> -> TERMINATE
      8. Any input -> TERMINATE
    """
    # Parser phases; see the state diagram in the class docstring.
    TOP = 0
    HTML = 1
    HEAD = 2
    FOUND = 3
    TERMINATED = 4

    def __init__(self):
        """Create a parser positioned in the TOP phase."""
        # Compare the full (major, minor) pair; looking at the minor
        # version alone would misfire for any other major version whose
        # minor number happens to be small.
        if sys.version_info < (3, 3):
            # Python 3.2 and below actually require the `strict` argument
            # to `html.parser.HTMLParser` -- otherwise it's deprecated and
            # we don't want to pass it
            super().__init__(strict=False)
        else:
            super().__init__()
        self.phase = self.TOP

    def _terminate(self):
        """Enter the TERMINATED phase and raise ParseDone(None)."""
        self.phase = self.TERMINATED
        raise ParseDone(None)

    def handle_endtag(self, tag):
        """Stop parsing on </head>, </body> or </html>."""
        # If we ever see an end of head, body, or html, bail out right away.
        # [1]
        if tag in ('head', 'body', 'html'):
            self._terminate()

    def handle_starttag(self, tag, attrs):
        """Advance the state machine for one start tag.

        @param tag: tag name as delivered by HTMLParser
        @param attrs: list of (name, value) pairs; value is None for an
            attribute written without a value

        @raises ParseDone: with the located URI when the Yadis <meta>
            tag is found, or with None when the tag can no longer occur
        """
        # if we ever see a start body tag, bail out right away, since
        # we want to prevent the meta tag from appearing in the body
        # [2]
        if tag == 'body':
            self._terminate()

        if self.phase == self.TOP:
            # At the top level, allow a html tag or a head tag to move
            # to the head or html phase
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [4]
                self.phase = self.HTML

        elif self.phase == self.HTML:
            # if we are in the html tag, allow a head tag to move to
            # the HEAD phase. If we get another html tag, then bail
            # out
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [5]
                self._terminate()

        elif self.phase == self.HEAD:
            # If we are in the head phase, look for the appropriate
            # meta tag. If we get a head or body tag, bail out.
            if tag == 'meta':
                attrs_d = dict(attrs)
                # A valueless attribute (<meta http-equiv>) is reported
                # with value None, so normalise to '' before lower().
                http_equiv = (attrs_d.get('http-equiv') or '').lower()
                if http_equiv == YADIS_HEADER_NAME.lower():
                    raw_attr = attrs_d.get('content')
                    if raw_attr is None:
                        # No usable location on this tag; keep scanning
                        # the head instead of failing on None.
                        return
                    # NOTE(review): the html.parser documentation says
                    # references in attribute values are already
                    # replaced before this handler runs, so this second
                    # pass may double-decode -- confirm before changing.
                    yadis_loc = substituteEntities(raw_attr)
                    # [6]
                    self.phase = self.FOUND
                    raise ParseDone(yadis_loc)

            elif tag in ('head', 'html'):
                # [5], [7]
                self._terminate()

    def feed(self, chars):
        """Feed more text to the parser.

        Once the parser is in the FOUND or TERMINATED phase, any further
        input raises ParseDone(None).
        """
        # [8]
        if self.phase in (self.TERMINATED, self.FOUND):
            self._terminate()

        return super().feed(chars)
170
171
def findHTMLMeta(stream):
    """Search an HTML document for the YADIS meta http-equiv tag.

    The stream is consumed CHUNK_SIZE at a time and each chunk is fed to
    a YadisHTMLParser until the parser reports that it is done.

    @param stream: Source of the html text
    @type stream: Object that implements a read() method that works
        like file.read

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was
        searched as the first parameter.
    """
    html_parser = YadisHTMLParser()
    pieces = []

    while True:
        piece = stream.read(CHUNK_SIZE)
        if not piece:
            # Nothing left to read.
            break

        pieces.append(piece)
        try:
            html_parser.feed(piece)
        except ParseDone as done:
            location = done.args[0]
            if location is not None:
                return location

            # The parser gave up without a result. Collect whatever is
            # left so the exception carries the complete document.
            pieces.append(stream.read())
            break

    raise MetaNotFound(''.join(pieces))
208