1"""
2This module implements a VERY limited parser that finds <link> tags in
3the head of HTML or XHTML documents and parses out their attributes
4according to the OpenID spec. It is a liberal parser, but it requires
5these things from the data in order to work:
6
7 - There must be an open <html> tag
8
9 - There must be an open <head> tag inside of the <html> tag
10
11 - Only <link>s that are found inside of the <head> tag are parsed
12   (this is by design)
13
14 - The parser follows the OpenID specification in resolving the
15   attributes of the link tags. This means that the attributes DO NOT
16   get resolved as they would by an XML or HTML parser. In particular,
17   only certain entities get replaced, and href attributes do not get
18   resolved relative to a base URL.
19
20From http://openid.net/specs.bml#linkrel:
21
22 - The openid.server URL MUST be an absolute URL. OpenID consumers
23   MUST NOT attempt to resolve relative URLs.
24
25 - The openid.server URL MUST NOT include entities other than &amp;,
26   &lt;, &gt;, and &quot;.
27
28The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds of
29quoting are allowed for attributes.
30
31The parser deals with invalid markup in these ways:
32
33 - Tag names are not case-sensitive
34
35 - The <html> tag is accepted even when it is not at the top level
36
37 - The <head> tag is accepted even when it is not a direct child of
38   the <html> tag, but a <html> tag must be an ancestor of the <head>
39   tag
40
41 - <link> tags are accepted even when they are not direct children of
42   the <head> tag, but a <head> tag must be an ancestor of the <link>
43   tag
44
45 - If there is no closing tag for an open <html> or <head> tag, the
46   remainder of the document is viewed as being inside of the tag. If
47   there is no closing tag for a <link> tag, the link tag is treated
48   as a short tag. Exceptions to this rule are that <html> closes
49   <html> and <body> or <head> closes <head>
50
51 - Attributes of the <link> tag are not required to be quoted.
52
53 - In the case of duplicated attribute names, the attribute coming
54   last in the tag will be the value returned.
55
56 - Any text that does not parse as an attribute within a link tag will
57   be ignored. (e.g. <link pumpkin rel='openid.server' /> will ignore
58   pumpkin)
59
 - If there is more than one <html> or <head> tag, the parser only
   looks inside of the first one.
62
 - The contents of closed <script> tags are ignored entirely. An
   unclosed <script> tag is itself ignored, so the markup following
   it is still parsed.
65
66 - Any other invalid markup is ignored, including unclosed SGML
67   comments and unclosed <![CDATA[blocks.
68"""
69
# Names exported by "from <this module> import *". Only parseLinkAttrs is
# listed; the other helpers defined in this module have to be imported by
# name.
__all__ = ['parseLinkAttrs']
71
72import re
73
# Regex flags shared by the patterns compiled in this module:
#   DOTALL     - '.' also matches newlines
#   IGNORECASE - tag names are matched without regard to case
#   VERBOSE    - patterns may contain whitespace and comments
#   UNICODE    - \b and \w follow Unicode rules
flags = re.DOTALL | re.IGNORECASE | re.VERBOSE | re.UNICODE

# Markup that is deleted from the document before any tag matching is
# done: comments, CDATA blocks and closed (non-namespaced) script blocks.
removed_re = re.compile(r'''
  # Comments
  <!--.*?-->

  # CDATA blocks
| <!\[CDATA\[.*?\]\]>

  # script blocks
| <script\b

  # make sure script is not an XML namespace
  (?!:)

  [^>]*>.*?</script>

''', flags)

# Template for a tag matcher. It is filled in by tagMatcher() with the
# keys "tag_name" and "closers" and exposes the named groups "attrs" and
# "contents" ("contents" is unset when the tag is a short tag).
tag_expr = r'''
# Starts with the tag name at a word boundary, where the tag name is
# not a namespace
<%(tag_name)s\b(?!:)

# All of the stuff up to a ">", hopefully attributes.
(?P<attrs>[^>]*?)

(?: # Match a short tag
    />

|   # Match a full tag
    >

    (?P<contents>.*?)

    # Closed by
    (?: # One of the specified close tags
        </?%(closers)s\s*>

        # End of the string
    |   \Z

    )

)
'''


def tagMatcher(tag_name, *close_tags):
    """Compile a matcher for tag_name from the tag_expr template.

    The contents of the tag end at the first opening or closing tag
    named tag_name or any of close_tags, or at the end of the input.

    @param tag_name: the name of the tag to match
    @type tag_name: str

    @param close_tags: names of additional tags that end the contents
    @type close_tags: str

    @return: a compiled pattern with "attrs" and "contents" groups
    """
    if not close_tags:
        closers = tag_name
    else:
        alternatives = (tag_name, ) + close_tags
        closers = '(?:%s)' % '|'.join(alternatives)

    substitutions = {'tag_name': tag_name, 'closers': closers}
    return re.compile(tag_expr % substitutions, flags)
136
137
# Must contain at least an open html and an open head tag
# html_find: contents of an <html> tag run up to the next opening or
# closing html tag, or to the end of the input.
html_find = tagMatcher('html')
# head_find: contents of a <head> tag also end at a <body> or </body> tag.
head_find = tagMatcher('head', 'body')
# link_find only locates the "<link" opener (and rejects namespaced
# "<link:" names); the attributes that follow are scanned with attr_find.
link_find = re.compile(r'<link\b(?!:)', flags)

# attr_find matches one of two things:
#   - name=value, where a quoted value lands in group "q_val" and an
#     unquoted value in group "unq_val". An unquoted value stops at
#     whitespace, "<", ">" or a "/" that is directly followed by ">".
#   - a single "<" or ">" in group "end_link", which callers use as the
#     signal that the current tag is over.
attr_find = re.compile(r'''
# Must start with a sequence of word-characters, followed by an equals sign
(?P<attr_name>\w+)=

# Then either a quoted or unquoted attribute
(?:

 # Match everything that\'s between matching quote marks
 (?P<qopen>["\'])(?P<q_val>.*?)(?P=qopen)
|

 # If the value is not quoted, match up to whitespace
 (?P<unq_val>(?:[^\s<>/]|/(?!>))+)
)

|

(?P<end_link>[<>])
''', flags)
162
# Entity replacement: the only entities that get expanded in attribute
# values, keyed by entity name.
replacements = dict(
    amp='&',
    lt='<',
    gt='>',
    quot='"',
)

# Matches "&name;" for exactly the entity names listed above
# (case-sensitive; group 1 is the entity name).
ent_replace = re.compile(r'&(%s);' % '|'.join(replacements))


def replaceEnt(mo):
    """Return the replacement text for an entity matched by ent_replace.

    @param mo: a match object whose group 1 is the entity name

    @return: the character for a known entity, otherwise the matched
        text unchanged
    """
    entity_name = mo.group(1)
    if entity_name in replacements:
        return replacements[entity_name]
    return mo.group()
177
178
def parseLinkAttrs(html, ignore_errors=False):
    """Find all link tags in the head of a HTML document and return a
    list of their attributes.

    Comments, CDATA blocks and closed script blocks are removed first.
    Links are then looked for inside the first <head> match found within
    the contents of the first <html> match. Attribute values have the
    entities handled by ent_replace expanded.

    @param html: the text to parse. Byte strings are decoded as UTF-8,
        falling back to Latin-1 when the data is not valid UTF-8.
    @type html: str or bytes

    @param ignore_errors: accepted for backward compatibility. The
        Latin-1 fallback decodes any byte sequence, so decoding cannot
        fail and this flag has no effect.
    @type ignore_errors: bool

    @return: A list of dictionaries of attributes, one for each link tag,
        in document order. The list is empty when no <html> or <head>
        tag with contents is found.
    @rtype: [{str: str}]
    """
    if isinstance(html, bytes):
        # Attempt to decode as UTF-8, since that's the most modern.
        try:
            html = html.decode("utf-8")
        except UnicodeDecodeError:
            # Latin-1 is suggested by HTTP/1.1. It assigns a character to
            # every byte value, so this fallback never raises.
            html = html.decode("latin1")

    stripped = removed_re.sub('', html)

    html_mo = html_find.search(stripped)
    if html_mo is None or html_mo.start('contents') == -1:
        # No <html> tag, or it is a short tag without contents
        return []

    html_start, html_end = html_mo.span('contents')
    head_mo = head_find.search(stripped, html_start, html_end)
    if head_mo is None or head_mo.start('contents') == -1:
        # No <head> tag inside <html>, or it is a short tag
        return []

    matches = []
    # The search covers the whole head match, not just its contents.
    link_mos = link_find.finditer(stripped, head_mo.start(), head_mo.end())
    for link_mo in link_mos:
        link_attrs = {}

        # link_find matches exactly "<link", so the attributes start at
        # the end of the match. Scanning is not bounded by the head tag;
        # it stops at the first "<" or ">" instead.
        for attr_mo in attr_find.finditer(stripped, link_mo.end()):
            if attr_mo.lastgroup == 'end_link':
                break

            # Either q_val or unq_val must be present, but not both
            # unq_val is a True (non-empty) value if it is present
            attr_name, q_val, unq_val = attr_mo.group('attr_name', 'q_val',
                                                      'unq_val')
            attr_val = ent_replace.sub(replaceEnt, unq_val or q_val)

            # A repeated attribute name overwrites the earlier value
            link_attrs[attr_name] = attr_val

        matches.append(link_attrs)

    return matches
241
242
def relMatches(rel_attr, target_rel):
    """Does target_rel appear as one of the link types in rel_attr?

    rel_attr is treated as a whitespace-separated list of link types.
    The comparison ignores case on both sides.

    @param rel_attr: the value of a rel attribute
    @type rel_attr: str

    @param target_rel: the link type to look for
    @type target_rel: str

    @return: whether target_rel is one of the link types in rel_attr
    @rtype: bool
    """
    # Lower-case the target as well, so that a mixed-case target_rel can
    # still match the lower-cased entries.
    wanted = target_rel.lower()
    return any(rel.lower() == wanted for rel in rel_attr.split())
253
254
def linkHasRel(link_attrs, target_rel):
    """Does this link have target_rel as a relationship?

    @param link_attrs: the attributes of one link tag
    @type link_attrs: dict

    @param target_rel: the relationship to look for
    @type target_rel: str

    @return: the falsy rel value itself (None or '') when the link has
        no usable rel attribute, otherwise the result of relMatches
    """
    # XXX: TESTME
    rel_value = link_attrs.get('rel')
    if not rel_value:
        # Missing or empty rel attribute: nothing to compare against
        return rel_value
    return relMatches(rel_value, target_rel)
260
261
def findLinksRel(link_attrs_list, target_rel):
    """Return the entries of link_attrs_list that have target_rel as a
    relationship, in their original order.

    @param link_attrs_list: attribute dictionaries, one per link tag

    @param target_rel: the relationship to look for
    @type target_rel: str

    @return: a new list holding the matching attribute dictionaries
    """
    # XXX: TESTME
    return [
        attrs for attrs in link_attrs_list
        if linkHasRel(attrs, target_rel)
    ]
268
269
def findFirstHref(link_attrs_list, target_rel):
    """Return the href attribute of the first link tag in the list that
    has target_rel as a relationship.

    @param link_attrs_list: attribute dictionaries, one per link tag

    @param target_rel: the relationship to look for
    @type target_rel: str

    @return: the href value, or None when no link matches or the first
        matching link has no href attribute
    """
    # XXX: TESTME
    candidates = findLinksRel(link_attrs_list, target_rel)
    return candidates[0].get('href') if candidates else None
279