1"""Shared support for scanning document type declarations in HTML and XHTML.
2
3Backported for python-future from Python 3.3. Reason: ParserBase is an
4old-style class in the Python 2.7 source of markupbase.py, which I suspect
5might be the cause of sporadic unit-test failures on travis-ci.org with
6test_htmlparser.py.  The test failures look like this:
7
8    ======================================================================
9
10ERROR: test_attr_entity_replacement (future.tests.test_htmlparser.AttributesStrictTestCase)
11
12----------------------------------------------------------------------
13
14Traceback (most recent call last):
15  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 661, in test_attr_entity_replacement
16    [("starttag", "a", [("b", "&><\"'")])])
17  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 93, in _run_check
18    collector = self.get_collector()
19  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 617, in get_collector
20    return EventCollector(strict=True)
21  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 27, in __init__
22    html.parser.HTMLParser.__init__(self, *args, **kw)
23  File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 135, in __init__
24    self.reset()
25  File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 143, in reset
26    _markupbase.ParserBase.reset(self)
27
28TypeError: unbound method reset() must be called with ParserBase instance as first argument (got EventCollector instance instead)
29
30This module is used as a foundation for the html.parser module.  It has no
31documented public API and should not be used directly.
32
33"""
34
35import re
36
# Bound ``match`` of a pattern for a declaration name: one ASCII letter,
# then any run of letters, digits, '-', '_' or '.', plus trailing whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Bound ``match`` of a pattern for a single- or double-quoted string literal
# (no embedded quote of the same kind), plus trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Comment terminator: "--", optional whitespace, ">".
_commentclose = re.compile(r'--\s*>')
# Standard marked-section terminator: "]" "]" ">" with optional whitespace
# between the three characters.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# MS Office style marked-section terminator: "]", optional whitespace, ">".
_msmarkedsectionclose = re.compile(r']\s*>')

# The compiled patterns above are all this module needs from ``re``; drop the
# module name so it is not left in this module's namespace.
del re
48
49
50class ParserBase(object):
51    """Parser base class which provides some common support methods used
52    by the SGML/HTML and XHTML parsers."""
53
54    def __init__(self):
55        if self.__class__ is ParserBase:
56            raise RuntimeError(
57                "_markupbase.ParserBase must be subclassed")
58
59    def error(self, message):
60        raise NotImplementedError(
61            "subclasses of ParserBase must override error()")
62
63    def reset(self):
64        self.lineno = 1
65        self.offset = 0
66
67    def getpos(self):
68        """Return current line number and offset."""
69        return self.lineno, self.offset
70
71    # Internal -- update line number and offset.  This should be
72    # called for each piece of data exactly once, in order -- in other
73    # words the concatenation of all the input strings to this
74    # function should be exactly the entire input.
75    def updatepos(self, i, j):
76        if i >= j:
77            return j
78        rawdata = self.rawdata
79        nlines = rawdata.count("\n", i, j)
80        if nlines:
81            self.lineno = self.lineno + nlines
82            pos = rawdata.rindex("\n", i, j) # Should not fail
83            self.offset = j-(pos+1)
84        else:
85            self.offset = self.offset + j-i
86        return j
87
88    _decl_otherchars = ''
89
90    # Internal -- parse declaration (for use by subclasses).
91    def parse_declaration(self, i):
92        # This is some sort of declaration; in "HTML as
93        # deployed," this should only be the document type
94        # declaration ("<!DOCTYPE html...>").
95        # ISO 8879:1986, however, has more complex
96        # declaration syntax for elements in <!...>, including:
97        # --comment--
98        # [marked section]
99        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
100        # ATTLIST, NOTATION, SHORTREF, USEMAP,
101        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
102        rawdata = self.rawdata
103        j = i + 2
104        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
105        if rawdata[j:j+1] == ">":
106            # the empty comment <!>
107            return j + 1
108        if rawdata[j:j+1] in ("-", ""):
109            # Start of comment followed by buffer boundary,
110            # or just a buffer boundary.
111            return -1
112        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
113        n = len(rawdata)
114        if rawdata[j:j+2] == '--': #comment
115            # Locate --.*-- as the body of the comment
116            return self.parse_comment(i)
117        elif rawdata[j] == '[': #marked section
118            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
119            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
120            # Note that this is extended by Microsoft Office "Save as Web" function
121            # to include [if...] and [endif].
122            return self.parse_marked_section(i)
123        else: #all other declaration elements
124            decltype, j = self._scan_name(j, i)
125        if j < 0:
126            return j
127        if decltype == "doctype":
128            self._decl_otherchars = ''
129        while j < n:
130            c = rawdata[j]
131            if c == ">":
132                # end of declaration syntax
133                data = rawdata[i+2:j]
134                if decltype == "doctype":
135                    self.handle_decl(data)
136                else:
137                    # According to the HTML5 specs sections "8.2.4.44 Bogus
138                    # comment state" and "8.2.4.45 Markup declaration open
139                    # state", a comment token should be emitted.
140                    # Calling unknown_decl provides more flexibility though.
141                    self.unknown_decl(data)
142                return j + 1
143            if c in "\"'":
144                m = _declstringlit_match(rawdata, j)
145                if not m:
146                    return -1 # incomplete
147                j = m.end()
148            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
149                name, j = self._scan_name(j, i)
150            elif c in self._decl_otherchars:
151                j = j + 1
152            elif c == "[":
153                # this could be handled in a separate doctype parser
154                if decltype == "doctype":
155                    j = self._parse_doctype_subset(j + 1, i)
156                elif decltype in set(["attlist", "linktype", "link", "element"]):
157                    # must tolerate []'d groups in a content model in an element declaration
158                    # also in data attribute specifications of attlist declaration
159                    # also link type declaration subsets in linktype declarations
160                    # also link attribute specification lists in link declarations
161                    self.error("unsupported '[' char in %s declaration" % decltype)
162                else:
163                    self.error("unexpected '[' char in declaration")
164            else:
165                self.error(
166                    "unexpected %r char in declaration" % rawdata[j])
167            if j < 0:
168                return j
169        return -1 # incomplete
170
171    # Internal -- parse a marked section
172    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
173    def parse_marked_section(self, i, report=1):
174        rawdata= self.rawdata
175        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
176        sectName, j = self._scan_name( i+3, i )
177        if j < 0:
178            return j
179        if sectName in set(["temp", "cdata", "ignore", "include", "rcdata"]):
180            # look for standard ]]> ending
181            match= _markedsectionclose.search(rawdata, i+3)
182        elif sectName in set(["if", "else", "endif"]):
183            # look for MS Office ]> ending
184            match= _msmarkedsectionclose.search(rawdata, i+3)
185        else:
186            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
187        if not match:
188            return -1
189        if report:
190            j = match.start(0)
191            self.unknown_decl(rawdata[i+3: j])
192        return match.end(0)
193
194    # Internal -- parse comment, return length or -1 if not terminated
195    def parse_comment(self, i, report=1):
196        rawdata = self.rawdata
197        if rawdata[i:i+4] != '<!--':
198            self.error('unexpected call to parse_comment()')
199        match = _commentclose.search(rawdata, i+4)
200        if not match:
201            return -1
202        if report:
203            j = match.start(0)
204            self.handle_comment(rawdata[i+4: j])
205        return match.end(0)
206
207    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
208    # returning the index just past any whitespace following the trailing ']'.
209    def _parse_doctype_subset(self, i, declstartpos):
210        rawdata = self.rawdata
211        n = len(rawdata)
212        j = i
213        while j < n:
214            c = rawdata[j]
215            if c == "<":
216                s = rawdata[j:j+2]
217                if s == "<":
218                    # end of buffer; incomplete
219                    return -1
220                if s != "<!":
221                    self.updatepos(declstartpos, j + 1)
222                    self.error("unexpected char in internal subset (in %r)" % s)
223                if (j + 2) == n:
224                    # end of buffer; incomplete
225                    return -1
226                if (j + 4) > n:
227                    # end of buffer; incomplete
228                    return -1
229                if rawdata[j:j+4] == "<!--":
230                    j = self.parse_comment(j, report=0)
231                    if j < 0:
232                        return j
233                    continue
234                name, j = self._scan_name(j + 2, declstartpos)
235                if j == -1:
236                    return -1
237                if name not in set(["attlist", "element", "entity", "notation"]):
238                    self.updatepos(declstartpos, j + 2)
239                    self.error(
240                        "unknown declaration %r in internal subset" % name)
241                # handle the individual names
242                meth = getattr(self, "_parse_doctype_" + name)
243                j = meth(j, declstartpos)
244                if j < 0:
245                    return j
246            elif c == "%":
247                # parameter entity reference
248                if (j + 1) == n:
249                    # end of buffer; incomplete
250                    return -1
251                s, j = self._scan_name(j + 1, declstartpos)
252                if j < 0:
253                    return j
254                if rawdata[j] == ";":
255                    j = j + 1
256            elif c == "]":
257                j = j + 1
258                while j < n and rawdata[j].isspace():
259                    j = j + 1
260                if j < n:
261                    if rawdata[j] == ">":
262                        return j
263                    self.updatepos(declstartpos, j)
264                    self.error("unexpected char after internal subset")
265                else:
266                    return -1
267            elif c.isspace():
268                j = j + 1
269            else:
270                self.updatepos(declstartpos, j)
271                self.error("unexpected char %r in internal subset" % c)
272        # end of buffer reached
273        return -1
274
275    # Internal -- scan past <!ELEMENT declarations
276    def _parse_doctype_element(self, i, declstartpos):
277        name, j = self._scan_name(i, declstartpos)
278        if j == -1:
279            return -1
280        # style content model; just skip until '>'
281        rawdata = self.rawdata
282        if '>' in rawdata[j:]:
283            return rawdata.find(">", j) + 1
284        return -1
285
286    # Internal -- scan past <!ATTLIST declarations
287    def _parse_doctype_attlist(self, i, declstartpos):
288        rawdata = self.rawdata
289        name, j = self._scan_name(i, declstartpos)
290        c = rawdata[j:j+1]
291        if c == "":
292            return -1
293        if c == ">":
294            return j + 1
295        while 1:
296            # scan a series of attribute descriptions; simplified:
297            #   name type [value] [#constraint]
298            name, j = self._scan_name(j, declstartpos)
299            if j < 0:
300                return j
301            c = rawdata[j:j+1]
302            if c == "":
303                return -1
304            if c == "(":
305                # an enumerated type; look for ')'
306                if ")" in rawdata[j:]:
307                    j = rawdata.find(")", j) + 1
308                else:
309                    return -1
310                while rawdata[j:j+1].isspace():
311                    j = j + 1
312                if not rawdata[j:]:
313                    # end of buffer, incomplete
314                    return -1
315            else:
316                name, j = self._scan_name(j, declstartpos)
317            c = rawdata[j:j+1]
318            if not c:
319                return -1
320            if c in "'\"":
321                m = _declstringlit_match(rawdata, j)
322                if m:
323                    j = m.end()
324                else:
325                    return -1
326                c = rawdata[j:j+1]
327                if not c:
328                    return -1
329            if c == "#":
330                if rawdata[j:] == "#":
331                    # end of buffer
332                    return -1
333                name, j = self._scan_name(j + 1, declstartpos)
334                if j < 0:
335                    return j
336                c = rawdata[j:j+1]
337                if not c:
338                    return -1
339            if c == '>':
340                # all done
341                return j + 1
342
343    # Internal -- scan past <!NOTATION declarations
344    def _parse_doctype_notation(self, i, declstartpos):
345        name, j = self._scan_name(i, declstartpos)
346        if j < 0:
347            return j
348        rawdata = self.rawdata
349        while 1:
350            c = rawdata[j:j+1]
351            if not c:
352                # end of buffer; incomplete
353                return -1
354            if c == '>':
355                return j + 1
356            if c in "'\"":
357                m = _declstringlit_match(rawdata, j)
358                if not m:
359                    return -1
360                j = m.end()
361            else:
362                name, j = self._scan_name(j, declstartpos)
363                if j < 0:
364                    return j
365
366    # Internal -- scan past <!ENTITY declarations
367    def _parse_doctype_entity(self, i, declstartpos):
368        rawdata = self.rawdata
369        if rawdata[i:i+1] == "%":
370            j = i + 1
371            while 1:
372                c = rawdata[j:j+1]
373                if not c:
374                    return -1
375                if c.isspace():
376                    j = j + 1
377                else:
378                    break
379        else:
380            j = i
381        name, j = self._scan_name(j, declstartpos)
382        if j < 0:
383            return j
384        while 1:
385            c = self.rawdata[j:j+1]
386            if not c:
387                return -1
388            if c in "'\"":
389                m = _declstringlit_match(rawdata, j)
390                if m:
391                    j = m.end()
392                else:
393                    return -1    # incomplete
394            elif c == ">":
395                return j + 1
396            else:
397                name, j = self._scan_name(j, declstartpos)
398                if j < 0:
399                    return j
400
401    # Internal -- scan a name token and the new position and the token, or
402    # return -1 if we've reached the end of the buffer.
403    def _scan_name(self, i, declstartpos):
404        rawdata = self.rawdata
405        n = len(rawdata)
406        if i == n:
407            return None, -1
408        m = _declname_match(rawdata, i)
409        if m:
410            s = m.group()
411            name = s.strip()
412            if (i + len(s)) == n:
413                return None, -1  # end of buffer
414            return name.lower(), m.end()
415        else:
416            self.updatepos(declstartpos, i)
417            self.error("expected name token at %r"
418                       % rawdata[declstartpos:declstartpos+20])
419
420    # To be overridden -- handlers for unknown objects
421    def unknown_decl(self, data):
422        pass
423