1# Parsers for XML and HTML
2
3from lxml.includes cimport xmlparser
4from lxml.includes cimport htmlparser
5
6
class ParseError(LxmlSyntaxError):
    """Syntax error raised while parsing an XML document.

    Provided for compatibility with ElementTree 1.3 and later.

    The column is kept zero-based in ``offset`` and is exposed one-based
    through the ``position`` property.
    """
    def __init__(self, message, code, line, column, filename=None):
        super(_ParseError, self).__init__(message)
        # the constructor receives a one-based column, 'offset' is zero-based
        zero_based_column = column - 1
        self.lineno = line
        self.offset = zero_based_column
        self.code = code
        self.filename = filename

    @property
    def position(self):
        """The error location as a ``(line, column)`` pair, column one-based."""
        return (self.lineno, self.offset + 1)

    @position.setter
    def position(self, new_pos):
        line, column = new_pos
        self.lineno = line
        self.offset = column - 1

# Module-level alias of the class, used by ParseError.__init__() in its
# super() call.
cdef object _ParseError = ParseError
28
29
class XMLSyntaxError(ParseError):
    """Syntax error while parsing an XML document.

    Defines nothing of its own; all behaviour is inherited from ParseError.
    """
33
cdef class ParserError(LxmlError):
    """Internal lxml parser error.

    Defines nothing of its own; all behaviour is inherited from LxmlError.
    """
37
38
@cython.final
@cython.internal
cdef class _ParserDictionaryContext:
    # Global parser context to share the string dictionary.
    #
    # This class is a delegate singleton!
    #
    # It creates _ParserDictionaryContext objects for each thread to keep thread state,
    # but those must never be used directly.  Always stick to using the static
    # __GLOBAL_PARSER_CONTEXT as defined below the class.
    #

    # libxml2 string dictionary of this context; NULL until it is first
    # needed, released with xmlDictFree() in __dealloc__()
    cdef tree.xmlDict* _c_dict
    # default parser of this context, set explicitly or created lazily
    # by getDefaultParser()
    cdef _BaseParser _default_parser
    # stack of implied parser contexts, the last item is the current one
    cdef list _implied_parser_contexts

    def __cinit__(self):
        self._c_dict = NULL
        self._implied_parser_contexts = []

    def __dealloc__(self):
        # drop the reference that this context holds on its dictionary
        if self._c_dict is not NULL:
            xmlparser.xmlDictFree(self._c_dict)

    cdef void initMainParserContext(self):
        u"""Put the global context into the thread dictionary of the main
        thread.  To be called once and only in the main thread."""
        thread_dict = python.PyThreadState_GetDict()
        # nothing is registered if there is no thread state dict
        if thread_dict is not NULL:
            (<dict>thread_dict)[u"_ParserDictionaryContext"] = self

    cdef _ParserDictionaryContext _findThreadParserContext(self):
        u"Find (or create) the _ParserDictionaryContext object for the current thread"
        cdef _ParserDictionaryContext context
        thread_dict = python.PyThreadState_GetDict()
        if thread_dict is NULL:
            # no thread state dict available => fall back to this context
            return self
        d = <dict>thread_dict
        result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
        if result is not NULL:
            # this thread already has a context registered
            return <object>result
        # first use in this thread => create a context and register it
        context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
        d[u"_ParserDictionaryContext"] = context
        return context

    cdef void setDefaultParser(self, _BaseParser parser):
        u"Set the default parser for the current thread"
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._default_parser = parser

    cdef _BaseParser getDefaultParser(self):
        u"Return (or create) the default parser of the current thread"
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        if context._default_parser is None:
            # 'self' keeps a copy of __DEFAULT_XML_PARSER as its own default
            if self._default_parser is None:
                self._default_parser = __DEFAULT_XML_PARSER._copy()
            # a different thread context gets its own copy of that parser
            if context is not self:
                context._default_parser = self._default_parser._copy()
        return context._default_parser

    cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
        u"Return the thread-local dict or create a new one if necessary."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        if context._c_dict is NULL:
            # thread dict not yet set up => use default or create a new one
            if default is not NULL:
                # adopt the caller's dict and take a reference for the context
                context._c_dict = default
                xmlparser.xmlDictReference(default)
                return default
            if self._c_dict is NULL:
                self._c_dict = xmlparser.xmlDictCreate()
            if context is not self:
                # other thread contexts get a sub-dict of the dict of 'self'
                context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
        return context._c_dict

    cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
        u"""Make c_dict_ref[0] refer to the dict of the current thread.

        If it refers to a different dict, that dict is freed with
        xmlDictFree() and a new reference to the thread dict is stored
        instead."""
        c_dict = c_dict_ref[0]
        c_thread_dict = self._getThreadDict(c_dict)
        if c_dict is c_thread_dict:
            # already the thread dict => nothing to replace
            return
        if c_dict is not NULL:
            xmlparser.xmlDictFree(c_dict)
        c_dict_ref[0] = c_thread_dict
        # the stored pointer counts as one more reference to the thread dict
        xmlparser.xmlDictReference(c_thread_dict)

    cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
        u"Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        pctxt.dictNames = 1

    cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
        u"Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)

    cdef void initDocDict(self, xmlDoc* result):
        u"Store dict of last object parsed if no shared dict yet"
        # XXX We also free the result dict here if there already was one.
        # This case should only occur for new documents with empty dicts,
        # otherwise we'd free data that's in use => segfault
        self.initThreadDictRef(&result.dict)

    cdef _ParserContext findImpliedContext(self):
        u"""Return any current implied xml parser context for the current
        thread.  This is used when the resolver functions are called
        with an xmlParserCtxt that was generated from within libxml2
        (i.e. without a _ParserContext) - which happens when parsing
        schema and xinclude external references."""
        cdef _ParserDictionaryContext context
        cdef _ParserContext implied_context

        # see if we have a current implied parser
        context = self._findThreadParserContext()
        if context._implied_parser_contexts:
            # the most recently pushed context is the current one
            implied_context = context._implied_parser_contexts[-1]
            return implied_context
        return None

    cdef void pushImpliedContextFromParser(self, _BaseParser parser):
        u"Push a new implied context object taken from the parser."
        if parser is not None:
            self.pushImpliedContext(parser._getParserContext())
        else:
            # push None for a missing parser
            self.pushImpliedContext(None)

    cdef void pushImpliedContext(self, _ParserContext parser_context):
        u"Push a new implied context object."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._implied_parser_contexts.append(parser_context)

    cdef void popImpliedContext(self):
        u"Pop the current implied context object."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._implied_parser_contexts.pop()
177
# The delegate singleton.  It is created once at module level and stores
# itself in the thread state dict of the thread that runs this code.
cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
__GLOBAL_PARSER_CONTEXT.initMainParserContext()
180
181############################################################
182## support for Python unicode I/O
183############################################################
184
# name of Python Py_UNICODE encoding as known to libxml2
# (stays NULL unless _setupPythonUnicode() finds an encoding handler for it)
cdef const_char* _PY_UNICODE_ENCODING = NULL
187
cdef int _setupPythonUnicode() except -1:
    u"""Sets _PY_UNICODE_ENCODING to the internal encoding name of Python unicode
    strings if libxml2 supports reading native Python unicode.  This depends
    on iconv and the local Python installation, so we simply check if we find
    a matching encoding handler.

    Always returns 0.  _PY_UNICODE_ENCODING is left untouched when no
    encoding name or no encoding handler could be found.
    """
    cdef tree.xmlCharEncodingHandler* enchandler
    cdef const_char* enc
    # a short sample document in the in-memory layout of Py_UNICODE
    cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
    cdef const_xmlChar* buffer = <const_xmlChar*>uchars
    global _PY_UNICODE_ENCODING

    # apparently, libxml2 can't detect UTF-16 on some systems
    # => look at the byte pattern of the first two characters ourselves
    if (buffer[0] == c'<' and buffer[1] == c'\0' and
            buffer[2] == c't' and buffer[3] == c'\0'):
        enc = "UTF-16LE"
    elif (buffer[0] == c'\0' and buffer[1] == c'<' and
            buffer[2] == c'\0' and buffer[3] == c't'):
        enc = "UTF-16BE"
    else:
        # let libxml2 give it a try
        enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
        if enc is NULL:
            # not my fault, it's YOUR broken system :)
            return 0

    # only remember the encoding if libxml2 has a handler for it
    enchandler = tree.xmlFindCharEncodingHandler(enc)
    if enchandler is not NULL:
        # the handler was only needed for the lookup => close it again
        tree.xmlCharEncCloseFunc(enchandler)
        _PY_UNICODE_ENCODING = enc
    return 0
218
cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
    u"""Work around bug in libxml2: find iconv name of encoding on our own.

    Runs libxml2's encoding detection on the first 'size' bytes of 'buffer'
    and maps the result to an encoding name.  Returns NULL if no encoding
    was detected.
    """
    cdef tree.xmlCharEncoding detected = tree.xmlDetectCharEncoding(buffer, size)

    if detected == tree.XML_CHAR_ENCODING_NONE:
        return NULL
    if detected == tree.XML_CHAR_ENCODING_UTF16BE:
        return "UTF-16BE"
    if detected == tree.XML_CHAR_ENCODING_UCS4LE:
        return "UCS-4LE"
    if detected == tree.XML_CHAR_ENCODING_UCS4BE:
        return "UCS-4BE"
    if detected != tree.XML_CHAR_ENCODING_UTF16LE:
        # returns a constant char*, no need to free it
        return tree.xmlGetCharEncodingName(detected)

    # UTF-16LE was detected, but the bytes FF FE 00 00 at the start
    # are taken as a UTF-32LE byte order mark instead
    if size < 4:
        return "UTF-16LE"
    if (buffer[0] == <const_xmlChar>'\xFF' and
            buffer[1] == <const_xmlChar>'\xFE' and
            buffer[2] == 0 and buffer[3] == 0):
        return "UTF-32LE"  # according to BOM
    return "UTF-16LE"
241
# run the detection once at module level to initialise _PY_UNICODE_ENCODING
_setupPythonUnicode()
243
244############################################################
245## support for file-like objects
246############################################################
247
@cython.final
@cython.internal
cdef class _FileReaderContext:
    # Feeds the data of a file(-like) object into libxml2 through its
    # read callback interface.
    #
    # Exceptions that are raised while reading or closing the file are
    # stored in the exception context instead of being propagated.

    # the file(-like) object to read from, None after it was closed
    cdef object _filelike
    # encoding name used for unicode data read from the file, or None
    cdef object _encoding
    # the (encoded) URL, kept alive because _c_url points into it
    cdef object _url
    # the chunk of data that was last read from the file
    cdef object _bytes
    cdef _ExceptionContext _exc_context
    # number of bytes of _bytes that were already copied, -1 at end of file
    cdef Py_ssize_t _bytes_read
    cdef char* _c_url
    cdef bint _close_file_after_read

    def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
        self._exc_context = exc_context
        self._filelike = filelike
        self._close_file_after_read = close_file
        self._encoding = encoding
        if url is None:
            self._c_url = NULL
        else:
            url = _encodeFilename(url)
            self._c_url = _cstr(url)
        self._url = url
        self._bytes = b''
        self._bytes_read = 0

    cdef _close_file(self):
        u"""Close the file if requested at creation time and forget about it.

        A file object without a 'close' attribute is silently accepted.
        Exceptions raised by close() itself propagate to the caller.
        """
        if self._filelike is None or not self._close_file_after_read:
            return
        try:
            close = self._filelike.close
        except AttributeError:
            close = None
        finally:
            # drop the file reference in any case, so we never close twice
            self._filelike = None
        if close is not None:
            close()

    cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
        u"""Create a libxml2 input buffer that reads from the file.

        Returns NULL if libxml2 could not allocate the buffer.
        """
        cdef stdio.FILE* c_stream
        cdef xmlparser.xmlParserInputBuffer* c_buffer
        c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
        if c_buffer is NULL:
            # allocation failed => there is nothing we could configure
            return NULL
        c_stream = python.PyFile_AsFile(self._filelike)
        if c_stream is NULL:
            # no C level file => read through the Python object
            c_buffer.readcallback = _readFilelikeParser
            c_buffer.context = <python.PyObject*>self
        else:
            # C level file => read from it directly
            c_buffer.readcallback = _readFileParser
            c_buffer.context = c_stream
        return c_buffer

    cdef xmlparser.xmlParserInput* _createParserInput(
            self, xmlparser.xmlParserCtxt* ctxt):
        u"""Create a libxml2 parser input for 'ctxt' that reads from the file.

        Returns NULL if the input buffer could not be allocated.
        """
        cdef xmlparser.xmlParserInputBuffer* c_buffer
        c_buffer = self._createParserInputBuffer()
        if c_buffer is NULL:
            return NULL
        return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)

    cdef tree.xmlDtd* _readDtd(self):
        u"""Parse a DTD from the file.

        Returns NULL if the input buffer could not be allocated.
        """
        cdef xmlparser.xmlParserInputBuffer* c_buffer
        c_buffer = self._createParserInputBuffer()
        if c_buffer is NULL:
            return NULL
        with nogil:
            return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)

    cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
        u"""Parse an XML or HTML document from the file, depending on 'ctxt.html'.

        Returns the parsed document, which may be NULL.  The file is
        closed afterwards (if requested at creation time) and an
        exception raised on closing is stored in the exception context.
        """
        cdef xmlDoc* result
        cdef char* c_encoding
        cdef stdio.FILE* c_stream
        cdef xmlparser.xmlInputReadCallback c_read_callback
        cdef xmlparser.xmlInputCloseCallback c_close_callback
        cdef void* c_callback_context

        if self._encoding is None:
            c_encoding = NULL
        else:
            c_encoding = _cstr(self._encoding)

        c_stream = python.PyFile_AsFile(self._filelike)
        if c_stream is NULL:
            # no C level file => read through the Python object
            c_read_callback = _readFilelikeParser
            c_callback_context = <python.PyObject*>self
        else:
            # C level file => read from it directly
            c_read_callback = _readFileParser
            c_callback_context = c_stream

        orig_options = ctxt.options
        with nogil:
            if ctxt.html:
                result = htmlparser.htmlCtxtReadIO(
                        ctxt, c_read_callback, NULL, c_callback_context,
                        self._c_url, c_encoding, options)
                if result is not NULL:
                    # discard the document if its names could not be fixed
                    if _fixHtmlDictNames(ctxt.dict, result) < 0:
                        tree.xmlFreeDoc(result)
                        result = NULL
            else:
                result = xmlparser.xmlCtxtReadIO(
                    ctxt, c_read_callback, NULL, c_callback_context,
                    self._c_url, c_encoding, options)
        ctxt.options = orig_options # work around libxml2 problem
        try:
            self._close_file()
        except:
            self._exc_context._store_raised()
        finally:
            return result  # swallow any exceptions

    cdef int copyToBuffer(self, char* c_buffer, int c_requested):
        u"""Copy up to 'c_requested' bytes of file data into 'c_buffer'.

        Returns the number of bytes copied, 0 after the end of the file
        was reached, or -1 if an exception occurred.  Exceptions are
        stored in the exception context and never propagate.
        """
        cdef int c_byte_count = 0
        cdef char* c_start
        cdef Py_ssize_t byte_count, remaining
        if self._bytes_read < 0:
            # end of file was reached before
            return 0
        try:
            byte_count = python.PyBytes_GET_SIZE(self._bytes)
            remaining  = byte_count - self._bytes_read
            while c_requested > remaining:
                # hand out the rest of the current chunk ...
                c_start = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_start, remaining)
                c_byte_count += remaining
                c_buffer += remaining
                c_requested -= remaining

                # ... and read the next chunk from the file
                self._bytes = self._filelike.read(c_requested)
                if not isinstance(self._bytes, bytes):
                    if isinstance(self._bytes, unicode):
                        # unicode data gets encoded, UTF-8 being the default
                        if self._encoding is None:
                            self._bytes = (<unicode>self._bytes).encode('utf8')
                        else:
                            self._bytes = python.PyUnicode_AsEncodedString(
                                self._bytes, _cstr(self._encoding), NULL)
                    else:
                        self._close_file()
                        raise TypeError, \
                            u"reading from file-like objects must return byte strings or unicode strings"

                remaining = python.PyBytes_GET_SIZE(self._bytes)
                if remaining == 0:
                    # an empty chunk marks the end of the file
                    self._bytes_read = -1
                    self._close_file()
                    return c_byte_count
                self._bytes_read = 0

            if c_requested > 0:
                # the current chunk can serve the rest of the request
                c_start = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_start, c_requested)
                c_byte_count += c_requested
                self._bytes_read += c_requested
        except:
            c_byte_count = -1
            self._exc_context._store_raised()
            try:
                self._close_file()
            except:
                self._exc_context._store_raised()
        finally:
            return c_byte_count  # swallow any exceptions
404
cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
    # Read callback for Python file-like objects: 'ctxt' is the
    # _FileReaderContext that provides the data.
    cdef _FileReaderContext reader = <_FileReaderContext>ctxt
    return reader.copyToBuffer(c_buffer, c_size)
407
cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
    # Read callback for C level files: 'ctxt' is the FILE* to read from.
    cdef stdio.FILE* c_file = <stdio.FILE*>ctxt
    return stdio.fread(c_buffer, 1, c_size, c_file)
410
411############################################################
412## support for custom document loaders
413############################################################
414
cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
                                               xmlparser.xmlParserCtxt* c_context) with gil:
    # External entity loader that asks the resolvers of the responsible
    # resolver context for the document identified by 'c_url'/'c_pubid'.
    #
    # Falls back to __DEFAULT_ENTITY_LOADER (if there is one) when no
    # context is found or when the resolvers do not provide a usable
    # input.  Returns NULL if resolving raised an exception, which is then
    # stored in the context, or if no loader is available.
    cdef _ResolverContext context
    cdef xmlparser.xmlParserInput* c_input
    cdef _InputDocument doc_ref
    cdef _FileReaderContext file_context
    # if there is no _ParserContext associated with the xmlParserCtxt
    # passed, check to see if the thread state object has an implied
    # context.
    if c_context._private is not NULL:
        context = <_ResolverContext>c_context._private
    else:
        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()

    if context is None:
        # no resolvers to ask => leave it to the default loader
        if __DEFAULT_ENTITY_LOADER is NULL:
            return NULL
        with nogil:
            # free the GIL as we might do serious I/O here (e.g. HTTP)
            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
        return c_input

    try:
        if c_url is NULL:
            url = None
        else:
            # parsing a related document (DTD etc.) => UTF-8 encoded URL?
            url = _decodeFilename(<const_xmlChar*>c_url)
        if c_pubid is NULL:
            pubid = None
        else:
            pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8

        doc_ref = context._resolvers.resolve(url, pubid, context)
    except:
        context._store_raised()
        return NULL

    if doc_ref is not None:
        if doc_ref._type == PARSER_DATA_STRING:
            # in-memory data: let the parser input point into the bytes object
            data = doc_ref._data_bytes
            filename = doc_ref._filename
            if not filename:
                filename = None
            elif not isinstance(filename, bytes):
                # most likely a text URL
                filename = filename.encode('utf8')
                if not isinstance(filename, bytes):
                    filename = None

            c_input = xmlparser.xmlNewInputStream(c_context)
            if c_input is not NULL:
                if filename is not None:
                    c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
                c_input.base = _xcstr(data)
                c_input.length = python.PyBytes_GET_SIZE(data)
                c_input.cur = c_input.base
                c_input.end = c_input.base + c_input.length
        elif doc_ref._type == PARSER_DATA_FILENAME:
            # file name: let libxml2 open the file itself
            data = None
            c_filename = _cstr(doc_ref._filename)
            with nogil:
                # free the GIL as we might do serious I/O here
                c_input = xmlparser.xmlNewInputFromFile(
                    c_context, c_filename)
        elif doc_ref._type == PARSER_DATA_FILE:
            # file(-like) object: read it through a _FileReaderContext
            file_context = _FileReaderContext(doc_ref._file, context, url,
                                              None, doc_ref._close_file)
            c_input = file_context._createParserInput(c_context)
            data = file_context
        else:
            data = None
            c_input = NULL

        if data is not None:
            # the context stores the object that backs the parser input
            # (presumably to keep it alive while it is in use - TODO confirm)
            context._storage.add(data)
        if c_input is not NULL:
            return c_input

    # nothing usable was resolved => leave it to the default loader
    if __DEFAULT_ENTITY_LOADER is NULL:
        return NULL

    with nogil:
        # free the GIL as we might do serious I/O here (e.g. HTTP)
        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
    return c_input
501
# The external entity loader that libxml2 reports at module level, used as
# fallback by _local_resolver().
cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
504
505
cdef xmlparser.xmlExternalEntityLoader _register_document_loader() nogil:
    # Install _local_resolver() as libxml2's external entity loader and
    # return the loader that was in place before.
    cdef xmlparser.xmlExternalEntityLoader previous_loader
    previous_loader = xmlparser.xmlGetExternalEntityLoader()
    xmlparser.xmlSetExternalEntityLoader(
        <xmlparser.xmlExternalEntityLoader>_local_resolver)
    return previous_loader
510
cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) nogil:
    # Make 'old' libxml2's external entity loader again.
    xmlparser.xmlSetExternalEntityLoader(old)
513
514
515############################################################
516## Parsers
517############################################################
518
@cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
@cython.internal
cdef class _ParserContext(_ResolverContext):
    # Parser state around a libxml2 parser context: the error log, an
    # optional schema validator, and a lock that serialises parser runs
    # when threading is enabled.

    cdef _ErrorLog _error_log
    # optional schema validator, None if there is none
    cdef _ParserSchemaValidationContext _validator
    # the libxml2 parser context, freed in __dealloc__()
    cdef xmlparser.xmlParserCtxt* _c_ctxt
    # entity loader that prepare() replaced, restored by cleanup()
    cdef xmlparser.xmlExternalEntityLoader _orig_loader
    # NULL if threading support is disabled
    cdef python.PyThread_type_lock _lock
    cdef _Document _doc
    cdef bint _collect_ids

    def __cinit__(self):
        self._c_ctxt = NULL
        self._collect_ids = True
        if not config.ENABLE_THREADING:
            self._lock = NULL
        else:
            self._lock = python.PyThread_allocate_lock()
        self._error_log = _ErrorLog()

    def __dealloc__(self):
        if config.ENABLE_THREADING and self._lock is not NULL:
            python.PyThread_free_lock(self._lock)
            self._lock = NULL
        if self._c_ctxt is not NULL:
            if <void*>self._validator is not NULL and self._validator is not None:
                # If the parser was not closed correctly (e.g. interrupted iterparse()),
                # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
                # validator plug might still be in place, which will make xmlFreeParserCtxt()
                # crash when trying to xmlFree() a static SAX handler.
                # Thus, make sure we disconnect the handler interceptor here at the latest.
                self._validator.disconnect()
            xmlparser.xmlFreeParserCtxt(self._c_ctxt)

    cdef _ParserContext _copy(self):
        u"""Return a new context of the same class with copies of the
        validator (if any) and the resolvers, but without a libxml2
        parser context."""
        cdef _ParserContext context
        context = self.__class__()
        context._collect_ids = self._collect_ids
        # the validator is optional, so only copy it if there is one
        if self._validator is not None:
            context._validator = self._validator.copy()
        _initParserContext(context, self._resolvers._copy(), NULL)
        return context

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
        u"Connect this object and the libxml2 parser context with each other."
        self._c_ctxt = c_ctxt
        c_ctxt._private = <void*>self

    cdef void _resetParserContext(self):
        u"Reset the libxml2 parser context (if there is one) for the next run."
        if self._c_ctxt is not NULL:
            if self._c_ctxt.html:
                htmlparser.htmlCtxtReset(self._c_ctxt)
                self._c_ctxt.disableSAX = 0 # work around bug in libxml2
            else:
                xmlparser.xmlClearParserCtxt(self._c_ctxt)

    cdef int prepare(self, bint set_document_loader=True) except -1:
        u"""Set up the context for a parser run.

        Acquires the parser lock (if any), clears the error log, installs
        the error callback, optionally registers the document loader and
        connects the validator (if any).

        Raises ParserError if the lock could not be acquired.
        """
        cdef int result
        if config.ENABLE_THREADING and self._lock is not NULL:
            with nogil:
                result = python.PyThread_acquire_lock(
                    self._lock, python.WAIT_LOCK)
            if result == 0:
                raise ParserError, u"parser locking failed"
        self._error_log.clear()
        self._doc = None
        self._c_ctxt.sax.serror = _receiveParserError
        self._orig_loader = _register_document_loader() if set_document_loader else NULL
        if self._validator is not None:
            self._validator.connect(self._c_ctxt, self._error_log)
        return 0

    cdef int cleanup(self) except -1:
        u"""Undo prepare() after a parser run.

        Restores the previous document loader, disconnects the validator
        (if any), resets the parser state and finally releases the lock.
        """
        if self._orig_loader is not NULL:
            _reset_document_loader(self._orig_loader)
        try:
            if self._validator is not None:
                self._validator.disconnect()
            self._resetParserContext()
            self.clear()
            self._doc = None
            self._c_ctxt.sax.serror = NULL
        finally:
            # release the lock even if the cleanup above failed
            if config.ENABLE_THREADING and self._lock is not NULL:
                python.PyThread_release_lock(self._lock)
        return 0

    cdef object _handleParseResult(self, _BaseParser parser,
                                   xmlDoc* result, filename):
        u"""Check the parse result and return it as a document object.

        Returns 'self._doc' if it wraps the resulting libxml2 document,
        otherwise a new document object created for it.
        """
        c_doc = self._handleParseResultDoc(parser, result, filename)
        if self._doc is not None and self._doc._c_doc is c_doc:
            return self._doc
        else:
            return _documentFactory(c_doc, parser)

    cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
                                       xmlDoc* result, filename) except NULL:
        u"Check the parse result and return the libxml2 document."
        recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
        # a rejected document is only freed if 'self._doc' is not set
        return _handleParseResult(self, self._c_ctxt, result,
                                  filename, recover,
                                  free_doc=self._doc is None)
618
cdef _initParserContext(_ParserContext context,
                        _ResolverRegistry resolvers,
                        xmlparser.xmlParserCtxt* c_ctxt):
    u"""Initialise the resolver part of 'context' and, unless 'c_ctxt' is
    NULL, connect it with the libxml2 parser context."""
    _initResolverContext(context, resolvers)
    if c_ctxt is NULL:
        return
    context._initParserContext(c_ctxt)
625
cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil:
    # Pass the error on to the error log of the _ParserContext that is
    # stored in the '_private' field of the libxml2 parser context.
    cdef _ParserContext context = <_ParserContext>_parser_context._private
    context._error_log._receive(error)
628
cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil:
    # Error callback of the parser: does nothing unless __DEBUG is set.
    # Errors go to the _ParserContext of the libxml2 parser context if
    # there is one, and to _forwardError() otherwise.
    cdef xmlparser.xmlParserCtxt* c_parser_ctxt = <xmlparser.xmlParserCtxt*>c_context
    if not __DEBUG:
        return
    if c_parser_ctxt is not NULL and c_parser_ctxt._private is not NULL:
        _forwardParserError(c_parser_ctxt, error)
    else:
        _forwardError(NULL, error)
635
cdef _decodeParserErrorMessage(char* c_message):
    u"Decode a libxml2 error message: UTF-8 if possible, ISO8859-1 otherwise."
    try:
        return c_message.decode('utf-8')
    except UnicodeDecodeError:
        # the filename may be in there => play it safe
        return c_message.decode('iso8859-1')


cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
                          _ErrorLog error_log) except -1:
    u"""Raise the exception that describes the last error of the parser.

    Always raises, in this order of preference:

    - IOError if a filename is given and the last error is an I/O error
    - the exception built by 'error_log' if the log is non-empty
    - XMLSyntaxError built from the last error message of the parser
    - XMLSyntaxError without a message as a last resort
    """
    if filename is not None and \
           ctxt.lastError.domain == xmlerror.XML_FROM_IO:
        if isinstance(filename, bytes):
            filename = _decodeFilenameWithLength(
                <bytes>filename, len(<bytes>filename))
        if ctxt.lastError.message is not NULL:
            message = _decodeParserErrorMessage(ctxt.lastError.message)
            message = f"Error reading file '{filename}': {message.strip()}"
        else:
            message = f"Error reading '{filename}'"
        raise IOError, message
    elif error_log:
        raise error_log._buildParseException(
            XMLSyntaxError, u"Document is not well formed")
    elif ctxt.lastError.message is not NULL:
        # decode the C string before using it: formatting the raw byte
        # string would put its repr (b'...') into the message
        message = _decodeParserErrorMessage(ctxt.lastError.message).strip()
        code = ctxt.lastError.code
        line = ctxt.lastError.line
        column = ctxt.lastError.int2
        if ctxt.lastError.line > 0:
            message = f"line {line}: {message}"
        raise XMLSyntaxError(message, code, line, column, filename)
    else:
        raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                             filename)
667
cdef xmlDoc* _handleParseResult(_ParserContext context,
                                xmlparser.xmlParserCtxt* c_ctxt,
                                xmlDoc* result, filename,
                                bint recover, bint free_doc) except NULL:
    # Decide whether the document that the parser produced is acceptable.
    #
    # Returns 'result' if it is accepted.  Otherwise, the document is freed
    # (only if 'free_doc' is set) and an exception is raised: either one that
    # was stored in the context, or the one raised by _raiseParseError().
    cdef bint well_formed
    if result is not NULL:
        __GLOBAL_PARSER_CONTEXT.initDocDict(result)

    if c_ctxt.myDoc is not NULL:
        # a document left in the parser context that is not the result
        # gets freed here
        if c_ctxt.myDoc is not result:
            __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
            tree.xmlFreeDoc(c_ctxt.myDoc)
        c_ctxt.myDoc = NULL

    if result is not NULL:
        # NOTE(review): 'context' is used without a None check here, while
        # other places in this function do check it - confirm with callers
        if (context._validator is not None and
                not context._validator.isvalid()):
            well_formed = 0  # actually not 'valid', but anyway ...
        elif (not c_ctxt.wellFormed and not c_ctxt.html and
                c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
                [1 for error in context._error_log
                 if error.type == ErrorTypes.ERR_INVALID_CHAR]):
            # An encoding error occurred and libxml2 switched from UTF-8
            # input to (undecoded) Latin-1, at some arbitrary point in the
            # document.  Better raise an error than allowing for a broken
            # tree with mixed encodings.
            well_formed = 0
        elif recover or (c_ctxt.wellFormed and
                         c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
            well_formed = 1
        elif not c_ctxt.replaceEntities and not c_ctxt.validate \
                 and context is not None:
            # in this mode, we ignore errors about undefined entities
            for error in context._error_log.filter_from_errors():
                if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                       error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                    well_formed = 0
                    break
            else:
                well_formed = 1
        else:
            well_formed = 0

        if not well_formed:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL

    # an exception that was stored in the context takes precedence over
    # the parse result
    if context is not None and context._has_raised():
        if result is not NULL:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL
        context._raise_if_stored()

    if result is NULL:
        # no usable document => _raiseParseError() raises in all of its branches
        if context is not None:
            _raiseParseError(c_ctxt, filename, context._error_log)
        else:
            _raiseParseError(c_ctxt, filename, None)
    else:
        # fill in the URL and encoding of the document if they are missing
        if result.URL is NULL and filename is not None:
            result.URL = tree.xmlStrdup(_xcstr(filename))
        if result.encoding is NULL:
            result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")

    if context._validator is not None and \
           context._validator._add_default_attributes:
        # we currently need to do this here as libxml2 does not
        # support inserting default attributes during parse-time
        # validation
        context._validator.inject_default_attributes(result)

    return result
742
cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
    # Apply _fixHtmlDictNodeNames() to the element nodes of the document,
    # starting at its first child.  Returns 0 on success (also for a NULL
    # document) and -1 as soon as fixing a node fails.
    cdef xmlNode* c_node
    if c_doc is NULL:
        return 0
    c_node = c_doc.children
    # the two macro calls delimit the body of the traversal loop
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
754
cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
                                  xmlNode* c_start_node) nogil:
    """
    Move names to the dict, iterating in document order, starting at
    c_start_node. This is used in incremental parsing after each chunk.

    Without a document, nothing is done.  Without a start node, the whole
    document is processed by _fixHtmlDictNames().

    Returns 0 on success and -1 if fixing up a node failed.
    """
    cdef xmlNode* c_node
    if not c_doc:
        return 0
    if not c_start_node:
        return _fixHtmlDictNames(c_dict, c_doc)
    c_node = c_start_node
    # NOTE(review): the BEGIN/END calls below are presumably C macros that
    # open and close a tree traversal loop, so the statements between them
    # form the per-node loop body -- confirm against their declaration.
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
773
cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
                                      xmlNode* c_node) nogil:
    # Replace the name of c_node and the names of all of its attributes by
    # the matching entries of c_dict.  A name that is not already the dict
    # entry is released with xmlFree() before the pointer is swapped.
    #
    # Returns 0 on success, -1 if a dict lookup returned NULL.
    #
    # The element itself is handled by the first loop iteration, its
    # attributes (accessed through the xmlNode layout) by the following ones.
    cdef xmlNode* c_current = c_node
    cdef xmlNode* c_upcoming = <xmlNode*>c_node.properties
    while c_current is not NULL:
        c_interned = tree.xmlDictLookup(c_dict, c_current.name, -1)
        if c_interned is NULL:
            return -1
        if c_interned is not c_current.name:
            tree.xmlFree(<char*>c_current.name)
            c_current.name = c_interned
        # step to the next attribute (or finish after the last one)
        c_current = c_upcoming
        if c_upcoming is not NULL:
            c_upcoming = c_upcoming.next
    return 0
793
@cython.internal
cdef class _BaseParser:
    # Common base class of the XML and HTML parsers: holds the parser
    # configuration and lazily creates the parser contexts that do the work.

    # element class lookup scheme, set via set_element_class_lookup()
    cdef ElementClassLookup _class_lookup
    # resolver registry, exposed through the 'resolvers' property
    cdef _ResolverRegistry _resolvers
    # context used by the _parse*() methods, created on first use
    cdef _ParserContext _parser_context
    # separate context used by the feed parser interface, created on first use
    cdef _ParserContext _push_parser_context
    # libxml2 parser option flags passed to the read/push functions
    cdef int _parse_options
    # selects the HTML parser functions instead of the XML ones
    cdef bint _for_html
    # the following three flags switch off the respective SAX callbacks
    cdef bint _remove_comments
    cdef bint _remove_pis
    cdef bint _strip_cdata
    # copied onto each parser context that this parser creates
    cdef bint _collect_ids
    # optional schema from which a SAX validator is created per context
    cdef XMLSchema _schema
    # encoded base URL / file name, set by _setBaseURL()
    cdef bytes _filename
    # parser target object handed to _createContext()
    cdef readonly object target
    # UTF-8 encoded name of the encoding override, or None
    cdef object _default_encoding
    cdef tuple _events_to_collect  # (event_types, tag)
812    def __init__(self, int parse_options, bint for_html, XMLSchema schema,
813                 remove_comments, remove_pis, strip_cdata, collect_ids,
814                 target, encoding):
815        cdef tree.xmlCharEncodingHandler* enchandler
816        cdef int c_encoding
817        if not isinstance(self, (XMLParser, HTMLParser)):
818            raise TypeError, u"This class cannot be instantiated"
819
820        self._parse_options = parse_options
821        self.target = target
822        self._for_html = for_html
823        self._remove_comments = remove_comments
824        self._remove_pis = remove_pis
825        self._strip_cdata = strip_cdata
826        self._collect_ids = collect_ids
827        self._schema = schema
828
829        self._resolvers = _ResolverRegistry()
830
831        if encoding is None:
832            self._default_encoding = None
833        else:
834            encoding = _utf8(encoding)
835            enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
836            if enchandler is NULL:
837                raise LookupError, f"unknown encoding: '{encoding}'"
838            tree.xmlCharEncCloseFunc(enchandler)
839            self._default_encoding = encoding
840
841    cdef _setBaseURL(self, base_url):
842        self._filename = _encodeFilename(base_url)
843
844    cdef _collectEvents(self, event_types, tag):
845        if event_types is None:
846            event_types = ()
847        else:
848            event_types = tuple(set(event_types))
849            _buildParseEventFilter(event_types)  # purely for validation
850        self._events_to_collect = (event_types, tag)
851
852    cdef _ParserContext _getParserContext(self):
853        cdef xmlparser.xmlParserCtxt* pctxt
854        if self._parser_context is None:
855            self._parser_context = self._createContext(self.target, None)
856            self._parser_context._collect_ids = self._collect_ids
857            if self._schema is not None:
858                self._parser_context._validator = \
859                    self._schema._newSaxValidator(
860                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
861            pctxt = self._newParserCtxt()
862            _initParserContext(self._parser_context, self._resolvers, pctxt)
863            self._configureSaxContext(pctxt)
864        return self._parser_context
865
866    cdef _ParserContext _getPushParserContext(self):
867        cdef xmlparser.xmlParserCtxt* pctxt
868        if self._push_parser_context is None:
869            self._push_parser_context = self._createContext(
870                self.target, self._events_to_collect)
871            self._push_parser_context._collect_ids = self._collect_ids
872            if self._schema is not None:
873                self._push_parser_context._validator = \
874                    self._schema._newSaxValidator(
875                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
876            pctxt = self._newPushParserCtxt()
877            _initParserContext(
878                self._push_parser_context, self._resolvers, pctxt)
879            self._configureSaxContext(pctxt)
880        return self._push_parser_context
881
882    cdef _ParserContext _createContext(self, target, events_to_collect):
883        cdef _SaxParserContext sax_context
884        if target is not None:
885            sax_context = _TargetParserContext(self)
886            (<_TargetParserContext>sax_context)._setTarget(target)
887        elif events_to_collect:
888            sax_context = _SaxParserContext(self)
889        else:
890            # nothing special to configure
891            return _ParserContext()
892        if events_to_collect:
893            events, tag = events_to_collect
894            sax_context._setEventFilter(events, tag)
895        return sax_context
896
897    @cython.final
898    cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
899        if self._remove_comments:
900            pctxt.sax.comment = NULL
901        if self._remove_pis:
902            pctxt.sax.processingInstruction = NULL
903        if self._strip_cdata:
904            # hard switch-off for CDATA nodes => makes them plain text
905            pctxt.sax.cdataBlock = NULL
906
    cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
        # Mark the SAX handler of c_ctxt as a SAX2 handler and install
        # _receiveParserError as its structured error callback.
        #
        # Nothing is changed if the context has no SAX handler, if the
        # handler is not initialised, or if it already carries the SAX2
        # magic.  Returns 0; raises MemoryError if the handler copy cannot
        # be allocated.
        cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
        if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
            # need to extend SAX1 context to SAX2 to get proper error reports
            if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
                # The context points at htmlDefaultSAXHandler itself: give it
                # a private copy so that the changes below stay local to
                # this context.
                sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
                if sax is NULL:
                    raise MemoryError()
                # NOTE(review): this copies sizeof(htmlDefaultSAXHandler)
                # bytes into a sizeof(xmlSAXHandler) allocation.  Fields not
                # covered by the copy are presumably exactly the ones that
                # get assigned below -- confirm against the struct layout.
                cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
                                 sizeof(htmlparser.htmlDefaultSAXHandler))
                c_ctxt.sax = sax
            sax.initialized = xmlparser.XML_SAX2_MAGIC
            sax.serror = _receiveParserError
            sax.startElementNs = NULL
            sax.endElementNs = NULL
            sax._private = NULL
        return 0
924
925    cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
926        cdef xmlparser.xmlParserCtxt* c_ctxt
927        if self._for_html:
928            c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
929            if c_ctxt is not NULL:
930                self._registerHtmlErrorHandler(c_ctxt)
931        else:
932            c_ctxt = xmlparser.xmlNewParserCtxt()
933        if c_ctxt is NULL:
934            raise MemoryError
935        c_ctxt.sax.startDocument = _initSaxDocument
936        return c_ctxt
937
938    cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
939        cdef xmlparser.xmlParserCtxt* c_ctxt
940        cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
941        if self._for_html:
942            c_ctxt = htmlparser.htmlCreatePushParserCtxt(
943                NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
944            if c_ctxt is not NULL:
945                self._registerHtmlErrorHandler(c_ctxt)
946                htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
947        else:
948            c_ctxt = xmlparser.xmlCreatePushParserCtxt(
949                NULL, NULL, NULL, 0, c_filename)
950            if c_ctxt is not NULL:
951                xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
952        if c_ctxt is NULL:
953            raise MemoryError()
954        c_ctxt.sax.startDocument = _initSaxDocument
955        return c_ctxt
956
957    @property
958    def error_log(self):
959        """The error log of the last parser run.
960        """
961        cdef _ParserContext context
962        context = self._getParserContext()
963        return context._error_log.copy()
964
965    @property
966    def resolvers(self):
967        """The custom resolver registry of this parser."""
968        return self._resolvers
969
970    @property
971    def version(self):
972        """The version of the underlying XML parser."""
973        return u"libxml2 %d.%d.%d" % LIBXML_VERSION
974
975    def setElementClassLookup(self, ElementClassLookup lookup = None):
976        u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead."
977        self.set_element_class_lookup(lookup)
978
979    def set_element_class_lookup(self, ElementClassLookup lookup = None):
980        u"""set_element_class_lookup(self, lookup = None)
981
982        Set a lookup scheme for element classes generated from this parser.
983
984        Reset it by passing None or nothing.
985        """
986        self._class_lookup = lookup
987
988    cdef _BaseParser _copy(self):
989        u"Create a new parser with the same configuration."
990        cdef _BaseParser parser
991        parser = self.__class__()
992        parser._parse_options = self._parse_options
993        parser._for_html = self._for_html
994        parser._remove_comments = self._remove_comments
995        parser._remove_pis = self._remove_pis
996        parser._strip_cdata = self._strip_cdata
997        parser._filename = self._filename
998        parser._resolvers = self._resolvers
999        parser.target = self.target
1000        parser._class_lookup  = self._class_lookup
1001        parser._default_encoding = self._default_encoding
1002        parser._schema = self._schema
1003        parser._events_to_collect = self._events_to_collect
1004        return parser
1005
1006    def copy(self):
1007        u"""copy(self)
1008
1009        Create a new parser with the same configuration.
1010        """
1011        return self._copy()
1012
1013    def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
1014        u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
1015
1016        Creates a new element associated with this parser.
1017        """
1018        return _makeElement(_tag, NULL, None, self, None, None,
1019                            attrib, nsmap, _extra)
1020
1021    # internal parser methods
1022
    cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
        u"""Parse unicode document, share dictionary if possible.

        The character buffer of ``utext`` is passed to libxml2 as is,
        together with the name of an encoding that matches the buffer layout.
        ``c_filename`` is passed on to libxml2 as the document file name.

        Returns the parsed document as handed back by the context's
        ``_handleParseResultDoc()``.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef int buffer_len, c_kind
        cdef const_char* c_text
        cdef const_char* c_encoding = _PY_UNICODE_ENCODING
        cdef bint is_pep393_string = (
            python.PEP393_ENABLED and python.PyUnicode_IS_READY(utext))
        if is_pep393_string:
            # PEP 393 string: the encoding depends on the storage width
            # ("kind") of the characters, and the buffer length in bytes is
            # the character count times that width
            c_text = <const_char*>python.PyUnicode_DATA(utext)
            py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
            c_kind = python.PyUnicode_KIND(utext)
            if c_kind == 1:
                c_encoding = 'ISO-8859-1'
            elif c_kind == 2:
                py_buffer_len *= 2
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UTF-16BE'  # actually UCS-2
                else:
                    c_encoding = 'UTF-16LE'  # actually UCS-2
            elif c_kind == 4:
                py_buffer_len *= 4
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UCS-4BE'
                else:
                    c_encoding = 'UCS-4LE'
            else:
                assert False, f"Illegal Unicode kind {c_kind}"
        else:
            # otherwise: use the data buffer with the default encoding
            # (_PY_UNICODE_ENCODING) that c_encoding was initialised with
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
            c_text = python.PyUnicode_AS_DATA(utext)
        # the libxml2 read functions below take the length as a C int
        assert 0 <= py_buffer_len <= limits.INT_MAX
        buffer_len = py_buffer_len

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
            # remember the context options to restore them after the parser run
            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
                    if result is not NULL:
                        # HTML documents need their names moved into the
                        # dict; discard the document if that fails
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
            pctxt.options = orig_options # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            # always paired with the prepare() call above
            context.cleanup()
1085
    cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
                           char* c_filename) except NULL:
        u"""Parse document, share dictionary if possible.

        Parses the ``c_len`` bytes at ``c_text``.  If the parser has an
        encoding override, that encoding is used.  Otherwise, UTF-32 input
        is detected here (by BOM or by ``xmlDetectCharEncoding()``) and any
        other encoding detection is left to libxml2.

        Returns the parsed document as handed back by the context's
        ``_handleParseResultDoc()``.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_encoding
        cdef tree.xmlCharEncoding enc
        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            if self._default_encoding is None:
                # NULL leaves the choice of the encoding to libxml2
                c_encoding = NULL
                # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
                # NOTE: limit to problematic cases because it changes character offsets
                if c_len >= 4 and (c_text[0] == '\xFF' and c_text[1] == '\xFE' and
                                   c_text[2] == 0 and c_text[3] == 0):
                    # little endian BOM: name the encoding and skip the BOM
                    c_encoding = "UTF-32LE"
                    c_text += 4
                    c_len -= 4
                elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
                                     c_text[2] == '\xFE' and c_text[3] == '\xFF'):
                    # big endian BOM: name the encoding and skip the BOM
                    c_encoding = "UTF-32BE"
                    c_text += 4
                    c_len -= 4
                else:
                    # no BOM => try to determine encoding
                    enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
                    if enc == tree.XML_CHAR_ENCODING_UCS4LE:
                        c_encoding = 'UTF-32LE'
                    elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
                        c_encoding = 'UTF-32BE'
            else:
                c_encoding = _cstr(self._default_encoding)

            # remember the context options to restore them after the parser run
            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
                    if result is not NULL:
                        # HTML documents need their names moved into the
                        # dict; discard the document if that fails
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
            pctxt.options = orig_options # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            # always paired with the prepare() call above
            context.cleanup()
1144
1145    cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
1146        cdef _ParserContext context
1147        cdef xmlDoc* result
1148        cdef xmlparser.xmlParserCtxt* pctxt
1149        cdef char* c_encoding
1150        result = NULL
1151
1152        context = self._getParserContext()
1153        context.prepare()
1154        try:
1155            pctxt = context._c_ctxt
1156            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1157
1158            if self._default_encoding is None:
1159                c_encoding = NULL
1160            else:
1161                c_encoding = _cstr(self._default_encoding)
1162
1163            orig_options = pctxt.options
1164            with nogil:
1165                if self._for_html:
1166                    result = htmlparser.htmlCtxtReadFile(
1167                        pctxt, c_filename, c_encoding, self._parse_options)
1168                    if result is not NULL:
1169                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
1170                            tree.xmlFreeDoc(result)
1171                            result = NULL
1172                else:
1173                    result = xmlparser.xmlCtxtReadFile(
1174                        pctxt, c_filename, c_encoding, self._parse_options)
1175            pctxt.options = orig_options # work around libxml2 problem
1176
1177            return context._handleParseResultDoc(self, result, c_filename)
1178        finally:
1179            context.cleanup()
1180
1181    cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
1182                                       encoding) except NULL:
1183        cdef _ParserContext context
1184        cdef _FileReaderContext file_context
1185        cdef xmlDoc* result
1186        cdef xmlparser.xmlParserCtxt* pctxt
1187        cdef char* c_filename
1188        if not filename:
1189            filename = None
1190
1191        context = self._getParserContext()
1192        context.prepare()
1193        try:
1194            pctxt = context._c_ctxt
1195            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1196            file_context = _FileReaderContext(
1197                filelike, context, filename,
1198                encoding or self._default_encoding)
1199            result = file_context._readDoc(pctxt, self._parse_options)
1200
1201            return context._handleParseResultDoc(
1202                self, result, filename)
1203        finally:
1204            context.cleanup()
1205
1206
cdef void _initSaxDocument(void* ctxt) with gil:
    # SAX 'startDocument' callback, installed by the parser context factory
    # methods of _BaseParser.  Runs the libxml2 default handler first and
    # then sets up the name dictionary and the XML ID table of the document.
    xmlparser.xmlSAX2StartDocument(ctxt)
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    c_doc = c_ctxt.myDoc

    # set up document dict
    if c_doc and c_ctxt.dict and not c_doc.dict:
        # I have no idea why libxml2 disables this - we need it
        c_ctxt.dictNames = 1
        c_doc.dict = c_ctxt.dict
        # the document now holds its own reference to the dict
        xmlparser.xmlDictReference(c_ctxt.dict)

    # set up XML ID hash table
    if c_ctxt._private:
        context = <_ParserContext>c_ctxt._private
        if context._collect_ids:
            # keep the global parser dict from filling up with XML IDs
            if c_doc and not c_doc.ids:
                # memory errors are not fatal here
                c_dict = xmlparser.xmlDictCreate()
                if c_dict:
                    c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
                    # NOTE(review): presumably the hash table keeps its own
                    # reference, so this only drops ours -- confirm
                    xmlparser.xmlDictFree(c_dict)
                else:
                    # no separate dict available: use a plain hash table
                    c_doc.ids = tree.xmlHashCreate(0)
        else:
            # not collecting IDs: tell the parser to skip them
            c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
            if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
                # already initialised but empty => clear
                tree.xmlHashFree(c_doc.ids, NULL)
                c_doc.ids = NULL
1238
1239
1240############################################################
1241## ET feed parser
1242############################################################
1243
cdef class _FeedParser(_BaseParser):
    # Adds the incremental feed()/close() parser interface to _BaseParser.

    # set while a feed parser run is in progress, i.e. from the first
    # feed() call until close() or until feed() ends the run on an error
    cdef bint _feed_parser_running
1246
1247    @property
1248    def feed_error_log(self):
1249        """The error log of the last (or current) run of the feed parser.
1250
1251        Note that this is local to the feed parser and thus is
1252        different from what the ``error_log`` property returns.
1253        """
1254        return self._getPushParserContext()._error_log.copy()
1255
    cpdef feed(self, data):
        u"""feed(self, data)

        Feeds data to the parser.  The argument should be an 8-bit string
        buffer containing encoded data, although Unicode is supported as long
        as both string types are not mixed.

        This is the main entry point to the consumer interface of a
        parser.  The parser will parse as much of the XML stream as it
        can on each call.  To finish parsing or to reset the parser,
        call the ``close()`` method.  Both methods may raise
        ParseError if errors occur in the input data.  If an error is
        raised, there is no longer a need to call ``close()``.

        The feed parser interface is independent of the normal parser
        usage.  You can use the same parser as a feed parser and in
        the ``parse()`` function concurrently.

        Raises TypeError if ``data`` is neither a byte string nor a
        Unicode string.
        """
        cdef _ParserContext context
        cdef bytes bstring
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len, ustart
        cdef const_char* char_data
        cdef const_char* c_encoding
        cdef int buffer_len
        cdef int error
        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER

        if isinstance(data, bytes):
            # byte strings are parsed directly from their buffer
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = self._default_encoding
            char_data = _cstr(data)
            py_buffer_len = python.PyBytes_GET_SIZE(data)
            ustart = 0
        elif isinstance(data, unicode):
            # Unicode strings are encoded chunk-wise to UTF-8 in the loop
            # below; char_data == NULL marks this mode and py_buffer_len
            # counts characters instead of bytes
            c_encoding = b"UTF-8"
            char_data = NULL
            py_buffer_len = len(<unicode> data)
            ustart = 0
        else:
            raise TypeError, u"Parsing requires string data"

        context = self._getPushParserContext()
        pctxt = context._c_ctxt
        error = 0
        if not self._feed_parser_running:
            # first call of a new run: reset the push parser
            # (the document loader is registered explicitly around the
            # libxml2 calls instead of by prepare())
            context.prepare(set_document_loader=False)
            self._feed_parser_running = 1
            c_filename = (_cstr(self._filename)
                          if self._filename is not None else NULL)

            # We have to give *mlCtxtResetPush() enough input to figure
            # out the character encoding (at least four bytes),
            # however if we give it all we got, we'll have nothing for
            # *mlParseChunk() and things go wrong.
            buffer_len = 0
            if char_data is not NULL:
                buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
            orig_loader = _register_document_loader()
            if self._for_html:
                error = _htmlCtxtResetPush(
                    pctxt, char_data, buffer_len, c_filename, c_encoding,
                    self._parse_options)
            else:
                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
                error = xmlparser.xmlCtxtResetPush(
                    pctxt, char_data, buffer_len, c_filename, c_encoding)
            _reset_document_loader(orig_loader)
            # skip the bytes that were already passed into the reset call
            py_buffer_len -= buffer_len
            char_data += buffer_len
            if error:
                raise MemoryError()
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding

        fixup_error = 0
        # feed the remaining data; in recovery mode, continue after errors
        while py_buffer_len > 0 and (error == 0 or recover):
            if char_data is NULL:
                # Unicode parsing by converting chunks to UTF-8
                buffer_len = 2**19  # len(bytes) <= 4 * (2**19) == 2 MiB
                bstring = (<unicode> data)[ustart : ustart+buffer_len].encode('UTF-8')
                ustart += buffer_len
                py_buffer_len -= buffer_len  # may end up < 0
                error, fixup_error = _parse_data_chunk(pctxt, <const char*> bstring, <int> len(bstring))
            else:
                # Direct byte string parsing.
                buffer_len = <int>py_buffer_len if py_buffer_len <= limits.INT_MAX else limits.INT_MAX
                error, fixup_error = _parse_data_chunk(pctxt, char_data, buffer_len)
                py_buffer_len -= buffer_len
                char_data += buffer_len

            if fixup_error:
                # the HTML name fix-up failed while parsing the chunk
                context.store_exception(MemoryError())

            if context._has_raised():
                # propagate Python exceptions immediately
                recover = 0
                error = 1
                break

            if error and not pctxt.replaceEntities and not pctxt.validate:
                # in this mode, we ignore errors about undefined entities
                for entry in context._error_log.filter_from_errors():
                    if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                           entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                        break
                else:
                    # only undeclared entity errors were logged
                    error = 0

        if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
            # propagate Python exceptions immediately
            recover = 0
            error = 1

        if fixup_error or not recover and (error or not pctxt.wellFormed):
            # end the parser run and let the context handle the parse
            # result (which includes raising stored exceptions)
            self._feed_parser_running = 0
            try:
                context._handleParseResult(self, pctxt.myDoc, None)
            finally:
                context.cleanup()
1379
1380    cpdef close(self):
1381        u"""close(self)
1382
1383        Terminates feeding data to this parser.  This tells the parser to
1384        process any remaining data in the feed buffer, and then returns the
1385        root Element of the tree that was parsed.
1386
1387        This method must be called after passing the last chunk of data into
1388        the ``feed()`` method.  It should only be called when using the feed
1389        parser interface, all other usage is undefined.
1390        """
1391        if not self._feed_parser_running:
1392            raise XMLSyntaxError(u"no element found",
1393                                 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
1394                                 self._filename)
1395
1396        context = self._getPushParserContext()
1397        pctxt = context._c_ctxt
1398
1399        self._feed_parser_running = 0
1400        if self._for_html:
1401            htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
1402        else:
1403            xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
1404
1405        if (pctxt.recovery and not pctxt.disableSAX and
1406                isinstance(context, _SaxParserContext)):
1407            # apply any left-over 'end' events
1408            (<_SaxParserContext>context).flushEvents()
1409
1410        try:
1411            result = context._handleParseResult(self, pctxt.myDoc, None)
1412        finally:
1413            context.cleanup()
1414
1415        if isinstance(result, _Document):
1416            return (<_Document>result).getroot()
1417        else:
1418            return result
1419
1420
cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt,
                                  const char* char_data, int buffer_len):
    # Pass one chunk of data into the push parser of c_ctxt, using the HTML
    # or the XML parser depending on the 'html' flag of the context.
    #
    # Returns the tuple (error, fixup_error): the return value of the
    # libxml2 parse call, and the result of the HTML name fix-up (always 0
    # for XML).
    fixup_error = 0
    with nogil:
        if c_ctxt.html:
            c_node = c_ctxt.node  # last node where the parser stopped
            # the document loader is only registered while libxml2 parses
            orig_loader = _register_document_loader()
            error = htmlparser.htmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
            # and now for the fun part: move node names to the dict
            if c_ctxt.myDoc:
                fixup_error = _fixHtmlDictSubtreeNames(
                    c_ctxt.dict, c_ctxt.myDoc, c_node)
                if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict:
                    # make the document use the dict of the parser context:
                    # release its own one and take a reference to the other
                    xmlparser.xmlDictFree(c_ctxt.myDoc.dict)
                    c_ctxt.myDoc.dict = c_ctxt.dict
                    xmlparser.xmlDictReference(c_ctxt.dict)
        else:
            # the document loader is only registered while libxml2 parses
            orig_loader = _register_document_loader()
            error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
    return (error, fixup_error)
1443
1444
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                             const_char* c_data, int buffer_len,
                             const_char* c_filename, const_char* c_encoding,
                             int parse_options) except -1:
    # Reset a push parser context for parsing HTML.
    #
    # libxml2 lacks an HTML push parser setup function, so the context is
    # reset with the XML function and then switched back to HTML mode.
    # Returns 0 on success, otherwise the error value of the reset call.
    reset_status = xmlparser.xmlCtxtResetPush(
        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
    if reset_status:
        return reset_status

    # fix libxml2 setup for HTML
    c_ctxt.progressive = 1
    c_ctxt.html = 1
    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)
    return 0
1462
1463
1464############################################################
1465## XML parser
1466############################################################
1467
# libxml2 parser option flags that are used as the default for XML parsing
cdef int _XML_DEFAULT_PARSE_OPTIONS
_XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_NOENT
    | xmlparser.XML_PARSE_NOCDATA
    | xmlparser.XML_PARSE_NONET
    | xmlparser.XML_PARSE_COMPACT
    | xmlparser.XML_PARSE_BIG_LINES
)
1476
cdef class XMLParser(_FeedParser):
    u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)

    The XML parser.

    A parser can be passed as an additional argument to the various parse
    functions of the lxml API.  There is always a default parser, which
    can be replaced by calling the global function 'set_default_parser'.
    Creating new parsers is possible at any time and does not involve a
    major run-time overhead.

    Most keyword arguments of the constructor map to the libxml2 parser
    configuration.  Requesting DTD validation or attribute default values
    also makes the parser load the DTD (unless, for attribute defaults,
    an XMLSchema is provided from which the default attributes can be
    read).

    Available boolean keyword arguments:

    - attribute_defaults - inject default attributes from DTD or XMLSchema
    - dtd_validation     - validate against a DTD referenced by the document
    - load_dtd           - use DTD for parsing
    - no_network         - prevent network access for related files (default: True)
    - ns_clean           - clean up redundant namespace declarations
    - recover            - try hard to parse through broken XML
    - remove_blank_text  - discard blank text nodes that appear ignorable
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
    - resolve_entities   - replace entities by their text value (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads.  While this is
    not harmful, it is more efficient to use separate parsers.  This does not
    apply to the default parser.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, XMLSchema schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        cdef int parse_options = _XML_DEFAULT_PARSE_OPTIONS

        # Options that are not part of the defaults: switch on by request.
        if load_dtd:
            parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if dtd_validation:
            parse_options |= (xmlparser.XML_PARSE_DTDVALID |
                              xmlparser.XML_PARSE_DTDLOAD)
        if attribute_defaults:
            parse_options |= xmlparser.XML_PARSE_DTDATTR
            if schema is None:
                # without a schema, the defaults have to come from the DTD
                parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if ns_clean:
            parse_options |= xmlparser.XML_PARSE_NSCLEAN
        if recover:
            parse_options |= xmlparser.XML_PARSE_RECOVER
        if remove_blank_text:
            parse_options |= xmlparser.XML_PARSE_NOBLANKS
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE

        # Options that are set in _XML_DEFAULT_PARSE_OPTIONS: toggling the
        # bit switches them off again.
        if not no_network:
            parse_options ^= xmlparser.XML_PARSE_NONET
        if not compact:
            parse_options ^= xmlparser.XML_PARSE_COMPACT
        if not resolve_entities:
            parse_options ^= xmlparser.XML_PARSE_NOENT
        if not strip_cdata:
            parse_options ^= xmlparser.XML_PARSE_NOCDATA

        # second argument: 0 selects the XML (non-HTML) parser setup
        _BaseParser.__init__(
            self, parse_options, 0, schema,
            remove_comments, remove_pis, strip_cdata,
            collect_ids, target, encoding)
1559
1560
cdef class XMLPullParser(XMLParser):
    """XMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    XML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.

    All other keyword arguments are passed on to the XMLParser
    constructor.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        XMLParser.__init__(self, **kwargs)
        if events is None:
            # default: only report Element end events
            events = ('end',)
        self._setBaseURL(base_url)
        self._collectEvents(events, tag)

    def read_events(self):
        """read_events(self)

        Return the events iterator of this parser's push parser context.
        """
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
1588
1589
cdef class ETCompatXMLParser(XMLParser):
    u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 target=None, compact=True)

    An XML parser with an ElementTree compatible default setup.

    All options have the same meaning as in the XMLParser class, see there
    for details.

    The difference is that ``remove_comments`` and ``remove_pis`` default
    to True here, so comments and processing instructions are ignored
    unless requested otherwise.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 target=None, compact=True):
        # Plain delegation: every option is handed to XMLParser unchanged,
        # listed in the order of the signature above.
        XMLParser.__init__(
            self,
            encoding=encoding,
            attribute_defaults=attribute_defaults,
            dtd_validation=dtd_validation,
            load_dtd=load_dtd,
            no_network=no_network,
            ns_clean=ns_clean,
            recover=recover,
            schema=schema,
            huge_tree=huge_tree,
            remove_blank_text=remove_blank_text,
            resolve_entities=resolve_entities,
            remove_comments=remove_comments,
            remove_pis=remove_pis,
            strip_cdata=strip_cdata,
            target=target,
            compact=compact)
1628
# ET 1.2 compatible name
XMLTreeBuilder = ETCompatXMLParser


# Module level XMLParser instance that is created with all-default options.
# It is installed as the default parser of the global parser context right
# here, and set_default_parser() falls back to it when it is called without
# a parser.
cdef XMLParser __DEFAULT_XML_PARSER
__DEFAULT_XML_PARSER = XMLParser()

__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
1637
def set_default_parser(_BaseParser parser=None):
    u"""set_default_parser(parser=None)

    Set a default parser for the current thread.  The lxml API uses this
    parser globally in all parse functions that are called without an
    explicit parser.  Calling this function without a parser (or with
    None) resets the default parser to the original configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid the
    default parser in multi-threaded environments.  You can create a separate
    parser for each thread explicitly or use a parser pool.
    """
    cdef _BaseParser new_default = parser
    if new_default is None:
        # fall back to the parser that was installed at module setup time
        new_default = __DEFAULT_XML_PARSER
    __GLOBAL_PARSER_CONTEXT.setDefaultParser(new_default)
1653
def get_default_parser():
    u"""get_default_parser()

    Return the parser that the global parser context currently provides
    as the default parser (see set_default_parser()).
    """
    default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return default_parser
1657
1658############################################################
1659## HTML parser
1660############################################################
1661
# Bit mask of libxml2 HTML parser options that HTMLParser.__init__() starts
# from.  The constructor switches each of these off again when the matching
# keyword argument (recover, no_network, compact) is false.
cdef int _HTML_DEFAULT_PARSE_OPTIONS
_HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_RECOVER |
    htmlparser.HTML_PARSE_NONET   |
    htmlparser.HTML_PARSE_COMPACT
    )
1668
cdef class HTMLParser(_FeedParser):
    u"""HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, strip_cdata=True, \
                   no_network=True, target=None, schema: XMLSchema =None, \
                   recover=True, compact=True, default_doctype=True, \
                   collect_ids=True, huge_tree=False)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 no_network=True, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        cdef int parse_options = _HTML_DEFAULT_PARSE_OPTIONS

        # Options that are not part of the defaults: switch on by request.
        if remove_blank_text:
            parse_options |= htmlparser.HTML_PARSE_NOBLANKS
        if not default_doctype:
            # NODEFDTD is not in _HTML_DEFAULT_PARSE_OPTIONS, so this sets
            # the flag; OR states that intent directly.
            parse_options |= htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            # the XML option constant is passed here, there is no use of an
            # HTML specific constant for this option in this module section
            parse_options |= xmlparser.XML_PARSE_HUGE

        # Options that are set in _HTML_DEFAULT_PARSE_OPTIONS: toggling the
        # bit switches them off again.
        if not recover:
            parse_options ^= htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options ^= htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options ^= htmlparser.HTML_PARSE_COMPACT

        # second argument: 1 selects the HTML parser setup
        _BaseParser.__init__(self, parse_options, 1, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
1728
1729
# Module level HTMLParser instance that is created with all-default options.
cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()
1732
1733
cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that collects parse events in an iterator.

    The events are the same ones that iterparse() collects.  Other than
    iterparse(), however, this parser is non-blocking: it does not read
    from a file(-like) object by itself but is fed with data chunks
    incrementally through its .feed() method.

    Element end events are collected by default.  Pass any subset of
    the available events as ``events`` argument to change that:
    ``'start'``, ``'end'``, ``'start-ns'``, ``'end-ns'``,
    ``'comment'``, ``'pi'``.

    The ``base_url`` can be passed to support loading external
    dependencies relative to the input source.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # fall back to reporting only Element end events
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        context = <_SaxParserContext?>self._getPushParserContext()
        return context.events_iterator
1761
1762
1763############################################################
1764## helper functions for document creation
1765############################################################
1766
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    # Parse an in-memory bytes or unicode string into a new libxml2 document.
    #
    # Uses the default parser of the global parser context if 'parser' is
    # None.  Input that is larger than INT_MAX bytes is parsed through a
    # file-like wrapper instead of being passed on as one memory buffer.
    cdef char* c_filename
    cdef char* c_text
    cdef Py_ssize_t c_len
    cdef bint is_pep393_string
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()

    if filename:
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)
    else:
        c_filename = NULL

    if not isinstance(text, unicode):
        # byte string input
        c_len = python.PyBytes_GET_SIZE(text)
        if c_len > limits.INT_MAX:
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, None)
        c_text = _cstr(text)
        return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)

    # unicode string input: determine the size of the internal buffer
    is_pep393_string = (
        python.PEP393_ENABLED and python.PyUnicode_IS_READY(text))
    if not is_pep393_string:
        c_len = python.PyUnicode_GET_DATA_SIZE(text)
    else:
        c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)

    if c_len > limits.INT_MAX:
        return (<_BaseParser>parser)._parseDocFromFilelike(
            StringIO(text), filename, None)
    if _PY_UNICODE_ENCODING is NULL and not is_pep393_string:
        # no known encoding for the internal buffer: go through UTF-8 bytes
        text = (<unicode>text).encode('utf8')
        return (<_BaseParser>parser)._parseDocFromFilelike(
            BytesIO(text), filename, "UTF-8")
    return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
1801
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    # Parse the file named by 'filename8', using the default parser of the
    # global parser context if no parser is given.
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFile(_cstr(filename8))
1806
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    # Parse from the file-like object 'source', using the default parser of
    # the global parser context if no parser is given.  No encoding override
    # is passed on (last argument is None).
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFilelike(source, filename, None)
1812
cdef xmlDoc* _newXMLDoc() except NULL:
    # Create an empty XML document that uses the dictionary of the global
    # parser context.  Raises MemoryError if libxml2 cannot allocate it.
    cdef xmlDoc* c_doc = tree.xmlNewDoc(NULL)
    if c_doc is NULL:
        raise MemoryError()
    if c_doc.encoding is NULL:
        # default to UTF-8 if the new document has no encoding set
        c_doc.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1822
cdef xmlDoc* _newHTMLDoc() except NULL:
    # Create an empty HTML document that uses the dictionary of the global
    # parser context.  Raises MemoryError if libxml2 cannot allocate it.
    cdef xmlDoc* c_doc = tree.htmlNewDoc(NULL, NULL)
    if c_doc is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1830
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    # Copy a document, including its content if 'recursive' is non-zero.
    # The GIL is released only while a recursive copy is running.
    # Raises MemoryError if libxml2 fails to create the copy.
    cdef xmlDoc* c_copy
    if not recursive:
        c_copy = tree.xmlCopyDoc(c_doc, 0)
    else:
        with nogil:
            c_copy = tree.xmlCopyDoc(c_doc, recursive)
    if c_copy is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_copy)
    return c_copy
1842
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    u"""Recursively copy the document and make c_new_root the new root node.

    Only the document node itself is copied from c_doc; the content of the
    new document is a recursive copy of c_new_root.

    Raises MemoryError if libxml2 fails to copy the document or the node.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0) # non recursive
    if result is NULL:
        # same check as in _copyDoc(): never touch a failed copy
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
    if c_node is NULL:
        # nobody owns the new document yet, so free it before bailing out
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result
1856
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    u"""Recursively copy the element into the document. c_doc is not modified.

    Raises MemoryError if libxml2 fails to copy the node.
    """
    cdef xmlNode* c_copy = tree.xmlDocCopyNode(c_node, c_doc, 1) # recursive
    if c_copy is NULL:
        raise MemoryError()
    _copyTail(c_node.next, c_copy)
    return c_copy
1865
1866
1867############################################################
1868## API level helper functions for _Document creation
1869############################################################
1870
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    # Parse 'source' into a _Document.  'source' is either a string that
    # names the file to parse, or an object that provides the content through
    # getvalue()/tell() or read().  Raises TypeError for anything else.
    cdef _Document doc
    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        if base_url is None:
            return doc
        # replace the document URL by the requested base URL
        base_url = _encodeFilenameUTF8(base_url)
        if doc._c_doc.URL is not NULL:
            tree.xmlFree(<char*>doc._c_doc.URL)
        doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
        return doc

    if base_url is None:
        url = _getFilenameForFile(source)
    else:
        url = base_url

    # StringIO like object that has not been read from yet:
    # parse its complete value from memory
    if (hasattr(source, u'getvalue') and hasattr(source, u'tell')
            and source.tell() == 0):
        return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, u'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError(
        f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'")
1899
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    # Parse the file at 'url' and wrap the result in a _Document.
    cdef xmlDoc* c_parsed = _parseDocFromFile(url, parser)
    return _documentFactory(c_parsed, parser)
1903
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    # Parse a bytes or unicode string and wrap the result in a _Document.
    # Raises ValueError for other input types and for unicode strings that
    # carry an encoding declaration.
    cdef xmlDoc* c_parsed
    if not isinstance(text, unicode):
        if not isinstance(text, bytes):
            raise ValueError(u"can only parse strings")
    elif _hasEncodingDeclaration(text):
        raise ValueError(
            u"Unicode strings with encoding declaration are not supported. "
            u"Please use bytes input or XML fragments without declaration.")
    c_parsed = _parseDoc(text, url, parser)
    return _documentFactory(c_parsed, parser)
1914
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    # Parse from the file-like object 'source' and wrap the result in a
    # _Document.
    cdef xmlDoc* c_parsed = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_parsed, parser)
1918