1# Parsers for XML and HTML 2 3from lxml.includes cimport xmlparser 4from lxml.includes cimport htmlparser 5 6 7class ParseError(LxmlSyntaxError): 8 """Syntax error while parsing an XML document. 9 10 For compatibility with ElementTree 1.3 and later. 11 """ 12 def __init__(self, message, code, line, column, filename=None): 13 super(_ParseError, self).__init__(message) 14 self.lineno, self.offset = (line, column - 1) 15 self.code = code 16 self.filename = filename 17 18 @property 19 def position(self): 20 return self.lineno, self.offset + 1 21 22 @position.setter 23 def position(self, new_pos): 24 self.lineno, column = new_pos 25 self.offset = column - 1 26 27cdef object _ParseError = ParseError 28 29 30class XMLSyntaxError(ParseError): 31 """Syntax error while parsing an XML document. 32 """ 33 34cdef class ParserError(LxmlError): 35 """Internal lxml parser error. 36 """ 37 38 39@cython.final 40@cython.internal 41cdef class _ParserDictionaryContext: 42 # Global parser context to share the string dictionary. 43 # 44 # This class is a delegate singleton! 45 # 46 # It creates _ParserDictionaryContext objects for each thread to keep thread state, 47 # but those must never be used directly. Always stick to using the static 48 # __GLOBAL_PARSER_CONTEXT as defined below the class. 49 # 50 51 cdef tree.xmlDict* _c_dict 52 cdef _BaseParser _default_parser 53 cdef list _implied_parser_contexts 54 55 def __cinit__(self): 56 self._c_dict = NULL 57 self._implied_parser_contexts = [] 58 59 def __dealloc__(self): 60 if self._c_dict is not NULL: 61 xmlparser.xmlDictFree(self._c_dict) 62 63 cdef void initMainParserContext(self): 64 u"""Put the global context into the thread dictionary of the main 65 thread. To be called once and only in the main thread.""" 66 thread_dict = python.PyThreadState_GetDict() 67 if thread_dict is not NULL: 68 (<dict>thread_dict)[u"_ParserDictionaryContext"] = self 69 70 cdef _ParserDictionaryContext _findThreadParserContext(self): 71 u"Find (or create) the _ParserDictionaryContext object for the current thread" 72 cdef _ParserDictionaryContext context 73 thread_dict = python.PyThreadState_GetDict() 74 if thread_dict is NULL: 75 return self 76 d = <dict>thread_dict 77 result = python.PyDict_GetItem(d, u"_ParserDictionaryContext") 78 if result is not NULL: 79 return <object>result 80 context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext) 81 d[u"_ParserDictionaryContext"] = context 82 return context 83 84 cdef void setDefaultParser(self, _BaseParser parser): 85 u"Set the default parser for the current thread" 86 cdef _ParserDictionaryContext context 87 context = self._findThreadParserContext() 88 context._default_parser = parser 89 90 cdef _BaseParser getDefaultParser(self): 91 u"Return (or create) the default parser of the current thread" 92 cdef _ParserDictionaryContext context 93 context = self._findThreadParserContext() 94 if context._default_parser is None: 95 if self._default_parser is None: 96 self._default_parser = __DEFAULT_XML_PARSER._copy() 97 if context is not self: 98 context._default_parser = self._default_parser._copy() 99 return context._default_parser 100 101 cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default): 102 u"Return the thread-local dict or create a new one if necessary." 103 cdef _ParserDictionaryContext context 104 context = self._findThreadParserContext() 105 if context._c_dict is NULL: 106 # thread dict not yet set up => use default or create a new one 107 if default is not NULL: 108 context._c_dict = default 109 xmlparser.xmlDictReference(default) 110 return default 111 if self._c_dict is NULL: 112 self._c_dict = xmlparser.xmlDictCreate() 113 if context is not self: 114 context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict) 115 return context._c_dict 116 117 cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref): 118 c_dict = c_dict_ref[0] 119 c_thread_dict = self._getThreadDict(c_dict) 120 if c_dict is c_thread_dict: 121 return 122 if c_dict is not NULL: 123 xmlparser.xmlDictFree(c_dict) 124 c_dict_ref[0] = c_thread_dict 125 xmlparser.xmlDictReference(c_thread_dict) 126 127 cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt): 128 u"Assure we always use the same string dictionary." 129 self.initThreadDictRef(&pctxt.dict) 130 pctxt.dictNames = 1 131 132 cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt): 133 u"Assure we always use the same string dictionary." 134 self.initThreadDictRef(&pctxt.dict) 135 136 cdef void initDocDict(self, xmlDoc* result): 137 u"Store dict of last object parsed if no shared dict yet" 138 # XXX We also free the result dict here if there already was one. 139 # This case should only occur for new documents with empty dicts, 140 # otherwise we'd free data that's in use => segfault 141 self.initThreadDictRef(&result.dict) 142 143 cdef _ParserContext findImpliedContext(self): 144 u"""Return any current implied xml parser context for the current 145 thread. This is used when the resolver functions are called 146 with an xmlParserCtxt that was generated from within libxml2 147 (i.e. without a _ParserContext) - which happens when parsing 148 schema and xinclude external references.""" 149 cdef _ParserDictionaryContext context 150 cdef _ParserContext implied_context 151 152 # see if we have a current implied parser 153 context = self._findThreadParserContext() 154 if context._implied_parser_contexts: 155 implied_context = context._implied_parser_contexts[-1] 156 return implied_context 157 return None 158 159 cdef void pushImpliedContextFromParser(self, _BaseParser parser): 160 u"Push a new implied context object taken from the parser." 161 if parser is not None: 162 self.pushImpliedContext(parser._getParserContext()) 163 else: 164 self.pushImpliedContext(None) 165 166 cdef void pushImpliedContext(self, _ParserContext parser_context): 167 u"Push a new implied context object." 168 cdef _ParserDictionaryContext context 169 context = self._findThreadParserContext() 170 context._implied_parser_contexts.append(parser_context) 171 172 cdef void popImpliedContext(self): 173 u"Pop the current implied context object." 174 cdef _ParserDictionaryContext context 175 context = self._findThreadParserContext() 176 context._implied_parser_contexts.pop() 177 178cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext() 179__GLOBAL_PARSER_CONTEXT.initMainParserContext() 180 181############################################################ 182## support for Python unicode I/O 183############################################################ 184 185# name of Python Py_UNICODE encoding as known to libxml2 186cdef const_char* _PY_UNICODE_ENCODING = NULL 187 188cdef int _setupPythonUnicode() except -1: 189 u"""Sets _PY_UNICODE_ENCODING to the internal encoding name of Python unicode 190 strings if libxml2 supports reading native Python unicode. This depends 191 on iconv and the local Python installation, so we simply check if we find 192 a matching encoding handler. 193 """ 194 cdef tree.xmlCharEncodingHandler* enchandler 195 cdef Py_ssize_t l 196 cdef const_char* enc 197 cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>'] 198 cdef const_xmlChar* buffer = <const_xmlChar*>uchars 199 # apparently, libxml2 can't detect UTF-16 on some systems 200 if (buffer[0] == c'<' and buffer[1] == c'\0' and 201 buffer[2] == c't' and buffer[3] == c'\0'): 202 enc = "UTF-16LE" 203 elif (buffer[0] == c'\0' and buffer[1] == c'<' and 204 buffer[2] == c'\0' and buffer[3] == c't'): 205 enc = "UTF-16BE" 206 else: 207 # let libxml2 give it a try 208 enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7) 209 if enc is NULL: 210 # not my fault, it's YOUR broken system :) 211 return 0 212 enchandler = tree.xmlFindCharEncodingHandler(enc) 213 if enchandler is not NULL: 214 global _PY_UNICODE_ENCODING 215 tree.xmlCharEncCloseFunc(enchandler) 216 _PY_UNICODE_ENCODING = enc 217 return 0 218 219cdef const_char* _findEncodingName(const_xmlChar* buffer, int size): 220 u"Work around bug in libxml2: find iconv name of encoding on our own." 221 cdef tree.xmlCharEncoding enc 222 enc = tree.xmlDetectCharEncoding(buffer, size) 223 if enc == tree.XML_CHAR_ENCODING_UTF16LE: 224 if size >= 4 and (buffer[0] == <const_xmlChar>'\xFF' and 225 buffer[1] == <const_xmlChar>'\xFE' and 226 buffer[2] == 0 and buffer[3] == 0): 227 return "UTF-32LE" # according to BOM 228 else: 229 return "UTF-16LE" 230 elif enc == tree.XML_CHAR_ENCODING_UTF16BE: 231 return "UTF-16BE" 232 elif enc == tree.XML_CHAR_ENCODING_UCS4LE: 233 return "UCS-4LE" 234 elif enc == tree.XML_CHAR_ENCODING_UCS4BE: 235 return "UCS-4BE" 236 elif enc == tree.XML_CHAR_ENCODING_NONE: 237 return NULL 238 else: 239 # returns a constant char*, no need to free it 240 return tree.xmlGetCharEncodingName(enc) 241 242_setupPythonUnicode() 243 244############################################################ 245## support for file-like objects 246############################################################ 247 248@cython.final 249@cython.internal 250cdef class _FileReaderContext: 251 cdef object _filelike 252 cdef object _encoding 253 cdef object _url 254 cdef object _bytes 255 cdef _ExceptionContext _exc_context 256 cdef Py_ssize_t _bytes_read 257 cdef char* _c_url 258 cdef bint _close_file_after_read 259 260 def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False): 261 self._exc_context = exc_context 262 self._filelike = filelike 263 self._close_file_after_read = close_file 264 self._encoding = encoding 265 if url is None: 266 self._c_url = NULL 267 else: 268 url = _encodeFilename(url) 269 self._c_url = _cstr(url) 270 self._url = url 271 self._bytes = b'' 272 self._bytes_read = 0 273 274 cdef _close_file(self): 275 if self._filelike is None or not self._close_file_after_read: 276 return 277 try: 278 close = self._filelike.close 279 except AttributeError: 280 close = None 281 finally: 282 self._filelike = None 283 if close is not None: 284 close() 285 286 cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self): 287 cdef stdio.FILE* c_stream 288 cdef xmlparser.xmlParserInputBuffer* c_buffer 289 c_buffer = xmlparser.xmlAllocParserInputBuffer(0) 290 c_stream = python.PyFile_AsFile(self._filelike) 291 if c_stream is NULL: 292 c_buffer.readcallback = _readFilelikeParser 293 c_buffer.context = <python.PyObject*>self 294 else: 295 c_buffer.readcallback = _readFileParser 296 c_buffer.context = c_stream 297 return c_buffer 298 299 cdef xmlparser.xmlParserInput* _createParserInput( 300 self, xmlparser.xmlParserCtxt* ctxt): 301 cdef xmlparser.xmlParserInputBuffer* c_buffer 302 c_buffer = self._createParserInputBuffer() 303 return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0) 304 305 cdef tree.xmlDtd* _readDtd(self): 306 cdef xmlparser.xmlParserInputBuffer* c_buffer 307 c_buffer = self._createParserInputBuffer() 308 with nogil: 309 return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0) 310 311 cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options): 312 cdef xmlDoc* result 313 cdef char* c_encoding 314 cdef stdio.FILE* c_stream 315 cdef xmlparser.xmlInputReadCallback c_read_callback 316 cdef xmlparser.xmlInputCloseCallback c_close_callback 317 cdef void* c_callback_context 318 319 if self._encoding is None: 320 c_encoding = NULL 321 else: 322 c_encoding = _cstr(self._encoding) 323 324 c_stream = python.PyFile_AsFile(self._filelike) 325 if c_stream is NULL: 326 c_read_callback = _readFilelikeParser 327 c_callback_context = <python.PyObject*>self 328 else: 329 c_read_callback = _readFileParser 330 c_callback_context = c_stream 331 332 orig_options = ctxt.options 333 with nogil: 334 if ctxt.html: 335 result = htmlparser.htmlCtxtReadIO( 336 ctxt, c_read_callback, NULL, c_callback_context, 337 self._c_url, c_encoding, options) 338 if result is not NULL: 339 if _fixHtmlDictNames(ctxt.dict, result) < 0: 340 tree.xmlFreeDoc(result) 341 result = NULL 342 else: 343 result = xmlparser.xmlCtxtReadIO( 344 ctxt, c_read_callback, NULL, c_callback_context, 345 self._c_url, c_encoding, options) 346 ctxt.options = orig_options # work around libxml2 problem 347 try: 348 self._close_file() 349 except: 350 self._exc_context._store_raised() 351 finally: 352 return result # swallow any exceptions 353 354 cdef int copyToBuffer(self, char* c_buffer, int c_requested): 355 cdef int c_byte_count = 0 356 cdef char* c_start 357 cdef Py_ssize_t byte_count, remaining 358 if self._bytes_read < 0: 359 return 0 360 try: 361 byte_count = python.PyBytes_GET_SIZE(self._bytes) 362 remaining = byte_count - self._bytes_read 363 while c_requested > remaining: 364 c_start = _cstr(self._bytes) + self._bytes_read 365 cstring_h.memcpy(c_buffer, c_start, remaining) 366 c_byte_count += remaining 367 c_buffer += remaining 368 c_requested -= remaining 369 370 self._bytes = self._filelike.read(c_requested) 371 if not isinstance(self._bytes, bytes): 372 if isinstance(self._bytes, unicode): 373 if self._encoding is None: 374 self._bytes = (<unicode>self._bytes).encode('utf8') 375 else: 376 self._bytes = python.PyUnicode_AsEncodedString( 377 self._bytes, _cstr(self._encoding), NULL) 378 else: 379 self._close_file() 380 raise TypeError, \ 381 u"reading from file-like objects must return byte strings or unicode strings" 382 383 remaining = python.PyBytes_GET_SIZE(self._bytes) 384 if remaining == 0: 385 self._bytes_read = -1 386 self._close_file() 387 return c_byte_count 388 self._bytes_read = 0 389 390 if c_requested > 0: 391 c_start = _cstr(self._bytes) + self._bytes_read 392 cstring_h.memcpy(c_buffer, c_start, c_requested) 393 c_byte_count += c_requested 394 self._bytes_read += c_requested 395 except: 396 c_byte_count = -1 397 self._exc_context._store_raised() 398 try: 399 self._close_file() 400 except: 401 self._exc_context._store_raised() 402 finally: 403 return c_byte_count # swallow any exceptions 404 405cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil: 406 return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size) 407 408cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil: 409 return stdio.fread(c_buffer, 1, c_size, <stdio.FILE*>ctxt) 410 411############################################################ 412## support for custom document loaders 413############################################################ 414 415cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid, 416 xmlparser.xmlParserCtxt* c_context) with gil: 417 cdef _ResolverContext context 418 cdef xmlparser.xmlParserInput* c_input 419 cdef _InputDocument doc_ref 420 cdef _FileReaderContext file_context 421 # if there is no _ParserContext associated with the xmlParserCtxt 422 # passed, check to see if the thread state object has an implied 423 # context. 424 if c_context._private is not NULL: 425 context = <_ResolverContext>c_context._private 426 else: 427 context = __GLOBAL_PARSER_CONTEXT.findImpliedContext() 428 429 if context is None: 430 if __DEFAULT_ENTITY_LOADER is NULL: 431 return NULL 432 with nogil: 433 # free the GIL as we might do serious I/O here (e.g. HTTP) 434 c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context) 435 return c_input 436 437 try: 438 if c_url is NULL: 439 url = None 440 else: 441 # parsing a related document (DTD etc.) => UTF-8 encoded URL? 442 url = _decodeFilename(<const_xmlChar*>c_url) 443 if c_pubid is NULL: 444 pubid = None 445 else: 446 pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8 447 448 doc_ref = context._resolvers.resolve(url, pubid, context) 449 except: 450 context._store_raised() 451 return NULL 452 453 if doc_ref is not None: 454 if doc_ref._type == PARSER_DATA_STRING: 455 data = doc_ref._data_bytes 456 filename = doc_ref._filename 457 if not filename: 458 filename = None 459 elif not isinstance(filename, bytes): 460 # most likely a text URL 461 filename = filename.encode('utf8') 462 if not isinstance(filename, bytes): 463 filename = None 464 465 c_input = xmlparser.xmlNewInputStream(c_context) 466 if c_input is not NULL: 467 if filename is not None: 468 c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename)) 469 c_input.base = _xcstr(data) 470 c_input.length = python.PyBytes_GET_SIZE(data) 471 c_input.cur = c_input.base 472 c_input.end = c_input.base + c_input.length 473 elif doc_ref._type == PARSER_DATA_FILENAME: 474 data = None 475 c_filename = _cstr(doc_ref._filename) 476 with nogil: 477 # free the GIL as we might do serious I/O here 478 c_input = xmlparser.xmlNewInputFromFile( 479 c_context, c_filename) 480 elif doc_ref._type == PARSER_DATA_FILE: 481 file_context = _FileReaderContext(doc_ref._file, context, url, 482 None, doc_ref._close_file) 483 c_input = file_context._createParserInput(c_context) 484 data = file_context 485 else: 486 data = None 487 c_input = NULL 488 489 if data is not None: 490 context._storage.add(data) 491 if c_input is not NULL: 492 return c_input 493 494 if __DEFAULT_ENTITY_LOADER is NULL: 495 return NULL 496 497 with nogil: 498 # free the GIL as we might do serious I/O here (e.g. HTTP) 499 c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context) 500 return c_input 501 502cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER 503__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader() 504 505 506cdef xmlparser.xmlExternalEntityLoader _register_document_loader() nogil: 507 cdef xmlparser.xmlExternalEntityLoader old = xmlparser.xmlGetExternalEntityLoader() 508 xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver) 509 return old 510 511cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) nogil: 512 xmlparser.xmlSetExternalEntityLoader(old) 513 514 515############################################################ 516## Parsers 517############################################################ 518 519@cython.no_gc_clear # May have to call "self._validator.disconnect()" on dealloc. 520@cython.internal 521cdef class _ParserContext(_ResolverContext): 522 cdef _ErrorLog _error_log 523 cdef _ParserSchemaValidationContext _validator 524 cdef xmlparser.xmlParserCtxt* _c_ctxt 525 cdef xmlparser.xmlExternalEntityLoader _orig_loader 526 cdef python.PyThread_type_lock _lock 527 cdef _Document _doc 528 cdef bint _collect_ids 529 530 def __cinit__(self): 531 self._c_ctxt = NULL 532 self._collect_ids = True 533 if not config.ENABLE_THREADING: 534 self._lock = NULL 535 else: 536 self._lock = python.PyThread_allocate_lock() 537 self._error_log = _ErrorLog() 538 539 def __dealloc__(self): 540 if config.ENABLE_THREADING and self._lock is not NULL: 541 python.PyThread_free_lock(self._lock) 542 self._lock = NULL 543 if self._c_ctxt is not NULL: 544 if <void*>self._validator is not NULL and self._validator is not None: 545 # If the parser was not closed correctly (e.g. interrupted iterparse()), 546 # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX 547 # validator plug might still be in place, which will make xmlFreeParserCtxt() 548 # crash when trying to xmlFree() a static SAX handler. 549 # Thus, make sure we disconnect the handler interceptor here at the latest. 550 self._validator.disconnect() 551 xmlparser.xmlFreeParserCtxt(self._c_ctxt) 552 553 cdef _ParserContext _copy(self): 554 cdef _ParserContext context 555 context = self.__class__() 556 context._collect_ids = self._collect_ids 557 context._validator = self._validator.copy() 558 _initParserContext(context, self._resolvers._copy(), NULL) 559 return context 560 561 cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): 562 self._c_ctxt = c_ctxt 563 c_ctxt._private = <void*>self 564 565 cdef void _resetParserContext(self): 566 if self._c_ctxt is not NULL: 567 if self._c_ctxt.html: 568 htmlparser.htmlCtxtReset(self._c_ctxt) 569 self._c_ctxt.disableSAX = 0 # work around bug in libxml2 570 else: 571 xmlparser.xmlClearParserCtxt(self._c_ctxt) 572 573 cdef int prepare(self, bint set_document_loader=True) except -1: 574 cdef int result 575 if config.ENABLE_THREADING and self._lock is not NULL: 576 with nogil: 577 result = python.PyThread_acquire_lock( 578 self._lock, python.WAIT_LOCK) 579 if result == 0: 580 raise ParserError, u"parser locking failed" 581 self._error_log.clear() 582 self._doc = None 583 self._c_ctxt.sax.serror = _receiveParserError 584 self._orig_loader = _register_document_loader() if set_document_loader else NULL 585 if self._validator is not None: 586 self._validator.connect(self._c_ctxt, self._error_log) 587 return 0 588 589 cdef int cleanup(self) except -1: 590 if self._orig_loader is not NULL: 591 _reset_document_loader(self._orig_loader) 592 try: 593 if self._validator is not None: 594 self._validator.disconnect() 595 self._resetParserContext() 596 self.clear() 597 self._doc = None 598 self._c_ctxt.sax.serror = NULL 599 finally: 600 if config.ENABLE_THREADING and self._lock is not NULL: 601 python.PyThread_release_lock(self._lock) 602 return 0 603 604 cdef object _handleParseResult(self, _BaseParser parser, 605 xmlDoc* result, filename): 606 c_doc = self._handleParseResultDoc(parser, result, filename) 607 if self._doc is not None and self._doc._c_doc is c_doc: 608 return self._doc 609 else: 610 return _documentFactory(c_doc, parser) 611 612 cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser, 613 xmlDoc* result, filename) except NULL: 614 recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER 615 return _handleParseResult(self, self._c_ctxt, result, 616 filename, recover, 617 free_doc=self._doc is None) 618 619cdef _initParserContext(_ParserContext context, 620 _ResolverRegistry resolvers, 621 xmlparser.xmlParserCtxt* c_ctxt): 622 _initResolverContext(context, resolvers) 623 if c_ctxt is not NULL: 624 context._initParserContext(c_ctxt) 625 626cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil: 627 (<_ParserContext>_parser_context._private)._error_log._receive(error) 628 629cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil: 630 if __DEBUG: 631 if c_context is NULL or (<xmlparser.xmlParserCtxt*>c_context)._private is NULL: 632 _forwardError(NULL, error) 633 else: 634 _forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error) 635 636cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename, 637 _ErrorLog error_log) except -1: 638 if filename is not None and \ 639 ctxt.lastError.domain == xmlerror.XML_FROM_IO: 640 if isinstance(filename, bytes): 641 filename = _decodeFilenameWithLength( 642 <bytes>filename, len(<bytes>filename)) 643 if ctxt.lastError.message is not NULL: 644 try: 645 message = ctxt.lastError.message.decode('utf-8') 646 except UnicodeDecodeError: 647 # the filename may be in there => play it safe 648 message = ctxt.lastError.message.decode('iso8859-1') 649 message = f"Error reading file '{filename}': {message.strip()}" 650 else: 651 message = f"Error reading '{filename}'" 652 raise IOError, message 653 elif error_log: 654 raise error_log._buildParseException( 655 XMLSyntaxError, u"Document is not well formed") 656 elif ctxt.lastError.message is not NULL: 657 message = ctxt.lastError.message.strip() 658 code = ctxt.lastError.code 659 line = ctxt.lastError.line 660 column = ctxt.lastError.int2 661 if ctxt.lastError.line > 0: 662 message = f"line {line}: {message}" 663 raise XMLSyntaxError(message, code, line, column, filename) 664 else: 665 raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0, 666 filename) 667 668cdef xmlDoc* _handleParseResult(_ParserContext context, 669 xmlparser.xmlParserCtxt* c_ctxt, 670 xmlDoc* result, filename, 671 bint recover, bint free_doc) except NULL: 672 cdef bint well_formed 673 if result is not NULL: 674 __GLOBAL_PARSER_CONTEXT.initDocDict(result) 675 676 if c_ctxt.myDoc is not NULL: 677 if c_ctxt.myDoc is not result: 678 __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc) 679 tree.xmlFreeDoc(c_ctxt.myDoc) 680 c_ctxt.myDoc = NULL 681 682 if result is not NULL: 683 if (context._validator is not None and 684 not context._validator.isvalid()): 685 well_formed = 0 # actually not 'valid', but anyway ... 686 elif (not c_ctxt.wellFormed and not c_ctxt.html and 687 c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and 688 [1 for error in context._error_log 689 if error.type == ErrorTypes.ERR_INVALID_CHAR]): 690 # An encoding error occurred and libxml2 switched from UTF-8 691 # input to (undecoded) Latin-1, at some arbitrary point in the 692 # document. Better raise an error than allowing for a broken 693 # tree with mixed encodings. 694 well_formed = 0 695 elif recover or (c_ctxt.wellFormed and 696 c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR): 697 well_formed = 1 698 elif not c_ctxt.replaceEntities and not c_ctxt.validate \ 699 and context is not None: 700 # in this mode, we ignore errors about undefined entities 701 for error in context._error_log.filter_from_errors(): 702 if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \ 703 error.type != ErrorTypes.ERR_UNDECLARED_ENTITY: 704 well_formed = 0 705 break 706 else: 707 well_formed = 1 708 else: 709 well_formed = 0 710 711 if not well_formed: 712 if free_doc: 713 tree.xmlFreeDoc(result) 714 result = NULL 715 716 if context is not None and context._has_raised(): 717 if result is not NULL: 718 if free_doc: 719 tree.xmlFreeDoc(result) 720 result = NULL 721 context._raise_if_stored() 722 723 if result is NULL: 724 if context is not None: 725 _raiseParseError(c_ctxt, filename, context._error_log) 726 else: 727 _raiseParseError(c_ctxt, filename, None) 728 else: 729 if result.URL is NULL and filename is not None: 730 result.URL = tree.xmlStrdup(_xcstr(filename)) 731 if result.encoding is NULL: 732 result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8") 733 734 if context._validator is not None and \ 735 context._validator._add_default_attributes: 736 # we currently need to do this here as libxml2 does not 737 # support inserting default attributes during parse-time 738 # validation 739 context._validator.inject_default_attributes(result) 740 741 return result 742 743cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil: 744 cdef xmlNode* c_node 745 if c_doc is NULL: 746 return 0 747 c_node = c_doc.children 748 tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1) 749 if c_node.type == tree.XML_ELEMENT_NODE: 750 if _fixHtmlDictNodeNames(c_dict, c_node) < 0: 751 return -1 752 tree.END_FOR_EACH_ELEMENT_FROM(c_node) 753 return 0 754 755cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc, 756 xmlNode* c_start_node) nogil: 757 """ 758 Move names to the dict, iterating in document order, starting at 759 c_start_node. This is used in incremental parsing after each chunk. 760 """ 761 cdef xmlNode* c_node 762 if not c_doc: 763 return 0 764 if not c_start_node: 765 return _fixHtmlDictNames(c_dict, c_doc) 766 c_node = c_start_node 767 tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1) 768 if c_node.type == tree.XML_ELEMENT_NODE: 769 if _fixHtmlDictNodeNames(c_dict, c_node) < 0: 770 return -1 771 tree.END_FOR_EACH_ELEMENT_FROM(c_node) 772 return 0 773 774cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict, 775 xmlNode* c_node) nogil: 776 cdef xmlNode* c_attr 777 c_name = tree.xmlDictLookup(c_dict, c_node.name, -1) 778 if c_name is NULL: 779 return -1 780 if c_name is not c_node.name: 781 tree.xmlFree(<char*>c_node.name) 782 c_node.name = c_name 783 c_attr = <xmlNode*>c_node.properties 784 while c_attr is not NULL: 785 c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1) 786 if c_name is NULL: 787 return -1 788 if c_name is not c_attr.name: 789 tree.xmlFree(<char*>c_attr.name) 790 c_attr.name = c_name 791 c_attr = c_attr.next 792 return 0 793 794@cython.internal 795cdef class _BaseParser: 796 cdef ElementClassLookup _class_lookup 797 cdef _ResolverRegistry _resolvers 798 cdef _ParserContext _parser_context 799 cdef _ParserContext _push_parser_context 800 cdef int _parse_options 801 cdef bint _for_html 802 cdef bint _remove_comments 803 cdef bint _remove_pis 804 cdef bint _strip_cdata 805 cdef bint _collect_ids 806 cdef XMLSchema _schema 807 cdef bytes _filename 808 cdef readonly object target 809 cdef object _default_encoding 810 cdef tuple _events_to_collect # (event_types, tag) 811 812 def __init__(self, int parse_options, bint for_html, XMLSchema schema, 813 remove_comments, remove_pis, strip_cdata, collect_ids, 814 target, encoding): 815 cdef tree.xmlCharEncodingHandler* enchandler 816 cdef int c_encoding 817 if not isinstance(self, (XMLParser, HTMLParser)): 818 raise TypeError, u"This class cannot be instantiated" 819 820 self._parse_options = parse_options 821 self.target = target 822 self._for_html = for_html 823 self._remove_comments = remove_comments 824 self._remove_pis = remove_pis 825 self._strip_cdata = strip_cdata 826 self._collect_ids = collect_ids 827 self._schema = schema 828 829 self._resolvers = _ResolverRegistry() 830 831 if encoding is None: 832 self._default_encoding = None 833 else: 834 encoding = _utf8(encoding) 835 enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding)) 836 if enchandler is NULL: 837 raise LookupError, f"unknown encoding: '{encoding}'" 838 tree.xmlCharEncCloseFunc(enchandler) 839 self._default_encoding = encoding 840 841 cdef _setBaseURL(self, base_url): 842 self._filename = _encodeFilename(base_url) 843 844 cdef _collectEvents(self, event_types, tag): 845 if event_types is None: 846 event_types = () 847 else: 848 event_types = tuple(set(event_types)) 849 _buildParseEventFilter(event_types) # purely for validation 850 self._events_to_collect = (event_types, tag) 851 852 cdef _ParserContext _getParserContext(self): 853 cdef xmlparser.xmlParserCtxt* pctxt 854 if self._parser_context is None: 855 self._parser_context = self._createContext(self.target, None) 856 self._parser_context._collect_ids = self._collect_ids 857 if self._schema is not None: 858 self._parser_context._validator = \ 859 self._schema._newSaxValidator( 860 self._parse_options & xmlparser.XML_PARSE_DTDATTR) 861 pctxt = self._newParserCtxt() 862 _initParserContext(self._parser_context, self._resolvers, pctxt) 863 self._configureSaxContext(pctxt) 864 return self._parser_context 865 866 cdef _ParserContext _getPushParserContext(self): 867 cdef xmlparser.xmlParserCtxt* pctxt 868 if self._push_parser_context is None: 869 self._push_parser_context = self._createContext( 870 self.target, self._events_to_collect) 871 self._push_parser_context._collect_ids = self._collect_ids 872 if self._schema is not None: 873 self._push_parser_context._validator = \ 874 self._schema._newSaxValidator( 875 self._parse_options & xmlparser.XML_PARSE_DTDATTR) 876 pctxt = self._newPushParserCtxt() 877 _initParserContext( 878 self._push_parser_context, self._resolvers, pctxt) 879 self._configureSaxContext(pctxt) 880 return self._push_parser_context 881 882 cdef _ParserContext _createContext(self, target, events_to_collect): 883 cdef _SaxParserContext sax_context 884 if target is not None: 885 sax_context = _TargetParserContext(self) 886 (<_TargetParserContext>sax_context)._setTarget(target) 887 elif events_to_collect: 888 sax_context = _SaxParserContext(self) 889 else: 890 # nothing special to configure 891 return _ParserContext() 892 if events_to_collect: 893 events, tag = events_to_collect 894 sax_context._setEventFilter(events, tag) 895 return sax_context 896 897 @cython.final 898 cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1: 899 if self._remove_comments: 900 pctxt.sax.comment = NULL 901 if self._remove_pis: 902 pctxt.sax.processingInstruction = NULL 903 if self._strip_cdata: 904 # hard switch-off for CDATA nodes => makes them plain text 905 pctxt.sax.cdataBlock = NULL 906 907 cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1: 908 cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax 909 if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC: 910 # need to extend SAX1 context to SAX2 to get proper error reports 911 if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler: 912 sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler)) 913 if sax is NULL: 914 raise MemoryError() 915 cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler, 916 sizeof(htmlparser.htmlDefaultSAXHandler)) 917 c_ctxt.sax = sax 918 sax.initialized = xmlparser.XML_SAX2_MAGIC 919 sax.serror = _receiveParserError 920 sax.startElementNs = NULL 921 sax.endElementNs = NULL 922 sax._private = NULL 923 return 0 924 925 cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL: 926 cdef xmlparser.xmlParserCtxt* c_ctxt 927 if self._for_html: 928 c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5) 929 if c_ctxt is not NULL: 930 self._registerHtmlErrorHandler(c_ctxt) 931 else: 932 c_ctxt = xmlparser.xmlNewParserCtxt() 933 if c_ctxt is NULL: 934 raise MemoryError 935 c_ctxt.sax.startDocument = _initSaxDocument 936 return c_ctxt 937 938 cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL: 939 cdef xmlparser.xmlParserCtxt* c_ctxt 940 cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL 941 if self._for_html: 942 c_ctxt = htmlparser.htmlCreatePushParserCtxt( 943 NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE) 944 if c_ctxt is not NULL: 945 self._registerHtmlErrorHandler(c_ctxt) 946 htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options) 947 else: 948 c_ctxt = xmlparser.xmlCreatePushParserCtxt( 949 NULL, NULL, NULL, 0, c_filename) 950 if c_ctxt is not NULL: 951 xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options) 952 if c_ctxt is NULL: 953 raise MemoryError() 954 c_ctxt.sax.startDocument = _initSaxDocument 955 return c_ctxt 956 957 @property 958 def error_log(self): 959 """The error log of the last parser run. 960 """ 961 cdef _ParserContext context 962 context = self._getParserContext() 963 return context._error_log.copy() 964 965 @property 966 def resolvers(self): 967 """The custom resolver registry of this parser.""" 968 return self._resolvers 969 970 @property 971 def version(self): 972 """The version of the underlying XML parser.""" 973 return u"libxml2 %d.%d.%d" % LIBXML_VERSION 974 975 def setElementClassLookup(self, ElementClassLookup lookup = None): 976 u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead." 977 self.set_element_class_lookup(lookup) 978 979 def set_element_class_lookup(self, ElementClassLookup lookup = None): 980 u"""set_element_class_lookup(self, lookup = None) 981 982 Set a lookup scheme for element classes generated from this parser. 983 984 Reset it by passing None or nothing. 985 """ 986 self._class_lookup = lookup 987 988 cdef _BaseParser _copy(self): 989 u"Create a new parser with the same configuration." 990 cdef _BaseParser parser 991 parser = self.__class__() 992 parser._parse_options = self._parse_options 993 parser._for_html = self._for_html 994 parser._remove_comments = self._remove_comments 995 parser._remove_pis = self._remove_pis 996 parser._strip_cdata = self._strip_cdata 997 parser._filename = self._filename 998 parser._resolvers = self._resolvers 999 parser.target = self.target 1000 parser._class_lookup = self._class_lookup 1001 parser._default_encoding = self._default_encoding 1002 parser._schema = self._schema 1003 parser._events_to_collect = self._events_to_collect 1004 return parser 1005 1006 def copy(self): 1007 u"""copy(self) 1008 1009 Create a new parser with the same configuration. 1010 """ 1011 return self._copy() 1012 1013 def makeelement(self, _tag, attrib=None, nsmap=None, **_extra): 1014 u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra) 1015 1016 Creates a new element associated with this parser. 1017 """ 1018 return _makeElement(_tag, NULL, None, self, None, None, 1019 attrib, nsmap, _extra) 1020 1021 # internal parser methods 1022 1023 cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: 1024 u"""Parse unicode document, share dictionary if possible. 1025 """ 1026 cdef _ParserContext context 1027 cdef xmlDoc* result 1028 cdef xmlparser.xmlParserCtxt* pctxt 1029 cdef Py_ssize_t py_buffer_len 1030 cdef int buffer_len, c_kind 1031 cdef const_char* c_text 1032 cdef const_char* c_encoding = _PY_UNICODE_ENCODING 1033 cdef bint is_pep393_string = ( 1034 python.PEP393_ENABLED and python.PyUnicode_IS_READY(utext)) 1035 if is_pep393_string: 1036 c_text = <const_char*>python.PyUnicode_DATA(utext) 1037 py_buffer_len = python.PyUnicode_GET_LENGTH(utext) 1038 c_kind = python.PyUnicode_KIND(utext) 1039 if c_kind == 1: 1040 c_encoding = 'ISO-8859-1' 1041 elif c_kind == 2: 1042 py_buffer_len *= 2 1043 if python.PY_BIG_ENDIAN: 1044 c_encoding = 'UTF-16BE' # actually UCS-2 1045 else: 1046 c_encoding = 'UTF-16LE' # actually UCS-2 1047 elif c_kind == 4: 1048 py_buffer_len *= 4 1049 if python.PY_BIG_ENDIAN: 1050 c_encoding = 'UCS-4BE' 1051 else: 1052 c_encoding = 'UCS-4LE' 1053 else: 1054 assert False, f"Illegal Unicode kind {c_kind}" 1055 else: 1056 py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext) 1057 c_text = python.PyUnicode_AS_DATA(utext) 1058 assert 0 <= py_buffer_len <= limits.INT_MAX 1059 buffer_len = py_buffer_len 1060 1061 context = self._getParserContext() 1062 context.prepare() 1063 try: 1064 pctxt = context._c_ctxt 1065 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1066 orig_options = pctxt.options 1067 with nogil: 1068 if self._for_html: 1069 result = htmlparser.htmlCtxtReadMemory( 1070 pctxt, c_text, buffer_len, c_filename, c_encoding, 1071 self._parse_options) 1072 if result is not NULL: 1073 if _fixHtmlDictNames(pctxt.dict, result) < 0: 1074 tree.xmlFreeDoc(result) 1075 result = NULL 1076 else: 1077 result = xmlparser.xmlCtxtReadMemory( 1078 pctxt, c_text, buffer_len, c_filename, c_encoding, 1079 self._parse_options) 1080 pctxt.options = orig_options # work around libxml2 problem 1081 1082 return context._handleParseResultDoc(self, result, None) 1083 finally: 1084 context.cleanup() 1085 1086 cdef xmlDoc* _parseDoc(self, char* c_text, int c_len, 1087 char* c_filename) except NULL: 1088 u"""Parse document, share dictionary if possible. 1089 """ 1090 cdef _ParserContext context 1091 cdef xmlDoc* result 1092 cdef xmlparser.xmlParserCtxt* pctxt 1093 cdef char* c_encoding 1094 cdef tree.xmlCharEncoding enc 1095 context = self._getParserContext() 1096 context.prepare() 1097 try: 1098 pctxt = context._c_ctxt 1099 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1100 1101 if self._default_encoding is None: 1102 c_encoding = NULL 1103 # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs 1104 # NOTE: limit to problematic cases because it changes character offsets 1105 if c_len >= 4 and (c_text[0] == '\xFF' and c_text[1] == '\xFE' and 1106 c_text[2] == 0 and c_text[3] == 0): 1107 c_encoding = "UTF-32LE" 1108 c_text += 4 1109 c_len -= 4 1110 elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and 1111 c_text[2] == '\xFE' and c_text[3] == '\xFF'): 1112 c_encoding = "UTF-32BE" 1113 c_text += 4 1114 c_len -= 4 1115 else: 1116 # no BOM => try to determine encoding 1117 enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len) 1118 if enc == tree.XML_CHAR_ENCODING_UCS4LE: 1119 c_encoding = 'UTF-32LE' 1120 elif enc == tree.XML_CHAR_ENCODING_UCS4BE: 1121 c_encoding = 'UTF-32BE' 1122 else: 1123 c_encoding = _cstr(self._default_encoding) 1124 1125 orig_options = pctxt.options 1126 with nogil: 1127 if self._for_html: 1128 result = htmlparser.htmlCtxtReadMemory( 1129 pctxt, c_text, c_len, c_filename, 1130 c_encoding, self._parse_options) 1131 if result is not NULL: 1132 if _fixHtmlDictNames(pctxt.dict, result) < 0: 1133 tree.xmlFreeDoc(result) 1134 result = NULL 1135 else: 1136 result = xmlparser.xmlCtxtReadMemory( 1137 pctxt, c_text, c_len, c_filename, 1138 c_encoding, self._parse_options) 1139 pctxt.options = orig_options # work around libxml2 problem 1140 1141 return context._handleParseResultDoc(self, result, None) 1142 finally: 1143 context.cleanup() 1144 1145 cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: 1146 cdef _ParserContext context 1147 cdef xmlDoc* result 1148 cdef xmlparser.xmlParserCtxt* pctxt 1149 cdef char* c_encoding 1150 result = NULL 1151 1152 context = self._getParserContext() 1153 context.prepare() 1154 try: 1155 pctxt = context._c_ctxt 1156 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1157 1158 if self._default_encoding is None: 1159 c_encoding = NULL 1160 else: 1161 c_encoding = _cstr(self._default_encoding) 1162 1163 orig_options = pctxt.options 1164 with nogil: 1165 if self._for_html: 1166 result = htmlparser.htmlCtxtReadFile( 1167 pctxt, c_filename, c_encoding, self._parse_options) 1168 if result is not NULL: 1169 if _fixHtmlDictNames(pctxt.dict, result) < 0: 1170 tree.xmlFreeDoc(result) 1171 result = NULL 1172 else: 1173 result = xmlparser.xmlCtxtReadFile( 1174 pctxt, c_filename, c_encoding, self._parse_options) 1175 pctxt.options = orig_options # work around libxml2 problem 1176 1177 return context._handleParseResultDoc(self, result, c_filename) 1178 finally: 1179 context.cleanup() 1180 1181 cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename, 1182 encoding) except NULL: 1183 cdef _ParserContext context 1184 cdef _FileReaderContext file_context 1185 cdef xmlDoc* result 1186 cdef xmlparser.xmlParserCtxt* pctxt 1187 cdef char* c_filename 1188 if not filename: 1189 filename = None 1190 1191 context = self._getParserContext() 1192 context.prepare() 1193 try: 1194 pctxt = context._c_ctxt 1195 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1196 file_context = _FileReaderContext( 1197 filelike, context, filename, 1198 encoding or self._default_encoding) 1199 result = file_context._readDoc(pctxt, self._parse_options) 1200 1201 return context._handleParseResultDoc( 1202 self, result, filename) 1203 finally: 1204 context.cleanup() 1205 1206 1207cdef void _initSaxDocument(void* ctxt) with gil: 1208 xmlparser.xmlSAX2StartDocument(ctxt) 1209 c_ctxt = <xmlparser.xmlParserCtxt*>ctxt 1210 c_doc = c_ctxt.myDoc 1211 1212 # set up document dict 1213 if c_doc and c_ctxt.dict and not c_doc.dict: 1214 # I have no idea why libxml2 disables this - we need it 1215 c_ctxt.dictNames = 1 1216 c_doc.dict = c_ctxt.dict 1217 xmlparser.xmlDictReference(c_ctxt.dict) 1218 1219 # set up XML ID hash table 1220 if c_ctxt._private: 1221 context = <_ParserContext>c_ctxt._private 1222 if context._collect_ids: 1223 # keep the global parser dict from filling up with XML IDs 1224 if c_doc and not c_doc.ids: 1225 # memory errors are not fatal here 1226 c_dict = xmlparser.xmlDictCreate() 1227 if c_dict: 1228 c_doc.ids = tree.xmlHashCreateDict(0, c_dict) 1229 xmlparser.xmlDictFree(c_dict) 1230 else: 1231 c_doc.ids = tree.xmlHashCreate(0) 1232 else: 1233 c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS 1234 if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids): 1235 # already initialised but empty => clear 1236 tree.xmlHashFree(c_doc.ids, NULL) 1237 c_doc.ids = NULL 1238 1239 1240############################################################ 1241## ET feed parser 1242############################################################ 1243 1244cdef class _FeedParser(_BaseParser): 1245 cdef bint _feed_parser_running 1246 1247 @property 1248 def feed_error_log(self): 1249 """The error log of the last (or current) run of the feed parser. 1250 1251 Note that this is local to the feed parser and thus is 1252 different from what the ``error_log`` property returns. 1253 """ 1254 return self._getPushParserContext()._error_log.copy() 1255 1256 cpdef feed(self, data): 1257 u"""feed(self, data) 1258 1259 Feeds data to the parser. The argument should be an 8-bit string 1260 buffer containing encoded data, although Unicode is supported as long 1261 as both string types are not mixed. 1262 1263 This is the main entry point to the consumer interface of a 1264 parser. The parser will parse as much of the XML stream as it 1265 can on each call. To finish parsing or to reset the parser, 1266 call the ``close()`` method. Both methods may raise 1267 ParseError if errors occur in the input data. If an error is 1268 raised, there is no longer a need to call ``close()``. 1269 1270 The feed parser interface is independent of the normal parser 1271 usage. You can use the same parser as a feed parser and in 1272 the ``parse()`` function concurrently. 1273 """ 1274 cdef _ParserContext context 1275 cdef bytes bstring 1276 cdef xmlparser.xmlParserCtxt* pctxt 1277 cdef Py_ssize_t py_buffer_len, ustart 1278 cdef const_char* char_data 1279 cdef const_char* c_encoding 1280 cdef int buffer_len 1281 cdef int error 1282 cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER 1283 1284 if isinstance(data, bytes): 1285 if self._default_encoding is None: 1286 c_encoding = NULL 1287 else: 1288 c_encoding = self._default_encoding 1289 char_data = _cstr(data) 1290 py_buffer_len = python.PyBytes_GET_SIZE(data) 1291 ustart = 0 1292 elif isinstance(data, unicode): 1293 c_encoding = b"UTF-8" 1294 char_data = NULL 1295 py_buffer_len = len(<unicode> data) 1296 ustart = 0 1297 else: 1298 raise TypeError, u"Parsing requires string data" 1299 1300 context = self._getPushParserContext() 1301 pctxt = context._c_ctxt 1302 error = 0 1303 if not self._feed_parser_running: 1304 context.prepare(set_document_loader=False) 1305 self._feed_parser_running = 1 1306 c_filename = (_cstr(self._filename) 1307 if self._filename is not None else NULL) 1308 1309 # We have to give *mlCtxtResetPush() enough input to figure 1310 # out the character encoding (at least four bytes), 1311 # however if we give it all we got, we'll have nothing for 1312 # *mlParseChunk() and things go wrong. 1313 buffer_len = 0 1314 if char_data is not NULL: 1315 buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len 1316 orig_loader = _register_document_loader() 1317 if self._for_html: 1318 error = _htmlCtxtResetPush( 1319 pctxt, char_data, buffer_len, c_filename, c_encoding, 1320 self._parse_options) 1321 else: 1322 xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) 1323 error = xmlparser.xmlCtxtResetPush( 1324 pctxt, char_data, buffer_len, c_filename, c_encoding) 1325 _reset_document_loader(orig_loader) 1326 py_buffer_len -= buffer_len 1327 char_data += buffer_len 1328 if error: 1329 raise MemoryError() 1330 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1331 1332 #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding 1333 1334 fixup_error = 0 1335 while py_buffer_len > 0 and (error == 0 or recover): 1336 if char_data is NULL: 1337 # Unicode parsing by converting chunks to UTF-8 1338 buffer_len = 2**19 # len(bytes) <= 4 * (2**19) == 2 MiB 1339 bstring = (<unicode> data)[ustart : ustart+buffer_len].encode('UTF-8') 1340 ustart += buffer_len 1341 py_buffer_len -= buffer_len # may end up < 0 1342 error, fixup_error = _parse_data_chunk(pctxt, <const char*> bstring, <int> len(bstring)) 1343 else: 1344 # Direct byte string parsing. 1345 buffer_len = <int>py_buffer_len if py_buffer_len <= limits.INT_MAX else limits.INT_MAX 1346 error, fixup_error = _parse_data_chunk(pctxt, char_data, buffer_len) 1347 py_buffer_len -= buffer_len 1348 char_data += buffer_len 1349 1350 if fixup_error: 1351 context.store_exception(MemoryError()) 1352 1353 if context._has_raised(): 1354 # propagate Python exceptions immediately 1355 recover = 0 1356 error = 1 1357 break 1358 1359 if error and not pctxt.replaceEntities and not pctxt.validate: 1360 # in this mode, we ignore errors about undefined entities 1361 for entry in context._error_log.filter_from_errors(): 1362 if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \ 1363 entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY: 1364 break 1365 else: 1366 error = 0 1367 1368 if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised(): 1369 # propagate Python exceptions immediately 1370 recover = 0 1371 error = 1 1372 1373 if fixup_error or not recover and (error or not pctxt.wellFormed): 1374 self._feed_parser_running = 0 1375 try: 1376 context._handleParseResult(self, pctxt.myDoc, None) 1377 finally: 1378 context.cleanup() 1379 1380 cpdef close(self): 1381 u"""close(self) 1382 1383 Terminates feeding data to this parser. This tells the parser to 1384 process any remaining data in the feed buffer, and then returns the 1385 root Element of the tree that was parsed. 1386 1387 This method must be called after passing the last chunk of data into 1388 the ``feed()`` method. It should only be called when using the feed 1389 parser interface, all other usage is undefined. 1390 """ 1391 if not self._feed_parser_running: 1392 raise XMLSyntaxError(u"no element found", 1393 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0, 1394 self._filename) 1395 1396 context = self._getPushParserContext() 1397 pctxt = context._c_ctxt 1398 1399 self._feed_parser_running = 0 1400 if self._for_html: 1401 htmlparser.htmlParseChunk(pctxt, NULL, 0, 1) 1402 else: 1403 xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) 1404 1405 if (pctxt.recovery and not pctxt.disableSAX and 1406 isinstance(context, _SaxParserContext)): 1407 # apply any left-over 'end' events 1408 (<_SaxParserContext>context).flushEvents() 1409 1410 try: 1411 result = context._handleParseResult(self, pctxt.myDoc, None) 1412 finally: 1413 context.cleanup() 1414 1415 if isinstance(result, _Document): 1416 return (<_Document>result).getroot() 1417 else: 1418 return result 1419 1420 1421cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt, 1422 const char* char_data, int buffer_len): 1423 fixup_error = 0 1424 with nogil: 1425 if c_ctxt.html: 1426 c_node = c_ctxt.node # last node where the parser stopped 1427 orig_loader = _register_document_loader() 1428 error = htmlparser.htmlParseChunk(c_ctxt, char_data, buffer_len, 0) 1429 _reset_document_loader(orig_loader) 1430 # and now for the fun part: move node names to the dict 1431 if c_ctxt.myDoc: 1432 fixup_error = _fixHtmlDictSubtreeNames( 1433 c_ctxt.dict, c_ctxt.myDoc, c_node) 1434 if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict: 1435 xmlparser.xmlDictFree(c_ctxt.myDoc.dict) 1436 c_ctxt.myDoc.dict = c_ctxt.dict 1437 xmlparser.xmlDictReference(c_ctxt.dict) 1438 else: 1439 orig_loader = _register_document_loader() 1440 error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0) 1441 _reset_document_loader(orig_loader) 1442 return (error, fixup_error) 1443 1444 1445cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt, 1446 const_char* c_data, int buffer_len, 1447 const_char* c_filename, const_char* c_encoding, 1448 int parse_options) except -1: 1449 cdef xmlparser.xmlParserInput* c_input_stream 1450 # libxml2 lacks an HTML push parser setup function 1451 error = xmlparser.xmlCtxtResetPush( 1452 c_ctxt, c_data, buffer_len, c_filename, c_encoding) 1453 if error: 1454 return error 1455 1456 # fix libxml2 setup for HTML 1457 c_ctxt.progressive = 1 1458 c_ctxt.html = 1 1459 htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options) 1460 1461 return 0 1462 1463 1464############################################################ 1465## XML parser 1466############################################################ 1467 1468cdef int _XML_DEFAULT_PARSE_OPTIONS 1469_XML_DEFAULT_PARSE_OPTIONS = ( 1470 xmlparser.XML_PARSE_NOENT | 1471 xmlparser.XML_PARSE_NOCDATA | 1472 xmlparser.XML_PARSE_NONET | 1473 xmlparser.XML_PARSE_COMPACT | 1474 xmlparser.XML_PARSE_BIG_LINES 1475 ) 1476 1477cdef class XMLParser(_FeedParser): 1478 u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True) 1479 1480 The XML parser. 1481 1482 Parsers can be supplied as additional argument to various parse 1483 functions of the lxml API. A default parser is always available 1484 and can be replaced by a call to the global function 1485 'set_default_parser'. New parsers can be created at any time 1486 without a major run-time overhead. 1487 1488 The keyword arguments in the constructor are mainly based on the 1489 libxml2 parser configuration. A DTD will also be loaded if DTD 1490 validation or attribute default values are requested (unless you 1491 additionally provide an XMLSchema from which the default 1492 attributes can be read). 1493 1494 Available boolean keyword arguments: 1495 1496 - attribute_defaults - inject default attributes from DTD or XMLSchema 1497 - dtd_validation - validate against a DTD referenced by the document 1498 - load_dtd - use DTD for parsing 1499 - no_network - prevent network access for related files (default: True) 1500 - ns_clean - clean up redundant namespace declarations 1501 - recover - try hard to parse through broken XML 1502 - remove_blank_text - discard blank text nodes that appear ignorable 1503 - remove_comments - discard comments 1504 - remove_pis - discard processing instructions 1505 - strip_cdata - replace CDATA sections by normal text content (default: True) 1506 - compact - save memory for short text content (default: True) 1507 - collect_ids - use a hash table of XML IDs for fast access (default: True, always True with DTD validation) 1508 - resolve_entities - replace entities by their text value (default: True) 1509 - huge_tree - disable security restrictions and support very deep trees 1510 and very long text content (only affects libxml2 2.7+) 1511 1512 Other keyword arguments: 1513 1514 - encoding - override the document encoding 1515 - target - a parser target object that will receive the parse events 1516 - schema - an XMLSchema to validate against 1517 1518 Note that you should avoid sharing parsers between threads. While this is 1519 not harmful, it is more efficient to use separate parsers. This does not 1520 apply to the default parser. 1521 """ 1522 def __init__(self, *, encoding=None, attribute_defaults=False, 1523 dtd_validation=False, load_dtd=False, no_network=True, 1524 ns_clean=False, recover=False, XMLSchema schema=None, 1525 huge_tree=False, remove_blank_text=False, resolve_entities=True, 1526 remove_comments=False, remove_pis=False, strip_cdata=True, 1527 collect_ids=True, target=None, compact=True): 1528 cdef int parse_options 1529 parse_options = _XML_DEFAULT_PARSE_OPTIONS 1530 if load_dtd: 1531 parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD 1532 if dtd_validation: 1533 parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \ 1534 xmlparser.XML_PARSE_DTDLOAD 1535 if attribute_defaults: 1536 parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR 1537 if schema is None: 1538 parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD 1539 if ns_clean: 1540 parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN 1541 if recover: 1542 parse_options = parse_options | xmlparser.XML_PARSE_RECOVER 1543 if remove_blank_text: 1544 parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS 1545 if huge_tree: 1546 parse_options = parse_options | xmlparser.XML_PARSE_HUGE 1547 if not no_network: 1548 parse_options = parse_options ^ xmlparser.XML_PARSE_NONET 1549 if not compact: 1550 parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT 1551 if not resolve_entities: 1552 parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT 1553 if not strip_cdata: 1554 parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA 1555 1556 _BaseParser.__init__(self, parse_options, 0, schema, 1557 remove_comments, remove_pis, strip_cdata, 1558 collect_ids, target, encoding) 1559 1560 1561cdef class XMLPullParser(XMLParser): 1562 """XMLPullParser(self, events=None, *, tag=None, **kwargs) 1563 1564 XML parser that collects parse events in an iterator. 1565 1566 The collected events are the same as for iterparse(), but the 1567 parser itself is non-blocking in the sense that it receives 1568 data chunks incrementally through its .feed() method, instead 1569 of reading them directly from a file(-like) object all by itself. 1570 1571 By default, it collects Element end events. To change that, 1572 pass any subset of the available events into the ``events`` 1573 argument: ``'start'``, ``'end'``, ``'start-ns'``, 1574 ``'end-ns'``, ``'comment'``, ``'pi'``. 1575 1576 To support loading external dependencies relative to the input 1577 source, you can pass the ``base_url``. 1578 """ 1579 def __init__(self, events=None, *, tag=None, base_url=None, **kwargs): 1580 XMLParser.__init__(self, **kwargs) 1581 if events is None: 1582 events = ('end',) 1583 self._setBaseURL(base_url) 1584 self._collectEvents(events, tag) 1585 1586 def read_events(self): 1587 return (<_SaxParserContext?>self._getPushParserContext()).events_iterator 1588 1589 1590cdef class ETCompatXMLParser(XMLParser): 1591 u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \ 1592 dtd_validation=False, load_dtd=False, no_network=True, \ 1593 ns_clean=False, recover=False, schema=None, \ 1594 huge_tree=False, remove_blank_text=False, resolve_entities=True, \ 1595 remove_comments=True, remove_pis=True, strip_cdata=True, \ 1596 target=None, compact=True) 1597 1598 An XML parser with an ElementTree compatible default setup. 1599 1600 See the XMLParser class for details. 1601 1602 This parser has ``remove_comments`` and ``remove_pis`` enabled by default 1603 and thus ignores comments and processing instructions. 1604 """ 1605 def __init__(self, *, encoding=None, attribute_defaults=False, 1606 dtd_validation=False, load_dtd=False, no_network=True, 1607 ns_clean=False, recover=False, schema=None, 1608 huge_tree=False, remove_blank_text=False, resolve_entities=True, 1609 remove_comments=True, remove_pis=True, strip_cdata=True, 1610 target=None, compact=True): 1611 XMLParser.__init__(self, 1612 attribute_defaults=attribute_defaults, 1613 dtd_validation=dtd_validation, 1614 load_dtd=load_dtd, 1615 no_network=no_network, 1616 ns_clean=ns_clean, 1617 recover=recover, 1618 remove_blank_text=remove_blank_text, 1619 huge_tree=huge_tree, 1620 compact=compact, 1621 resolve_entities=resolve_entities, 1622 remove_comments=remove_comments, 1623 remove_pis=remove_pis, 1624 strip_cdata=strip_cdata, 1625 target=target, 1626 encoding=encoding, 1627 schema=schema) 1628 1629# ET 1.2 compatible name 1630XMLTreeBuilder = ETCompatXMLParser 1631 1632 1633cdef XMLParser __DEFAULT_XML_PARSER 1634__DEFAULT_XML_PARSER = XMLParser() 1635 1636__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER) 1637 1638def set_default_parser(_BaseParser parser=None): 1639 u"""set_default_parser(parser=None) 1640 1641 Set a default parser for the current thread. This parser is used 1642 globally whenever no parser is supplied to the various parse functions of 1643 the lxml API. If this function is called without a parser (or if it is 1644 None), the default parser is reset to the original configuration. 1645 1646 Note that the pre-installed default parser is not thread-safe. Avoid the 1647 default parser in multi-threaded environments. You can create a separate 1648 parser for each thread explicitly or use a parser pool. 1649 """ 1650 if parser is None: 1651 parser = __DEFAULT_XML_PARSER 1652 __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser) 1653 1654def get_default_parser(): 1655 u"get_default_parser()" 1656 return __GLOBAL_PARSER_CONTEXT.getDefaultParser() 1657 1658############################################################ 1659## HTML parser 1660############################################################ 1661 1662cdef int _HTML_DEFAULT_PARSE_OPTIONS 1663_HTML_DEFAULT_PARSE_OPTIONS = ( 1664 htmlparser.HTML_PARSE_RECOVER | 1665 htmlparser.HTML_PARSE_NONET | 1666 htmlparser.HTML_PARSE_COMPACT 1667 ) 1668 1669cdef class HTMLParser(_FeedParser): 1670 u"""HTMLParser(self, encoding=None, remove_blank_text=False, \ 1671 remove_comments=False, remove_pis=False, strip_cdata=True, \ 1672 no_network=True, target=None, schema: XMLSchema =None, \ 1673 recover=True, compact=True, collect_ids=True, huge_tree=False) 1674 1675 The HTML parser. 1676 1677 This parser allows reading HTML into a normal XML tree. By 1678 default, it can read broken (non well-formed) HTML, depending on 1679 the capabilities of libxml2. Use the 'recover' option to switch 1680 this off. 1681 1682 Available boolean keyword arguments: 1683 1684 - recover - try hard to parse through broken HTML (default: True) 1685 - no_network - prevent network access for related files (default: True) 1686 - remove_blank_text - discard empty text nodes that are ignorable (i.e. not actual text content) 1687 - remove_comments - discard comments 1688 - remove_pis - discard processing instructions 1689 - strip_cdata - replace CDATA sections by normal text content (default: True) 1690 - compact - save memory for short text content (default: True) 1691 - default_doctype - add a default doctype even if it is not found in the HTML (default: True) 1692 - collect_ids - use a hash table of XML IDs for fast access (default: True) 1693 - huge_tree - disable security restrictions and support very deep trees 1694 and very long text content (only affects libxml2 2.7+) 1695 1696 Other keyword arguments: 1697 1698 - encoding - override the document encoding 1699 - target - a parser target object that will receive the parse events 1700 - schema - an XMLSchema to validate against 1701 1702 Note that you should avoid sharing parsers between threads for performance 1703 reasons. 1704 """ 1705 def __init__(self, *, encoding=None, remove_blank_text=False, 1706 remove_comments=False, remove_pis=False, strip_cdata=True, 1707 no_network=True, target=None, XMLSchema schema=None, 1708 recover=True, compact=True, default_doctype=True, 1709 collect_ids=True, huge_tree=False): 1710 cdef int parse_options 1711 parse_options = _HTML_DEFAULT_PARSE_OPTIONS 1712 if remove_blank_text: 1713 parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS 1714 if not recover: 1715 parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER 1716 if not no_network: 1717 parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET 1718 if not compact: 1719 parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT 1720 if not default_doctype: 1721 parse_options = parse_options ^ htmlparser.HTML_PARSE_NODEFDTD 1722 if huge_tree: 1723 parse_options = parse_options | xmlparser.XML_PARSE_HUGE 1724 1725 _BaseParser.__init__(self, parse_options, 1, schema, 1726 remove_comments, remove_pis, strip_cdata, 1727 collect_ids, target, encoding) 1728 1729 1730cdef HTMLParser __DEFAULT_HTML_PARSER 1731__DEFAULT_HTML_PARSER = HTMLParser() 1732 1733 1734cdef class HTMLPullParser(HTMLParser): 1735 """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs) 1736 1737 HTML parser that collects parse events in an iterator. 1738 1739 The collected events are the same as for iterparse(), but the 1740 parser itself is non-blocking in the sense that it receives 1741 data chunks incrementally through its .feed() method, instead 1742 of reading them directly from a file(-like) object all by itself. 1743 1744 By default, it collects Element end events. To change that, 1745 pass any subset of the available events into the ``events`` 1746 argument: ``'start'``, ``'end'``, ``'start-ns'``, 1747 ``'end-ns'``, ``'comment'``, ``'pi'``. 1748 1749 To support loading external dependencies relative to the input 1750 source, you can pass the ``base_url``. 1751 """ 1752 def __init__(self, events=None, *, tag=None, base_url=None, **kwargs): 1753 HTMLParser.__init__(self, **kwargs) 1754 if events is None: 1755 events = ('end',) 1756 self._setBaseURL(base_url) 1757 self._collectEvents(events, tag) 1758 1759 def read_events(self): 1760 return (<_SaxParserContext?>self._getPushParserContext()).events_iterator 1761 1762 1763############################################################ 1764## helper functions for document creation 1765############################################################ 1766 1767cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL: 1768 cdef char* c_filename 1769 cdef char* c_text 1770 cdef Py_ssize_t c_len 1771 cdef bint is_pep393_string 1772 if parser is None: 1773 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() 1774 if not filename: 1775 c_filename = NULL 1776 else: 1777 filename_utf = _encodeFilenameUTF8(filename) 1778 c_filename = _cstr(filename_utf) 1779 if isinstance(text, unicode): 1780 is_pep393_string = ( 1781 python.PEP393_ENABLED and python.PyUnicode_IS_READY(text)) 1782 if is_pep393_string: 1783 c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text) 1784 else: 1785 c_len = python.PyUnicode_GET_DATA_SIZE(text) 1786 if c_len > limits.INT_MAX: 1787 return (<_BaseParser>parser)._parseDocFromFilelike( 1788 StringIO(text), filename, None) 1789 if _PY_UNICODE_ENCODING is NULL and not is_pep393_string: 1790 text = (<unicode>text).encode('utf8') 1791 return (<_BaseParser>parser)._parseDocFromFilelike( 1792 BytesIO(text), filename, "UTF-8") 1793 return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename) 1794 else: 1795 c_len = python.PyBytes_GET_SIZE(text) 1796 if c_len > limits.INT_MAX: 1797 return (<_BaseParser>parser)._parseDocFromFilelike( 1798 BytesIO(text), filename, None) 1799 c_text = _cstr(text) 1800 return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename) 1801 1802cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL: 1803 if parser is None: 1804 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() 1805 return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8)) 1806 1807cdef xmlDoc* _parseDocFromFilelike(source, filename, 1808 _BaseParser parser) except NULL: 1809 if parser is None: 1810 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() 1811 return (<_BaseParser>parser)._parseDocFromFilelike(source, filename, None) 1812 1813cdef xmlDoc* _newXMLDoc() except NULL: 1814 cdef xmlDoc* result 1815 result = tree.xmlNewDoc(NULL) 1816 if result is NULL: 1817 raise MemoryError() 1818 if result.encoding is NULL: 1819 result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8") 1820 __GLOBAL_PARSER_CONTEXT.initDocDict(result) 1821 return result 1822 1823cdef xmlDoc* _newHTMLDoc() except NULL: 1824 cdef xmlDoc* result 1825 result = tree.htmlNewDoc(NULL, NULL) 1826 if result is NULL: 1827 raise MemoryError() 1828 __GLOBAL_PARSER_CONTEXT.initDocDict(result) 1829 return result 1830 1831cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL: 1832 cdef xmlDoc* result 1833 if recursive: 1834 with nogil: 1835 result = tree.xmlCopyDoc(c_doc, recursive) 1836 else: 1837 result = tree.xmlCopyDoc(c_doc, 0) 1838 if result is NULL: 1839 raise MemoryError() 1840 __GLOBAL_PARSER_CONTEXT.initDocDict(result) 1841 return result 1842 1843cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL: 1844 u"Recursively copy the document and make c_new_root the new root node." 1845 cdef xmlDoc* result 1846 cdef xmlNode* c_node 1847 result = tree.xmlCopyDoc(c_doc, 0) # non recursive 1848 __GLOBAL_PARSER_CONTEXT.initDocDict(result) 1849 with nogil: 1850 c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive 1851 if c_node is NULL: 1852 raise MemoryError() 1853 tree.xmlDocSetRootElement(result, c_node) 1854 _copyTail(c_new_root.next, c_node) 1855 return result 1856 1857cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL: 1858 u"Recursively copy the element into the document. c_doc is not modified." 1859 cdef xmlNode* c_root 1860 c_root = tree.xmlDocCopyNode(c_node, c_doc, 1) # recursive 1861 if c_root is NULL: 1862 raise MemoryError() 1863 _copyTail(c_node.next, c_root) 1864 return c_root 1865 1866 1867############################################################ 1868## API level helper functions for _Document creation 1869############################################################ 1870 1871cdef _Document _parseDocument(source, _BaseParser parser, base_url): 1872 cdef _Document doc 1873 if _isString(source): 1874 # parse the file directly from the filesystem 1875 doc = _parseDocumentFromURL(_encodeFilename(source), parser) 1876 # fix base URL if requested 1877 if base_url is not None: 1878 base_url = _encodeFilenameUTF8(base_url) 1879 if doc._c_doc.URL is not NULL: 1880 tree.xmlFree(<char*>doc._c_doc.URL) 1881 doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url)) 1882 return doc 1883 1884 if base_url is not None: 1885 url = base_url 1886 else: 1887 url = _getFilenameForFile(source) 1888 1889 if hasattr(source, u'getvalue') and hasattr(source, u'tell'): 1890 # StringIO - reading from start? 1891 if source.tell() == 0: 1892 return _parseMemoryDocument(source.getvalue(), url, parser) 1893 1894 # Support for file-like objects (urlgrabber.urlopen, ...) 1895 if hasattr(source, u'read'): 1896 return _parseFilelikeDocument(source, url, parser) 1897 1898 raise TypeError, f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'" 1899 1900cdef _Document _parseDocumentFromURL(url, _BaseParser parser): 1901 c_doc = _parseDocFromFile(url, parser) 1902 return _documentFactory(c_doc, parser) 1903 1904cdef _Document _parseMemoryDocument(text, url, _BaseParser parser): 1905 if isinstance(text, unicode): 1906 if _hasEncodingDeclaration(text): 1907 raise ValueError( 1908 u"Unicode strings with encoding declaration are not supported. " 1909 u"Please use bytes input or XML fragments without declaration.") 1910 elif not isinstance(text, bytes): 1911 raise ValueError, u"can only parse strings" 1912 c_doc = _parseDoc(text, url, parser) 1913 return _documentFactory(c_doc, parser) 1914 1915cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser): 1916 c_doc = _parseDocFromFilelike(source, url, parser) 1917 return _documentFactory(c_doc, parser) 1918