# SAX-like interfaces

class XMLSyntaxAssertionError(XMLSyntaxError, AssertionError):
    """
    An XMLSyntaxError that additionally inherits from AssertionError for
    ElementTree / backwards compatibility reasons.

    This class may get replaced by a plain XMLSyntaxError in a future version.
    """


# Bit flags describing which callbacks a parser target wants to receive
# (stored in ``_SaxParserTarget._sax_event_filter``).
ctypedef enum _SaxParserEvents:
    SAX_EVENT_START    = 1 << 0
    SAX_EVENT_END      = 1 << 1
    SAX_EVENT_DATA     = 1 << 2
    SAX_EVENT_DOCTYPE  = 1 << 3
    SAX_EVENT_PI       = 1 << 4
    SAX_EVENT_COMMENT  = 1 << 5
    SAX_EVENT_START_NS = 1 << 6
    SAX_EVENT_END_NS   = 1 << 7

# Bit flags describing which parse events get collected into the events
# iterator (stored in ``_SaxParserContext._event_filter``).
ctypedef enum _ParseEventFilter:
    PARSE_EVENT_FILTER_START     = 1 << 0
    PARSE_EVENT_FILTER_END       = 1 << 1
    PARSE_EVENT_FILTER_START_NS  = 1 << 2
    PARSE_EVENT_FILTER_END_NS    = 1 << 3
    PARSE_EVENT_FILTER_COMMENT   = 1 << 4
    PARSE_EVENT_FILTER_PI        = 1 << 5


cdef int _buildParseEventFilter(events) except -1:
    """Translate an iterable of event names into a PARSE_EVENT_FILTER_* bitmask.

    Accepted names are 'start', 'end', 'start-ns', 'end-ns', 'comment'
    and 'pi'.  Any other name raises ValueError.
    """
    cdef int event_filter
    event_filter = 0
    for event in events:
        if event == 'start':
            event_filter |= PARSE_EVENT_FILTER_START
        elif event == 'end':
            event_filter |= PARSE_EVENT_FILTER_END
        elif event == 'start-ns':
            event_filter |= PARSE_EVENT_FILTER_START_NS
        elif event == 'end-ns':
            event_filter |= PARSE_EVENT_FILTER_END_NS
        elif event == 'comment':
            event_filter |= PARSE_EVENT_FILTER_COMMENT
        elif event == 'pi':
            event_filter |= PARSE_EVENT_FILTER_PI
        else:
            raise ValueError, f"invalid event name '{event}'"
    return event_filter


cdef class _SaxParserTarget:
    # Base class of parser targets.  All handlers are no-ops here and the
    # event filter starts out empty; subclasses set the SAX_EVENT_* bits
    # for the callbacks they implement.
    cdef int _sax_event_filter
    def __cinit__(self):
        self._sax_event_filter = 0

    cdef _handleSaxStart(self, tag, attrib, nsmap):
        return None
    cdef _handleSaxEnd(self, tag):
        return None
    cdef int _handleSaxData(self, data) except -1:
        return 0
    cdef int _handleSaxDoctype(self, root_tag, public_id, system_id) except -1:
        return 0
    cdef _handleSaxPi(self, target, data):
        return None
    cdef _handleSaxComment(self, comment):
        return None
    cdef _handleSaxStartNs(self, prefix, uri):
        return None
    cdef _handleSaxEndNs(self, prefix):
        return None


#@cython.final
@cython.internal
@cython.no_gc_clear  # Required because parent class uses it - Cython bug.
cdef class _SaxParserContext(_ParserContext):
    u"""This class maps SAX2 events to parser target events.
    """
    cdef _SaxParserTarget _target
    cdef _BaseParser _parser
    # original libxml2 callbacks, saved before they get wrapped/replaced
    cdef xmlparser.startElementNsSAX2Func _origSaxStart
    cdef xmlparser.endElementNsSAX2Func _origSaxEnd
    cdef xmlparser.startElementSAXFunc _origSaxStartNoNs
    cdef xmlparser.endElementSAXFunc _origSaxEndNoNs
    cdef xmlparser.charactersSAXFunc _origSaxData
    cdef xmlparser.cdataBlockSAXFunc _origSaxCData
    cdef xmlparser.internalSubsetSAXFunc _origSaxDoctype
    cdef xmlparser.commentSAXFunc _origSaxComment
    cdef xmlparser.processingInstructionSAXFunc _origSaxPI
    cdef xmlparser.startDocumentSAXFunc _origSaxStartDocument

    # for event collecting
    cdef int _event_filter
    cdef list _ns_stack
    cdef list _node_stack
    cdef _ParseEventsIterator events_iterator

    # for iterparse
    cdef _Element _root
    cdef _MultiTagMatcher _matcher

    def __cinit__(self, _BaseParser parser):
        self._ns_stack = []
        self._node_stack = []
        self._parser = parser
        self.events_iterator = _ParseEventsIterator()

    cdef void _setSaxParserTarget(self, _SaxParserTarget target):
        self._target = target

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
        # A parser target takes precedence over plain event collection.
        _ParserContext._initParserContext(self, c_ctxt)
        if self._target is not None:
            self._connectTarget(c_ctxt)
        elif self._event_filter:
            self._connectEvents(c_ctxt)

    cdef void _connectTarget(self, xmlparser.xmlParserCtxt* c_ctxt):
        """Wrap original SAX2 callbacks to call into parser target.
        """
        sax = c_ctxt.sax
        self._origSaxStart = sax.startElementNs = NULL
        self._origSaxStartNoNs = sax.startElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_START |
                                             SAX_EVENT_START_NS |
                                             SAX_EVENT_END_NS):
            # intercept => overwrite orig callback
            # FIXME: also intercept on when collecting END events
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.startElementNs = _handleSaxTargetStart
            if self._target._sax_event_filter & SAX_EVENT_START:
                sax.startElement = _handleSaxTargetStartNoNs

        self._origSaxEnd = sax.endElementNs = NULL
        self._origSaxEndNoNs = sax.endElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_END |
                                             SAX_EVENT_END_NS):
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.endElementNs = _handleSaxEnd
            if self._target._sax_event_filter & SAX_EVENT_END:
                sax.endElement = _handleSaxEndNoNs

        self._origSaxData = sax.characters = sax.cdataBlock = NULL
        if self._target._sax_event_filter & SAX_EVENT_DATA:
            sax.characters = sax.cdataBlock = _handleSaxData

        # doctype propagation is always required for entity replacement
        self._origSaxDoctype = sax.internalSubset
        if self._target._sax_event_filter & SAX_EVENT_DOCTYPE:
            sax.internalSubset = _handleSaxTargetDoctype

        self._origSaxPI = sax.processingInstruction = NULL
        if self._target._sax_event_filter & SAX_EVENT_PI:
            sax.processingInstruction = _handleSaxTargetPI

        self._origSaxComment = sax.comment = NULL
        if self._target._sax_event_filter & SAX_EVENT_COMMENT:
            sax.comment = _handleSaxTargetComment

        # enforce entity replacement
        sax.reference = NULL
        c_ctxt.replaceEntities = 1

    cdef void _connectEvents(self, xmlparser.xmlParserCtxt* c_ctxt):
        """Wrap original SAX2 callbacks to collect parse events without parser target.
        """
        sax = c_ctxt.sax
        self._origSaxStartDocument = sax.startDocument
        sax.startDocument = _handleSaxStartDocument

        # only override "start" event handler if needed
        self._origSaxStart = sax.startElementNs
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_START_NS |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.startElementNs = <xmlparser.startElementNsSAX2Func>_handleSaxStart

        self._origSaxStartNoNs = sax.startElement
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END):
            sax.startElement = <xmlparser.startElementSAXFunc>_handleSaxStartNoNs

        # only override "end" event handler if needed
        self._origSaxEnd = sax.endElementNs
        if self._event_filter == 0 or \
               self._event_filter & (PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.endElementNs = <xmlparser.endElementNsSAX2Func>_handleSaxEnd

        self._origSaxEndNoNs = sax.endElement
        if self._event_filter == 0 or \
               self._event_filter & PARSE_EVENT_FILTER_END:
            sax.endElement = <xmlparser.endElementSAXFunc>_handleSaxEndNoNs

        self._origSaxComment = sax.comment
        if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
            sax.comment = <xmlparser.commentSAXFunc>_handleSaxComment

        self._origSaxPI = sax.processingInstruction
        if self._event_filter & PARSE_EVENT_FILTER_PI:
            sax.processingInstruction = <xmlparser.processingInstructionSAXFunc>_handleSaxPIEvent

    cdef _setEventFilter(self, events, tag):
        # A tag matcher is only built when events are collected at all and
        # the tag selection is not the catch-all (None or '*').
        self._event_filter = _buildParseEventFilter(events)
        if not self._event_filter or tag is None or tag == '*':
            self._matcher = None
        else:
            self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)

    cdef int startDocument(self, xmlDoc* c_doc) except -1:
        try:
            self._doc = _documentFactory(c_doc, self._parser)
        finally:
            self._parser = None  # clear circular reference ASAP
        if self._matcher is not None:
            self._matcher.cacheTags(self._doc, True)  # force entry in libxml2 dict
        return 0

    cdef int pushEvent(self, event, xmlNode* c_node) except -1:
        # Remember the root element once it exists, then queue
        # (event, proxy-for-c_node) on the events iterator.
        cdef _Element root
        if self._root is None:
            root = self._doc.getroot()
            if root is not None and root._c_node.type == tree.XML_ELEMENT_NODE:
                self._root = root
        node = _elementFactory(self._doc, c_node)
        self.events_iterator._events.append( (event, node) )
        return 0

    cdef int flushEvents(self) except -1:
        # Emit the pending 'end' / 'end-ns' events for everything that is
        # still on the node and namespace stacks.
        events = self.events_iterator._events
        while self._node_stack:
            events.append( ('end', self._node_stack.pop()) )
            _pushSaxNsEndEvents(self)
        while self._ns_stack:
            _pushSaxNsEndEvents(self)
        return 0

    cdef void _handleSaxException(self, xmlparser.xmlParserCtxt* c_ctxt):
        # Called from the "except:" branch of the SAX callbacks: flag an
        # error on the libxml2 context (unless one is already set), stop the
        # parser and keep the pending Python exception via _store_raised().
        if c_ctxt.errNo == xmlerror.XML_ERR_OK:
            c_ctxt.errNo = xmlerror.XML_ERR_INTERNAL_ERROR
        # stop parsing immediately
        c_ctxt.wellFormed = 0
        c_ctxt.disableSAX = 1
        c_ctxt.instate = xmlparser.XML_PARSER_EOF
        self._store_raised()


@cython.final
@cython.internal
cdef class _ParseEventsIterator:
    """A reusable parse events iterator"""
    cdef list _events
    cdef int _event_index

    def __cinit__(self):
        self._events = []
        self._event_index = 0

    def __iter__(self):
        return self

    def __next__(self):
        cdef int event_index = self._event_index
        events = self._events
        # Consumed events are dropped lazily: once 2**10 have been read or
        # at least half of the list has been consumed.
        if event_index >= 2**10 or event_index * 2 >= len(events):
            if event_index:
                # clean up from time to time
                del events[:event_index]
                self._event_index = event_index = 0
            if event_index >= len(events):
                raise StopIteration
        item = events[event_index]
        self._event_index = event_index + 1
        return item


cdef list _build_prefix_uri_list(_SaxParserContext context, int c_nb_namespaces,
                                 const_xmlChar** c_namespaces):
    "Build [(prefix, uri)] list of declared namespaces."
    # c_namespaces is read as consecutive (prefix, URI) pointer pairs.
    cdef int i
    namespaces = []
    for i in xrange(c_nb_namespaces):
        namespaces.append((funicodeOrEmpty(c_namespaces[0]), funicode(c_namespaces[1])))
        c_namespaces += 2
    return namespaces


# SAX2 start-element callback used when collecting events without a target:
# queues 'start-ns' events, lets the original callback build the node and
# then queues the 'start' event / records the node for its 'end' event.
cdef void _handleSaxStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) with gil:
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    cdef int event_filter = context._event_filter
    try:
        if (c_nb_namespaces and
                event_filter & (PARSE_EVENT_FILTER_START_NS |
                                PARSE_EVENT_FILTER_END_NS)):
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)
            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))
        else:
            declared_namespaces = None

        context._origSaxStart(c_ctxt, c_localname, c_prefix, c_namespace,
                              c_nb_namespaces, c_namespaces, c_nb_attributes,
                              c_nb_defaulted, c_attributes)
        if c_ctxt.html:
            _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)

        # one stack entry per element (possibly None) so that the end
        # handler can pop unconditionally
        if event_filter & PARSE_EVENT_FILTER_END_NS:
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace, c_localname, None)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


# SAX2 start-element callback used when parsing into a parser target:
# forwards namespace declarations and the start tag (with its attributes)
# to the target and queues the requested parse events.
cdef void _handleSaxTargetStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) with gil:
    cdef int i
    cdef size_t c_len
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private

    cdef int event_filter = context._event_filter
    cdef int sax_event_filter = context._target._sax_event_filter
    try:
        if c_nb_namespaces:
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)

            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))

            if sax_event_filter & SAX_EVENT_START_NS:
                for prefix, uri in declared_namespaces:
                    context._target._handleSaxStartNs(prefix, uri)
                #if not context._target._sax_event_filter & SAX_EVENT_START:
                #    # *Only* collecting start-ns events.
                #    return
        else:
            declared_namespaces = None

        if sax_event_filter & SAX_EVENT_START:
            if c_nb_defaulted > 0:
                # only add default attributes if we asked for them
                if c_ctxt.loadsubset & xmlparser.XML_COMPLETE_ATTRS == 0:
                    c_nb_attributes -= c_nb_defaulted
            if c_nb_attributes == 0:
                attrib = IMMUTABLE_EMPTY_MAPPING
            else:
                attrib = {}
                # Each attribute occupies five slots; as used below:
                # [0] local name, [2] namespace, [3]/[4] value start/end.
                for i in xrange(c_nb_attributes):
                    name = _namespacedNameFromNsName(
                        c_attributes[2], c_attributes[0])
                    if c_attributes[3] is NULL:
                        value = ''
                    else:
                        c_len = c_attributes[4] - c_attributes[3]
                        value = c_attributes[3][:c_len].decode('utf8')
                    attrib[name] = value
                    c_attributes += 5

            nsmap = dict(declared_namespaces) if c_nb_namespaces else IMMUTABLE_EMPTY_MAPPING

            element = _callTargetSaxStart(
                context, c_ctxt,
                _namespacedNameFromNsName(c_namespace, c_localname),
                attrib, nsmap)
        else:
            element = None

        if (event_filter & PARSE_EVENT_FILTER_END_NS or
                sax_event_filter & SAX_EVENT_END_NS):
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace,
                               c_localname, element)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


# Namespace-less start-element callback for event collection without a target.
cdef void _handleSaxStartNoNs(void* ctxt, const_xmlChar* c_name,
                              const_xmlChar** c_attributes) with gil:
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._origSaxStartNoNs(c_ctxt, c_name, c_attributes)
        if c_ctxt.html:
            _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)
        if context._event_filter & (PARSE_EVENT_FILTER_END |
                                    PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, NULL, c_name, None)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


# Namespace-less start-element callback when parsing into a parser target.
cdef void _handleSaxTargetStartNoNs(void* ctxt, const_xmlChar* c_name,
                                    const_xmlChar** c_attributes) with gil:
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if c_attributes is NULL:
            attrib = IMMUTABLE_EMPTY_MAPPING
        else:
            attrib = {}
            # NULL-terminated sequence of (name, value) pointer pairs
            while c_attributes[0] is not NULL:
                name = funicode(c_attributes[0])
                attrib[name] = funicodeOrEmpty(c_attributes[1])
                c_attributes += 2
        element = _callTargetSaxStart(
            context, c_ctxt, funicode(c_name),
            attrib, IMMUTABLE_EMPTY_MAPPING)
        if context._event_filter & (PARSE_EVENT_FILTER_END |
                                    PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, NULL, c_name, element)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef _callTargetSaxStart(_SaxParserContext context,
                         xmlparser.xmlParserCtxt* c_ctxt,
                         tag, attrib, nsmap):
    # Call the target's start handler and, if it returned an _Element,
    # record the current input line on its C node (capped at 65535).
    element = context._target._handleSaxStart(tag, attrib, nsmap)
    if element is not None and c_ctxt.input is not NULL:
        if isinstance(element, _Element):
            (<_Element>element)._c_node.line = (
                <unsigned short>c_ctxt.input.line
                if c_ctxt.input.line < 65535 else 65535)
    return element


cdef int _pushSaxStartEvent(_SaxParserContext context,
                            xmlparser.xmlParserCtxt* c_ctxt,
                            const_xmlChar* c_href,
                            const_xmlChar* c_name, node) except -1:
    # Queue a 'start' event for a matching tag and, when no target is in
    # use, remember the node so that the 'end' event can report it.
    if (context._matcher is None or
            context._matcher.matchesNsTag(c_href, c_name)):
        if node is None and context._target is None:
            assert context._doc is not None
            node = _elementFactory(context._doc, c_ctxt.node)
        if context._event_filter & PARSE_EVENT_FILTER_START:
            context.events_iterator._events.append(('start', node))
        if (context._target is None and
                context._event_filter & PARSE_EVENT_FILTER_END):
            context._node_stack.append(node)
    return 0


# SAX2 end-element callback (shared by target parsing and event collection).
cdef void _handleSaxEnd(void* ctxt, const_xmlChar* c_localname,
                        const_xmlChar* c_prefix,
                        const_xmlChar* c_namespace) with gil:
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if context._target is not None:
            if context._target._sax_event_filter & SAX_EVENT_END:
                node = context._target._handleSaxEnd(
                    _namespacedNameFromNsName(c_namespace, c_localname))
            else:
                node = None
        else:
            context._origSaxEnd(c_ctxt, c_localname, c_prefix, c_namespace)
            node = None
        _pushSaxEndEvent(context, c_namespace, c_localname, node)
        _pushSaxNsEndEvents(context)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


# Namespace-less end-element callback (target parsing and event collection).
cdef void _handleSaxEndNoNs(void* ctxt, const_xmlChar* c_name) with gil:
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if context._target is not None:
            node = context._target._handleSaxEnd(funicode(c_name))
        else:
            context._origSaxEndNoNs(c_ctxt, c_name)
            node = None
        _pushSaxEndEvent(context, NULL, c_name, node)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef int _pushSaxNsEndEvents(_SaxParserContext context) except -1:
    # Pop the namespace declarations of the element that just ended and
    # report them in reverse order to the target and/or the events iterator.
    cdef bint build_events = context._event_filter & PARSE_EVENT_FILTER_END_NS
    cdef bint call_target = (
        context._target is not None
        and context._target._sax_event_filter & SAX_EVENT_END_NS)
    if not build_events and not call_target:
        return 0

    cdef list declared_namespaces = context._ns_stack.pop()
    if declared_namespaces is None:
        return 0

    cdef tuple prefix_uri
    for prefix_uri in reversed(declared_namespaces):
        if call_target:
            context._target._handleSaxEndNs(prefix_uri[0])
        if build_events:
            context.events_iterator._events.append(('end-ns', None))

    return 0


cdef int _pushSaxEndEvent(_SaxParserContext context,
                          const_xmlChar* c_href,
                          const_xmlChar* c_name, node) except -1:
    # Queue an 'end' event for a matching tag.  Without a target, the node
    # is the one remembered by _pushSaxStartEvent().
    if context._event_filter & PARSE_EVENT_FILTER_END:
        if (context._matcher is None or
                context._matcher.matchesNsTag(c_href, c_name)):
            if context._target is None:
                node = context._node_stack.pop()
            context.events_iterator._events.append(('end', node))
    return 0


cdef void _handleSaxData(void* ctxt, const_xmlChar* c_data, int data_len) with gil:
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._target._handleSaxData(
            c_data[:data_len].decode('utf8'))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxTargetDoctype(void* ctxt, const_xmlChar* c_name,
                                  const_xmlChar* c_public,
                                  const_xmlChar* c_system) with gil:
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._target._handleSaxDoctype(
            funicodeOrNone(c_name),
            funicodeOrNone(c_public),
            funicodeOrNone(c_system))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxStartDocument(void* ctxt) with gil:
    # Runs the original callback first, then hands the resulting document
    # (c_ctxt.myDoc) to the context.
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    context._origSaxStartDocument(ctxt)
    c_doc = c_ctxt.myDoc
    try:
        context.startDocument(c_doc)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxTargetPI(void* ctxt, const_xmlChar* c_target,
                             const_xmlChar* c_data) with gil:
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        pi = context._target._handleSaxPi(
            funicodeOrNone(c_target),
            funicodeOrEmpty(c_data))
        if context._event_filter & PARSE_EVENT_FILTER_PI:
            context.events_iterator._events.append(('pi', pi))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxPIEvent(void* ctxt, const_xmlChar* target,
                            const_xmlChar* data) with gil:
    # can only be called when collecting pi events
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    context._origSaxPI(ctxt, target, data)
    c_node = _findLastEventNode(c_ctxt)
    if c_node is NULL:
        return
    try:
        context.pushEvent('pi', c_node)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxTargetComment(void* ctxt, const_xmlChar* c_data) with gil:
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        comment = context._target._handleSaxComment(funicodeOrEmpty(c_data))
        if context._event_filter & PARSE_EVENT_FILTER_COMMENT:
            context.events_iterator._events.append(('comment', comment))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxComment(void* ctxt, const_xmlChar* text) with gil:
    # can only be called when collecting comment events
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    context._origSaxComment(ctxt, text)
    c_node = _findLastEventNode(c_ctxt)
    if c_node is NULL:
        return
    try:
        context.pushEvent('comment', c_node)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef inline xmlNode* _findLastEventNode(xmlparser.xmlParserCtxt* c_ctxt):
    # this mimics what libxml2 creates for comments/PIs
    if c_ctxt.inSubset == 1:
        return c_ctxt.myDoc.intSubset.last
    elif c_ctxt.inSubset == 2:
        return c_ctxt.myDoc.extSubset.last
    elif c_ctxt.node is NULL:
        return c_ctxt.myDoc.last
    elif c_ctxt.node.type == tree.XML_ELEMENT_NODE:
        return c_ctxt.node.last
    else:
        return c_ctxt.node.next

############################################################
## ET compatible XML tree builder
############################################################

cdef class TreeBuilder(_SaxParserTarget):
    u"""TreeBuilder(self, element_factory=None, parser=None,
                    comment_factory=None, pi_factory=None,
                    insert_comments=True, insert_pis=True)

    Parser target that builds a tree from parse event callbacks.

    The factory arguments can be used to influence the creation of
    elements, comments and processing instructions.

    By default, comments and processing instructions are inserted into
    the tree, but they can be ignored by passing the respective flags.

    The final tree is returned by the ``close()`` method.
    """
    cdef _BaseParser _parser
    cdef object _factory
    cdef object _comment_factory
    cdef object _pi_factory
    cdef list _data
    cdef list _element_stack
    cdef object _element_stack_pop
    cdef _Element _last # may be None
    cdef bint _in_tail
    cdef bint _insert_comments
    cdef bint _insert_pis

    def __init__(self, *, element_factory=None, parser=None,
                 comment_factory=None, pi_factory=None,
                 bint insert_comments=True, bint insert_pis=True):
        # callbacks that this target wants to receive from the parser
        self._sax_event_filter = \
            SAX_EVENT_START | SAX_EVENT_END | SAX_EVENT_DATA | \
            SAX_EVENT_PI | SAX_EVENT_COMMENT
        self._data = [] # data collector
        self._element_stack = [] # element stack
        self._element_stack_pop = self._element_stack.pop
        self._last = None # last element
        self._in_tail = 0 # true if we're after an end tag
        self._factory = element_factory
        self._comment_factory = comment_factory if comment_factory is not None else Comment
        self._pi_factory = pi_factory if pi_factory is not None else ProcessingInstruction
        self._insert_comments = insert_comments
        self._insert_pis = insert_pis
        self._parser = parser

    @cython.final
    cdef int _flush(self) except -1:
        # Attach the collected character data to the last node: as its tail
        # if that node was already closed, as its text otherwise.  Data that
        # arrives before any node exists is dropped.
        if self._data:
            if self._last is not None:
                text = u"".join(self._data)
                if self._in_tail:
                    assert self._last.tail is None, u"internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, u"internal error (text)"
                    self._last.text = text
            del self._data[:]
        return 0

    # internal SAX event handlers

    @cython.final
    cdef _handleSaxStart(self, tag, attrib, nsmap):
        # Create the new element (through the user factory if one was given),
        # link it below the currently open element and make it the open one.
        self._flush()
        if self._factory is not None:
            self._last = self._factory(tag, attrib)
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
        elif self._element_stack:
            self._last = _makeSubElement(
                self._element_stack[-1], tag, None, None, attrib, nsmap, None)
        else:
            self._last = _makeElement(
                tag, NULL, None, self._parser, None, None, attrib, nsmap, None)
        self._element_stack.append(self._last)
        self._in_tail = 0
        return self._last

    @cython.final
    cdef _handleSaxEnd(self, tag):
        # Close the currently open element; following data becomes its tail.
        self._flush()
        self._last = self._element_stack_pop()
        self._in_tail = 1
        return self._last

    @cython.final
    cdef int _handleSaxData(self, data) except -1:
        # Text is only buffered here and assigned by the next _flush().
        self._data.append(data)
        return 0

    @cython.final
    cdef _handleSaxPi(self, target, data):
        # The PI is always created, but only linked into the tree when
        # insertion of PIs is enabled.
        elem = self._pi_factory(target, data)
        if self._insert_pis:
            self._flush()
            self._last = elem
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
            self._in_tail = 1
        # Return the PI itself, also when it was not inserted.  Returning
        # self._last would hand back an unrelated, previously built node
        # (or None) in that case.
        return elem

    @cython.final
    cdef _handleSaxComment(self, comment):
        # The comment is always created, but only linked into the tree when
        # insertion of comments is enabled.
        elem = self._comment_factory(comment)
        if self._insert_comments:
            self._flush()
            self._last = elem
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
            self._in_tail = 1
        return elem

    # Python level event handlers

    def close(self):
        u"""close(self)

        Flushes the builder buffers, and returns the toplevel document
        element.  Raises XMLSyntaxError on inconsistencies.
        """
        if self._element_stack:
            raise XMLSyntaxAssertionError("missing end tags")
        # TODO: this does not necessarily seem like an error case.  Why not just return None?
        if self._last is None:
            raise XMLSyntaxAssertionError("missing toplevel element")
        return self._last

    def data(self, data):
        u"""data(self, data)

        Adds text to the current element.  The value should be either an
        8-bit string containing ASCII text, or a Unicode string.
        """
        self._handleSaxData(data)

    def start(self, tag, attrs, nsmap=None):
        u"""start(self, tag, attrs, nsmap=None)

        Opens a new element.
        """
        if nsmap is None:
            nsmap = IMMUTABLE_EMPTY_MAPPING
        return self._handleSaxStart(tag, attrs, nsmap)

    def end(self, tag):
        u"""end(self, tag)

        Closes the current element.
        """
        element = self._handleSaxEnd(tag)
        assert self._last.tag == tag,\
               f"end tag mismatch (expected {self._last.tag}, got {tag})"
        return element

    def pi(self, target, data=None):
        u"""pi(self, target, data=None)

        Creates a processing instruction using the factory, appends it
        (unless disabled) and returns it.
        """
        return self._handleSaxPi(target, data)

    def comment(self, comment):
        u"""comment(self, comment)

        Creates a comment using the factory, appends it (unless disabled)
        and returns it.
        """
        return self._handleSaxComment(comment)