1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of libfolia 7 8 libfolia is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 libfolia is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ticcutils/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 */ 26 #include <cassert> 27 #include <iostream> 28 #include <iomanip> 29 #include <fstream> 30 #include <cstring> 31 #include <cstdio> 32 #include <string> 33 #include <stack> 34 #include <stdexcept> 35 #include <algorithm> 36 #include "ticcutils/PrettyPrint.h" 37 #include "ticcutils/FileUtils.h" 38 #include "ticcutils/XMLtools.h" 39 #include "ticcutils/zipper.h" 40 #include "libfolia/folia.h" 41 42 using namespace std; 43 44 /// define a static default LogStream 45 TiCC::LogStream DBG_CERR(cerr,"folia-engine:"); 46 47 /// direct Debugging info to the internal file, if present, or to the default stream 48 #define DBG *TiCC::Log((_dbg_file?_dbg_file:&DBG_CERR)) 49 50 namespace folia { 51 52 using TiCC::operator<<; 53 xml_tree(int d,int i,const std::string & t,const std::string & c)54 xml_tree::xml_tree( int d, 55 int i, 56 const std::string& t, 57 const std::string& c ): 58 /// create an xml_tree element with the given parameters 59 depth(d), 60 index(i), 61 tag(t), 62 textclass(c), 63 parent(0), 64 link(0), 65 next(0) 66 {} 67 ~xml_tree()68 xml_tree::~xml_tree() { 69 /// delete an xml_tree 70 if ( link ){ 71 delete link; 72 } 73 if ( next ){ 74 delete next; 75 } 76 } 77 print(ostream & os,const xml_tree * tree)78 void print( ostream& os, const xml_tree* tree ){ 79 //! pretty print an xml_tree 80 /*! 81 \param os the output stream 82 \param tree the tree 83 */ 84 const xml_tree *rec_pnt = tree; 85 while ( rec_pnt ){ 86 os << setw(10) << rec_pnt->index << string( rec_pnt->depth, ' ' ) 87 << rec_pnt->tag; 88 if ( rec_pnt->textclass.empty() ){ 89 os << endl; 90 } 91 else { 92 os << " (" << rec_pnt->textclass << ")" << endl; 93 } 94 print( os, rec_pnt->link ); 95 rec_pnt = rec_pnt->next; 96 } 97 } 98 operator <<(ostream & os,const xml_tree * tree)99 ostream& operator<<( ostream& os, const xml_tree* tree ){ 100 /// print an xml_tree 101 os << endl; 102 print( os, tree ); 103 return os; 104 } 105 Engine()106 Engine::Engine(): 107 /// default constructor 108 _reader(0), 109 _out_doc(0), 110 _root_node(0), 111 _external_node(0), 112 _current_node(0), 113 _last_added(0), 114 _last_depth(2), 115 _doc_type( TEXT ), 116 _dbg_file(0), 117 _os(0), 118 _ok(false), 119 _done(false), 120 _header_done(false), 121 _finished(false), 122 _debug(false) 123 { 124 } 125 ~Engine()126 Engine::~Engine(){ 127 /// destructor 128 xmlFreeTextReader( _reader ); 129 delete _out_doc; 130 delete _os; 131 } 132 doc(bool disconnect)133 Document *Engine::doc( bool disconnect ){ 134 /// returns the associated FoLiA document. 135 /*! 136 \param disconnect When true, handle control over to the caller. 137 The caller has to delete it then to avoid memory leaks 138 139 */ 140 Document *result = _out_doc; 141 if ( disconnect ){ 142 _out_doc = 0; 143 } 144 return result; 145 } 146 set_debug(bool d)147 bool Engine::set_debug( bool d ) { 148 /// switch debugging on/off depending on parameter 'd' 149 /*! 150 \param d when true switch debugging to ON, otherwise OFF 151 152 When debugging is switched ON and NO debug file is associated yet, 153 it is created. 154 */ 155 bool res = _debug; 156 if ( d ){ 157 if ( !_dbg_file ){ 158 _dbg_file 159 = new TiCC::LogStream( cerr, "folia-engine", StampMessage ); 160 } 161 } 162 _debug = d; 163 return res; 164 } 165 set_dbg_stream(TiCC::LogStream * ls)166 void Engine::set_dbg_stream( TiCC::LogStream *ls ){ 167 /// switch debugging to another LogStream 168 if ( _dbg_file ){ 169 delete _dbg_file; 170 } 171 _dbg_file = ls; 172 } 173 un_declare(const AnnotationType & at,const string & setname)174 void Engine::un_declare( const AnnotationType& at, 175 const string& setname ){ 176 /// remove the annotation declaration for the given type and set 177 /*! 178 \param at the AnnotationType 179 \param setname the set so remove 180 181 \note an AnntotationType can have several set-names assigned to it. 182 When setname is empty ("") ALL set-names are removed 183 */ 184 if ( !ok() ){ 185 throw logic_error( "declare() called on invalid engine!" ); 186 } 187 else if ( _header_done ){ 188 throw logic_error( "declare() called on already (partially) saved document!" ); 189 } 190 else { 191 _out_doc->un_declare( at, setname ); 192 } 193 } 194 declare(const AnnotationType & at,const string & setname,const string & args)195 void Engine::declare( const AnnotationType& at, 196 const string& setname, 197 const string& args ) { 198 /// declare a set for a given annotation type 199 /*! 200 \param at the AnnotationType 201 \param setname The set-name to use 202 \param args additional arguments in string annotation. Can be used to add 203 extra arguments like a processor name or an annotator 204 */ 205 KWargs kwargs( args ); 206 declare( at, setname, kwargs ); 207 } 208 declare(const AnnotationType & at,const string & setname,const KWargs & args)209 void Engine::declare( const AnnotationType& at, 210 const string& setname, 211 const KWargs& args ) { 212 /// declare a set for a given annotation type 213 /*! 214 \param at the AnnotationType 215 \param setname The set-name to use 216 \param args additional arguments as a KWargs attribute-value list. 217 can be used to add extra arguments like a processor name or 218 an annotator 219 */ 220 if ( !ok() ){ 221 throw logic_error( "declare() called on invalid engine!" ); 222 } 223 else if ( _header_done ){ 224 throw logic_error( "declare() called on already (partially) saved document!" ); 225 } 226 else { 227 _out_doc->declare( at, setname, args ); 228 } 229 } 230 is_declared(const AnnotationType & at,const string & setname) const231 bool Engine::is_declared( const AnnotationType& at, 232 const string& setname ) const { 233 /// check if an annotation for the provided type and setname is present 234 /*! 235 \param at the AnnotationType 236 \param setname the set-name to test 237 \return true if declared, false otherwise. 238 */ 239 if ( !ok() ){ 240 throw logic_error( "is_declared() called on invalid engine!" ); 241 } 242 else { 243 return _out_doc->declared( at, setname ); 244 } 245 } 246 is_declared(const AnnotationType & at,const string & setname,const string & annotator,const AnnotatorType & annotator_type,const string & processor) const247 bool Engine::is_declared( const AnnotationType& at, 248 const string& setname, 249 const string& annotator, 250 const AnnotatorType& annotator_type, 251 const string& processor ) const { 252 /// check if an annotation for the provided type and setname is present 253 /*! 254 \param at the AnnotationType 255 \param setname the set-name to test 256 \param annotator the name of the annotator to test 257 \param annotator_type the AnnotatorType to test 258 \param processor the desired processor 259 \return true if declared, false otherwise. 260 */ 261 if ( !ok() ){ 262 throw logic_error( "is_declared() called on invalid engine!" ); 263 } 264 else { 265 return _out_doc->declared( at, setname, annotator, annotator_type, processor ); 266 } 267 } 268 is_declared(const AnnotationType & at,const string & setname,const string & annotator,const string & annotator_type,const string & processor) const269 bool Engine::is_declared( const AnnotationType& at, 270 const string& setname, 271 const string& annotator, 272 const string& annotator_type, 273 const string& processor ) const { 274 /// check if an annotation for the provided type and setname is present 275 /*! 276 \param at the AnnotationType 277 \param setname the set-name to test 278 \param annotator the name of the annotator to test 279 \param annotator_type the AnnotatorType to test, encoded as a string 280 \param processor the desired processor 281 \return true if declared, false otherwise. 282 */ 283 AnnotatorType ant = UNDEFINED; 284 try { 285 ant = TiCC::stringTo<AnnotatorType>(annotator_type); 286 } 287 catch (...){ 288 throw logic_error( annotator_type + " is NOT a valid annotator type" ); 289 } 290 return is_declared( at, setname, annotator, ant, processor ); 291 } 292 set_metadata(const std::string & att,const std::string & val)293 void Engine::set_metadata( const std::string& att, 294 const std::string& val){ 295 /// set a metadata value in the associated document 296 /*! 297 \param att the attribute to set 298 \param val the value of the attribute 299 */ 300 if ( !ok() ){ 301 throw logic_error( "set_metadata() called on invalid engine!" ); 302 } 303 else { 304 return _out_doc->set_metadata( att, val ); 305 } 306 } 307 extract_style(const string & value)308 pair<string,string> extract_style( const string& value ){ 309 /// parse a string to extract an xml style-sheet value 310 /*! 311 \param value the line to parse 312 \return a pait of strings containing the type and the href values 313 */ 314 string type; 315 string href; 316 vector<string> v = TiCC::split( value ); 317 if ( v.size() == 2 ){ 318 vector<string> w = TiCC::split_at( v[0], "=" ); 319 if ( w.size() == 2 && w[0] == "type" ){ 320 type = w[1].substr(1,w[1].length()-2); 321 } 322 w = TiCC::split_at( v[1], "=" ); 323 if ( w.size() == 2 && w[0] == "href" ){ 324 href = w[1].substr(1,w[1].length()-2); 325 } 326 return make_pair(type,href); 327 } 328 else { 329 throw XmlError( "couldn't parse xml-style-sheet line: " + value ); 330 } 331 } 332 get_attributes(xmlTextReader * tr)333 KWargs get_attributes( xmlTextReader *tr ){ 334 /// extract a KWargs attribute/value list from the TextReader location 335 /*! 336 \param tr the xmlTextReader pointer 337 \return a KWargs list of all attribute/value pairs found 338 */ 339 KWargs result; 340 if ( xmlTextReaderHasAttributes(tr) ){ 341 xmlTextReaderMoveToFirstAttribute(tr); 342 do { 343 string att = (const char*)xmlTextReaderConstName(tr); 344 string val = (const char*)xmlTextReaderConstValue(tr); 345 result[att] = val; 346 } 347 while ( xmlTextReaderMoveToNextAttribute(tr) ); 348 } 349 return result; 350 } 351 create_text_reader(const string & buf)352 xmlTextReader *create_text_reader( const string& buf ){ 353 /// create a new xmlTextRead on a buffer 354 /*! 355 \param buf the input buffer. 356 The buffer may contain a complete (FoLiA-) XML document as a string 357 OR a filename denoting such a document, which may be .bz2 and .gz 358 encoded 359 */ 360 if ( TiCC::match_front( buf, "<?xml " ) ){ 361 return xmlReaderForMemory( buf.c_str(), buf.size(), 362 "input_buffer", 0, XML_PARSER_OPTIONS ); 363 } 364 else if ( TiCC::match_back( buf, ".bz2" ) ){ 365 string buffer = TiCC::bz2ReadFile( buf ); 366 if ( buffer.empty() ){ 367 throw runtime_error( "folia::Engine(), empty file? (" + buf 368 + ")" ); 369 } 370 // 371 // next step fails for unclear reasons 372 // so we use an intermediate file. Which works, but is clumsy 373 // 374 // return xmlReaderForMemory( buffer.c_str(), buffer.size()+1, 375 // buf.c_str(), 0, XML_PARSER_OPTIONS ); 376 TiCC::tmp_stream ts( "folia" ); 377 string tmp_file = ts.tmp_name(); 378 ofstream& os = ts.os(); 379 os << buffer << endl; 380 ts.close(); 381 xmlTextReader *result 382 = xmlReaderForFile( tmp_file.c_str(), 0, XML_PARSER_OPTIONS ); 383 return result; 384 } 385 // libxml2 can handle .xml and .xml.gz 386 return xmlReaderForFile( buf.c_str(), 0, XML_PARSER_OPTIONS ); 387 } 388 add_text(int depth)389 void Engine::add_text( int depth ){ 390 /// when parsing, add a new XmlText node 391 /*! 392 \param depth the depth (location) in the tree where to add 393 */ 394 string value = (const char*)xmlTextReaderConstValue(_reader); 395 string trimmed = TiCC::trim(value); 396 if ( !trimmed.empty() ){ 397 throw XmlError( "spurious text " + trimmed + " found." ); 398 } 399 if ( _debug ){ 400 DBG << "add_text(" << value << ") depth=" << depth << endl; 401 } 402 XmlText *txt = new XmlText(); 403 txt->setvalue( value ); 404 append_node( txt, depth ); 405 } 406 add_comment(int depth)407 void Engine::add_comment( int depth ){ 408 /// when parsing, add a new _XmlComment node 409 /*! 410 \param depth the depth (location) in the tree where to add 411 */ 412 if ( _debug ){ 413 DBG << "add_comment " << endl; 414 } 415 string tag = "_XmlComment"; 416 FoliaElement *t = AbstractElement::createElement( tag, _out_doc ); 417 append_node( t, depth ); 418 } 419 add_default_node(int depth)420 void Engine::add_default_node( int depth ){ 421 /// when debugging, output a message. Does nothing else 422 if ( _debug ){ 423 string local_name = (const char*)xmlTextReaderConstLocalName(_reader); 424 int type = xmlTextReaderNodeType(_reader); 425 DBG << "add_node " << type << " name=" << local_name 426 << " depth " << _last_depth << " ==> " << depth << endl; 427 } 428 } 429 check_empty(xmlNode * node)430 void check_empty( xmlNode *node ){ 431 /// assure that node == 0 OR just contains whitespace or comment 432 /*! 433 \param node the node to check 434 will throw when node is anything other than xml-comment or whitespace 435 */ 436 if ( node ){ 437 if ( node->type == XML_COMMENT_NODE ){ 438 check_empty( node->next ); 439 } 440 else if ( node->type == XML_TEXT_NODE ){ 441 string txt = TextValue(node); 442 txt = TiCC::trim(txt); 443 if ( !txt.empty() ){ 444 string tg = "<" + TiCC::Name(node->prev) + ">"; 445 throw XmlError( "found extra text '" + txt + "' after element " 446 + tg + ", NOT allowed there." ); 447 } 448 } 449 else { 450 string tg = "<" + TiCC::Name(node->prev) + ">"; 451 throw XmlError( "found unexpected node '" + TiCC::Name(node) 452 + "' after element " + tg + ", NOT allowed there." ); 453 } 454 } 455 } 456 init_doc(const string & file_name,const string & out_name)457 bool Engine::init_doc( const string& file_name, 458 const string& out_name ){ 459 /// init an associated document for this Engine 460 /*! 461 \param file_name the input file to use for parsing 462 \param out_name when not empty, add an output-file with this name 463 464 Initializing includes parsing the Document's metadata, style-sheet 465 upto and including the top \<text or \<speech> node 466 */ 467 _ok = false; 468 _out_doc = new Document(); 469 _out_doc->set_incremental( true ); 470 if ( !out_name.empty() ){ 471 _os = new ofstream( out_name ); 472 _out_name = out_name; 473 } 474 _out_doc->_source_filename = file_name; 475 _reader = create_text_reader( file_name ); 476 if ( _reader == 0 ){ 477 _ok = false; 478 throw( runtime_error( "folia::Engine(), init failed on '" + file_name 479 + "' (File not found)" ) ); 480 } 481 int index = 0; 482 while ( xmlTextReaderRead(_reader) > 0 ){ 483 int type = xmlTextReaderNodeType(_reader ); 484 string local_name = (const char*)xmlTextReaderConstLocalName(_reader ); 485 switch ( type ){ 486 case XML_READER_TYPE_ELEMENT: 487 ++index; 488 if ( local_name == "FoLiA" ){ 489 // found the root 490 const xmlChar *pnt = xmlTextReaderConstPrefix(_reader); 491 if ( pnt ){ 492 _out_doc->_foliaNsIn_prefix = xmlStrdup(pnt ); 493 ns_prefix = (const char*)pnt; 494 } 495 pnt = xmlTextReaderConstNamespaceUri(_reader); 496 if ( pnt ){ 497 _out_doc->_foliaNsIn_href = xmlStrdup(pnt); 498 string ns = (const char*)_out_doc->_foliaNsIn_href; 499 if ( ns != NSFOLIA ){ 500 _ok = false; 501 throw XmlError( "Folia Document should have namespace declaration " 502 + NSFOLIA + " but found: " + ns ); 503 } 504 } 505 KWargs in_args = get_attributes( _reader ); 506 string id; 507 if ( !in_args.empty() ){ 508 id = in_args["xml:id"]; 509 } 510 for ( auto it =in_args.begin(); it != in_args.end(); ){ 511 // remove all xmlns attributes 512 if ( it->first.find( "xmlns" ) == 0 ){ 513 it = in_args.erase( it ); 514 } 515 else { 516 ++it; 517 } 518 } 519 if ( !id.empty() ){ 520 FoliaElement *root = new FoLiA( in_args, _out_doc ); 521 _out_doc->foliadoc = root; 522 } 523 else { 524 _ok = false; 525 throw XmlError( "Engine: invalid FoLiA. missing ID" ); 526 } 527 } 528 else if ( local_name == "metadata" ) { 529 xmlNode *node = xmlTextReaderExpand(_reader); 530 check_empty( node->next ); 531 _out_doc->parse_metadata( node ); 532 } 533 else if ( local_name == "text" ){ 534 _doc_type = TEXT; 535 KWargs args = get_attributes(_reader); 536 FoliaElement *text =_out_doc->setTextRoot( args ); 537 _root_node = text; 538 _current_node = text; 539 _ok = true; 540 _start_index = index; 541 _out_doc->save_orig_ann_defaults(); 542 return _ok; 543 } 544 else if ( local_name == "speech" ){ 545 _doc_type = SPEECH; 546 KWargs args = get_attributes(_reader); 547 FoliaElement *sp = _out_doc->setSpeechRoot( args ); 548 _root_node = sp; 549 _current_node = sp; 550 _ok = true; 551 _start_index = index; 552 _out_doc->save_orig_ann_defaults(); 553 return _ok; 554 } 555 break; 556 case XML_READER_TYPE_PROCESSING_INSTRUCTION: 557 // A PI 558 if ( local_name == "xml-stylesheet" ){ 559 string sv = (const char*)xmlTextReaderConstValue(_reader); 560 pair<string,string> p = extract_style( sv ); 561 _out_doc->addStyle( p.first, p.second ); 562 } 563 else { 564 cerr << "unhandled PI: " << local_name << endl; 565 } 566 break; 567 default: 568 break; 569 }; 570 } 571 _out_doc->save_orig_ann_defaults(); 572 _ok = true; 573 return _ok; 574 } 575 append_node(FoliaElement * t,int depth)576 void Engine::append_node( FoliaElement *t, 577 int depth ){ 578 /// append a FoliaElement to the associated document 579 /*! 580 \param t the FoliaElement 581 \param depth the location to use for adding 582 */ 583 if ( _debug ){ 584 DBG << "append_node(" << t << ") current node= " << _current_node << endl; 585 DBG << "append_node(): last node= " << _last_added << endl; 586 } 587 if ( depth == _last_depth ){ 588 if ( _debug ){ 589 DBG << "append_node(): EQUAL!" << endl; 590 } 591 } 592 else if ( depth > _last_depth ){ 593 if ( _debug ){ 594 DBG << "append_node(): DEEPER!" << endl; 595 } 596 _current_node = _last_added; 597 } 598 else if ( depth < _last_depth ){ 599 if ( _debug ){ 600 DBG << "append_node(): UP!" << endl; 601 } 602 for ( int i=0; i < _last_depth - depth; ++i ){ 603 _current_node = _current_node->parent(); 604 if ( _debug ){ 605 DBG << "up node = " << _current_node << endl; 606 } 607 } 608 } 609 _last_depth = depth; 610 _current_node->append( t ); 611 if ( _debug ){ 612 DBG << "append_node() result = " << _current_node << endl; 613 } 614 _last_added = t; 615 } 616 handle_match(const string & local_name,int new_depth)617 FoliaElement *Engine::handle_match( const string& local_name, 618 int new_depth ){ 619 /// expand a matched tag into a FoLiA subtree 620 /*! 621 \param local_name the tag to create 622 \param new_depth the location in the Document to attach to 623 \return an expanded FoLiA subtree 624 */ 625 FoliaElement *t = AbstractElement::createElement( local_name, _out_doc ); 626 if ( t ){ 627 if ( _debug ){ 628 DBG << "created FoliaElement: name=" << local_name << endl; 629 } 630 xmlNode *fd = xmlTextReaderExpand(_reader); 631 t->parseXml( fd ); 632 append_node( t, new_depth ); 633 _external_node = t; 634 if ( _debug ){ 635 DBG << "expose external node: " << t << endl; 636 } 637 return t; 638 } 639 else if ( !_out_doc->permissive() ){ 640 _ok = false; 641 throw XmlError( "folia::engine failed to create node: " 642 + local_name ); 643 } 644 else { 645 return 0; 646 } 647 } 648 get_node(const string & tag)649 FoliaElement *Engine::get_node( const string& tag ){ 650 /// return the next node in the Engine with 'tag' 651 /*! 652 \param tag the tag or a list of tags we are looking for 653 \return the FoliaElement found. 654 655 tag may be a single tag like 'lemma' but also a list of '|' separated 656 tags like 'lemma|pos|description'. In the latter case all named tags 657 are tested and the first found is returned 658 659 The returned FoliaElement is a FoLiA subtree expaned from the 660 xmlTextReader. Further parsing will continue at the next sibbling 661 of the parent. 662 */ 663 if ( _done ){ 664 if ( _debug ){ 665 DBG << "Engine::get_node(). we are done" << endl; 666 } 667 return 0; 668 } 669 if ( _debug ){ 670 DBG << "Engine::get_node(), for tag=" << tag << endl; 671 } 672 int ret = 0; 673 if ( _external_node != 0 ){ 674 // so our last action was to output a pointer to a subtree. 675 // continue with the next node, avoiding the subtree 676 _external_node = 0; 677 ret = xmlTextReaderNext(_reader); 678 } 679 else { 680 // so we are the first time here, just get the first node 681 ret = xmlTextReaderRead(_reader); 682 } 683 if ( xmlTextReaderReadState(_reader) < 0 ){ 684 throw runtime_error( "get_node() reading failed" ); 685 } 686 if ( ret == 0 ){ 687 if ( _debug ){ 688 DBG << "get node name, DONE" << endl; 689 } 690 _done = true; 691 return 0; 692 } 693 vector<string> tv = TiCC::split_at( tag, "|" ); 694 set<string> tags; 695 for ( const auto& t : tv ){ 696 tags.insert(t); 697 } 698 while ( ret ){ 699 int type = xmlTextReaderNodeType(_reader); 700 int new_depth = xmlTextReaderDepth(_reader); 701 switch ( type ){ 702 case XML_READER_TYPE_ELEMENT: { 703 string local_name = (const char*)xmlTextReaderConstLocalName(_reader); 704 if ( _debug ){ 705 DBG << "get node XML_ELEMENT name=" << local_name 706 << " depth " << _last_depth << " ==> " << new_depth << endl; 707 } 708 if ( tags.find(local_name) != tags.end() ){ 709 if ( _debug ){ 710 DBG << "matched search tag: " << local_name << endl; 711 } 712 _external_node = handle_match( local_name, new_depth ); 713 return _external_node; 714 } 715 else if ( local_name == "t" 716 || local_name == "ph" ){ 717 handle_content( local_name, new_depth ); 718 } 719 else { 720 handle_element( local_name, new_depth ); 721 } 722 } 723 break; 724 case XML_READER_TYPE_TEXT: { 725 add_text( new_depth ); 726 } 727 break; 728 case XML_READER_TYPE_COMMENT: { 729 add_comment( new_depth ); 730 } 731 break; 732 default: { 733 add_default_node( new_depth ); 734 } 735 break; 736 } 737 ret = xmlTextReaderRead(_reader); 738 } 739 _done = true; 740 return 0; 741 } 742 create_simple_tree(const string & in_file) const743 xml_tree *Engine::create_simple_tree( const string& in_file ) const { 744 /// create a lightweight tree for enumerating all XML_ELEMENTS encountered 745 /*! 746 \param in_file The file to create an xmlTextReader on. May be a string 747 buffer containing a complete XML file too 748 \return the light-weight tree with the relevant nodes 749 */ 750 xmlTextReader *cur_reader = create_text_reader( in_file ); 751 if ( xmlTextReaderReadState(cur_reader) < 0 ){ 752 throw runtime_error( "create_simple_tree() init failed" ); 753 } 754 if ( _debug ){ 755 DBG << "enumerate_nodes()" << endl; 756 } 757 xml_tree *records = 0; 758 xml_tree *rec_pnt = 0; 759 int index = 0; 760 int current_depth = 0; 761 while ( xmlTextReaderRead(cur_reader) > 0 ){ 762 int depth = xmlTextReaderDepth(cur_reader); 763 int type = xmlTextReaderNodeType(cur_reader); 764 if ( type == XML_READER_TYPE_ELEMENT 765 || type == XML_READER_TYPE_COMMENT ){ 766 string local_name = (const char*)xmlTextReaderConstLocalName(cur_reader); 767 KWargs atts = get_attributes( cur_reader ); 768 string nsu; 769 string txt_class; 770 for ( auto const& v : atts ){ 771 if ( v.first == "xmlns:xlink" ){ 772 // only at top level 773 continue; 774 } 775 if ( v.first.find("xmlns") == 0 ){ 776 nsu = v.second; 777 } 778 if ( v.first == "textclass" 779 || ( local_name == "t" && v.first == "class" ) ){ 780 txt_class = v.second; 781 } 782 } 783 if ( nsu.empty() || nsu == NSFOLIA ){ 784 xml_tree *add_rec = new xml_tree( depth, index, local_name, txt_class ); 785 if ( _debug ){ 786 DBG << "new record " << index << " " << local_name << " (" 787 << depth << ")" << endl; 788 } 789 if ( rec_pnt == 0 ){ 790 records = add_rec; 791 rec_pnt = records; 792 } 793 else if ( depth == current_depth ){ 794 add_rec->parent = rec_pnt->parent; 795 rec_pnt->next = add_rec; 796 rec_pnt = rec_pnt->next; 797 } 798 else if ( depth > current_depth ){ 799 add_rec->parent = rec_pnt; 800 rec_pnt->link = add_rec; 801 rec_pnt = rec_pnt->link; 802 } 803 else { // depth < current_depth 804 while ( rec_pnt && rec_pnt->depth > depth ){ 805 rec_pnt = rec_pnt->parent; 806 } 807 if ( rec_pnt == 0 ){ 808 rec_pnt = records; 809 } 810 while ( rec_pnt->next ){ 811 rec_pnt = rec_pnt->next; 812 } 813 add_rec->parent = rec_pnt->parent; 814 rec_pnt->next = add_rec; 815 rec_pnt = rec_pnt->next; 816 } 817 current_depth = rec_pnt->depth; 818 } 819 else { 820 if ( _debug ){ 821 DBG << "name=" << local_name << " atts=" << atts << endl; 822 DBG << "create_simple_tree() node in alien namespace '" 823 << nsu << "' is SKIPPED!" << endl; 824 } 825 } 826 ++index; 827 } 828 } 829 if ( xmlTextReaderReadState(cur_reader) < 0 ){ 830 throw runtime_error( "create_simple_tree() failed" ); 831 } 832 xmlFreeTextReader( cur_reader ); 833 return records; 834 } 835 count_nodes(FoliaElement * fe)836 int count_nodes( FoliaElement *fe ){ 837 /// count all 'real' FoliaElements including and below this one 838 /*! 839 \param fe the The element to start at 840 \return the 'size' of the subtree below fe. We need this number to know 841 where to proceed processing 842 */ 843 int result = 0; 844 // cerr << "DEPTH " << fe << endl; 845 if ( fe 846 && fe->xmltag() != "_XmlText" 847 && fe->element_id() != HeadFeature_t 848 && !isAttributeFeature(fe->xmltag()) ){ 849 result += 1; 850 if ( fe->size() > 0 ){ 851 // cerr << "size=" << fe->size() << endl; 852 for ( size_t i=0; i < fe->size(); ++i ){ 853 // cerr << "i=" << i << endl; 854 result += count_nodes( fe->index(i) ); 855 } 856 } 857 } 858 // cerr << "return DEPTH " << fe << " =" << result << endl; 859 return result; 860 } 861 handle_content(const string & t_or_ph,int new_depth)862 int Engine::handle_content( const string& t_or_ph, int new_depth ){ 863 /// process a matched 't' or 'ph' tag into a FoLiA subtree 864 /*! 865 \param t_or_ph a t or ph tags 866 \param new_depth the location in the Document to attach to 867 \return the number of FoliaElement nodes added 868 */ 869 KWargs atts = get_attributes( _reader ); 870 if ( _debug ){ 871 DBG << "expanding content of <" << t_or_ph << "> atts=" << atts << endl; 872 } 873 FoliaElement *t = AbstractElement::createElement( t_or_ph, _out_doc ); 874 if ( t ){ 875 t->setAttributes( atts ); 876 // just take as is... 877 xmlNode *fd = xmlTextReaderExpand(_reader); 878 t->parseXml( fd ); 879 if ( _debug ){ 880 DBG << "parsed " << t << endl; 881 } 882 append_node( t, new_depth ); 883 // skip subtree 884 xmlTextReaderNext(_reader); 885 int type = xmlTextReaderNodeType(_reader); 886 if ( type == XML_READER_TYPE_TEXT ){ 887 string value = (const char*)xmlTextReaderConstValue(_reader); 888 string trimmed = TiCC::trim(value); 889 if ( !trimmed.empty() ){ 890 throw XmlError( "spurious text " + trimmed + " found after node <" 891 + t_or_ph + ">" ); 892 } 893 } 894 return count_nodes( t ); 895 } 896 else { 897 _ok = false; 898 throw XmlError( "folia::engine failed to create node: " + t_or_ph ); 899 } 900 } 901 handle_element(const string & local_name,int new_depth)902 void Engine::handle_element( const string& local_name, 903 int new_depth ){ 904 /// process a matched tag into a FoLiA subtree 905 /*! 906 \param local_name the tag 907 \param new_depth the location in the Document to attach to 908 */ 909 KWargs atts = get_attributes( _reader ); 910 if ( _debug ){ 911 DBG << "name=" << local_name << " atts=" << atts << endl; 912 } 913 if ( local_name == "wref" ){ 914 string id = atts["id"]; 915 if ( id.empty() ){ 916 _ok = false; 917 throw XmlError( "folia::engine, reference missing an 'id'" ); 918 } 919 FoliaElement *ref = (*_out_doc)[id]; 920 if ( !ref ){ 921 _ok = false; 922 throw XmlError( "folia::engine, unresolvable reference: " 923 + id ); 924 } 925 ref->increfcount(); 926 append_node( ref, new_depth ); 927 } 928 else { 929 FoliaElement *t = AbstractElement::createElement( local_name, _out_doc ); 930 if ( t ){ 931 if ( local_name == "foreign-data" ){ 932 xmlNode *fd = xmlTextReaderExpand(_reader); 933 t->parseXml( fd ); 934 append_node( t, new_depth ); 935 // skip subtree 936 xmlTextReaderNext(_reader); 937 } 938 else { 939 string nsu; 940 for ( auto const& v : atts ){ 941 if ( v.first.find("xmlns:") == 0 ){ 942 nsu = v.second; 943 break; 944 } 945 } 946 947 // We could use std::find_if here, but that is less readable: 948 // auto const& a = find_if( atts.begin(), atts.end(), 949 // []( const pair<string,string>& av ){ 950 // return av.first.find("xmlns:") == 0; 951 // } ); 952 // if ( a != atts.end() ){ 953 // nsu = a->second; 954 // } 955 956 if ( nsu.empty() || nsu == NSFOLIA ){ 957 if ( local_name == "desc" 958 || local_name == "content" 959 || local_name == "comment" ){ 960 if ( xmlTextReaderIsEmptyElement(_reader) ){ 961 if ( _debug ){ 962 DBG << "Element is empty." << endl; 963 } 964 } 965 else { 966 xmlTextReaderRead(_reader); 967 const char *val = (const char*)xmlTextReaderConstValue(_reader); 968 if ( val ) { 969 if ( _debug ){ 970 DBG << "processing a <" << local_name << "> with value '" 971 << val << "'" << endl; 972 } 973 atts["value"] = val; 974 } 975 else { 976 if ( _debug ){ 977 DBG << "processing a <" << local_name 978 << "> with empty value " << endl; 979 } 980 } 981 } 982 } 983 if ( _debug ){ 984 DBG << "SET ATTRIBUTES: " << atts << endl; 985 } 986 t->setAttributes( atts ); 987 append_node( t, new_depth ); 988 } 989 else { 990 if ( _debug ){ 991 DBG << "a node in an alien namespace'" << nsu << endl; 992 } 993 // just take as is... 994 append_node( t, new_depth ); 995 xmlNode *fd = xmlTextReaderExpand(_reader); 996 t->parseXml( fd ); 997 // skip subtree 998 xmlTextReaderNext(_reader); 999 } 1000 } 1001 } 1002 else { 1003 _ok = false; 1004 throw XmlError( "folia::engine failed to create node: " 1005 + local_name ); 1006 } 1007 } 1008 } 1009 output_header()1010 bool Engine::output_header(){ 1011 /// output the 'header' of the Folia document to the associated output 1012 /// stream 1013 1014 /// This outputs ALL metadata from the Document upto and including 1015 /// the opening \<text> of \<speech> node 1016 if ( _debug ){ 1017 DBG << "Engine::output_header()" << endl; 1018 } 1019 if ( !_os ){ 1020 throw logic_error( "folia::Engine::output_header() impossible. No output file specified!" ); 1021 return false; 1022 } 1023 if ( _finished ){ 1024 return true; 1025 } 1026 else if ( _header_done ){ 1027 throw logic_error( "folia::Engine::output_header() is called twice!" ); 1028 return false; 1029 } 1030 _header_done = true; 1031 stringstream ss; 1032 _out_doc->save( ss, ns_prefix ); 1033 string data = ss.str(); 1034 string search_b1; 1035 string search_b2; 1036 string search_e; 1037 if ( _doc_type == TEXT ){ 1038 if ( !ns_prefix.empty() ){ 1039 search_b1 = "<" + ns_prefix + ":" + "text>"; 1040 search_b2 = "<" + ns_prefix + ":" + "text "; 1041 search_e = "</" + ns_prefix + ":" + "text>"; 1042 } 1043 else { 1044 search_b1 = "<text>"; 1045 search_b2 = "<text "; 1046 search_e = "</text>"; 1047 } 1048 } 1049 else { 1050 if ( !ns_prefix.empty() ){ 1051 search_b1 = "<" + ns_prefix + ":" + "speech>"; 1052 search_b2 = "<" + ns_prefix + ":" + "speech "; 1053 search_e = "</" + ns_prefix + ":" + "speech>"; 1054 } 1055 else { 1056 search_b1 = "<speech>"; 1057 search_b2 = "<speech "; 1058 search_e = "</speech>"; 1059 } 1060 } 1061 string::size_type bpos1 = data.find( search_b1 ); 1062 string::size_type bpos2 = data.find( search_b2 ); 1063 string::size_type pos1; 1064 if ( bpos1 < bpos2 ){ 1065 pos1 = bpos1; 1066 } 1067 else { 1068 pos1 = bpos2; 1069 } 1070 string::size_type pos2; 1071 if ( _root_node->size() == 0 ){ 1072 pos2 = data.find( "/>" , pos1 ); 1073 } 1074 else { 1075 pos2 = data.find( ">" , pos1 ); 1076 } 1077 string head = data.substr( 0, pos2 ) + ">"; 1078 if ( _root_node->size() == 0 ){ 1079 pos2 += 2; 1080 } 1081 else { 1082 pos2 = data.find( search_e, pos1 ); 1083 int add = search_e.size(); 1084 pos2 += add; 1085 } 1086 _footer = " " + search_e + data.substr( pos2 ); 1087 *_os << head << endl; 1088 return true; 1089 } 1090 output_footer()1091 bool Engine::output_footer(){ 1092 /// output the remains of the associated Document 1093 /// might call flush() first 1094 1095 /// further processing in this Engine is illegal 1096 if ( _debug ){ 1097 DBG << "Engine::output_footer()" << endl; 1098 } 1099 if ( _finished ){ 1100 return true; 1101 } 1102 if ( !_os ){ 1103 throw logic_error( "folia::Engine::output_footer() impossible. No output file specified!" ); 1104 return false; 1105 } 1106 else if ( flush() ){ 1107 *_os << _footer << endl; 1108 _finished = true; 1109 return true; 1110 } 1111 else { 1112 return false; 1113 } 1114 } 1115 flush()1116 bool Engine::flush() { 1117 /// output all NEW information in the output Document to the output stream 1118 1119 /// may call output_header() first 1120 if ( _debug ){ 1121 DBG << "Engine::flush()" << endl; 1122 } 1123 if ( !_os ){ 1124 throw logic_error( "folia::Engine::flush() impossible. No outputfile specified!" ); 1125 return false; 1126 } 1127 if ( _finished ){ 1128 return true; 1129 } 1130 else if ( !_header_done ){ 1131 output_header(); 1132 } 1133 stack<FoliaElement*> rem_list; 1134 size_t len = _root_node->size(); 1135 for ( size_t i=0; i < len; ++i ){ 1136 rem_list.push( _root_node->index(i) ); 1137 *_os << " " << _root_node->index(i)->xmlstring(true,2,false) << endl; 1138 } 1139 while ( !rem_list.empty() ){ 1140 // we've kept a stack of elements to remove, as removing at the back 1141 // is the safest and cheapest thing to do 1142 _root_node->remove( rem_list.top() ); 1143 destroy( rem_list.top() ); 1144 rem_list.pop(); 1145 } 1146 return true; 1147 } 1148 finish()1149 bool Engine::finish() { 1150 /// finalize the Engine bij calling output_footer 1151 if ( _debug ){ 1152 DBG << "Engine::finish()" << endl; 1153 } 1154 if ( !_os ){ 1155 throw logic_error( "folia::Engine::finish() impossible. No outputfile specified!" ); 1156 return false; 1157 } 1158 if ( _finished ){ 1159 return true; 1160 } 1161 return output_footer(); 1162 } 1163 save(const string & name,bool do_canon)1164 void Engine::save( const string& name, bool do_canon ){ 1165 /// save the associated Document to a file 1166 /*! 1167 \param name the file-name 1168 \param do_canon output in Canonical format 1169 */ 1170 if ( _os && name == _out_name ){ 1171 throw logic_error( "folia::Engine::save() impossible. Already connected to a stream with the same name (" + name + ")" ); 1172 } 1173 _out_doc->save( name, ns_prefix, do_canon ); 1174 } 1175 save(ostream & os,bool do_canon)1176 void Engine::save( ostream& os, bool do_canon ){ 1177 /// save the associated Document to a stream 1178 /*! 1179 \param os the stream 1180 \param do_canon output in Canonical format 1181 */ 1182 _out_doc->save( os, ns_prefix, do_canon ); 1183 } 1184 1185 init_doc(const string & i,const string & o)1186 bool TextEngine::init_doc( const string& i, const string& o ){ 1187 /// init an associated document for this TextEngine 1188 /*! 1189 \param i the input file to use for parsing 1190 \param o when not empty, add an output-file with this name 1191 1192 Sets the _in_file property to i and marks _is_setup FALSE 1193 then calls Engine::init_doc to do the real work. 1194 */ 1195 _in_file = i; 1196 _is_setup = false; 1197 // set_debug(true); 1198 return Engine::init_doc( i, o ); 1199 } 1200 setup(const string & textclass,bool prefer_struct)1201 void TextEngine::setup( const string& textclass, bool prefer_struct ){ 1202 /// set the TextEngine ready for parsing 1203 /*! 1204 \param textclass Determines which textnodes to search for 1205 \param prefer_struct If TRUE, set the TextEngine up for returning 1206 Structure nodes like sentences or paragraphs above returning 1207 just Word or String nodes 1208 */ 1209 string txtc = textclass; 1210 if ( txtc == "current" ){ 1211 txtc.clear(); 1212 } 1213 text_parent_map = enumerate_text_parents( txtc, prefer_struct ); 1214 _next_text_node = _start_index; 1215 if ( !text_parent_map.empty() ){ 1216 _next_text_node = text_parent_map.begin()->first; 1217 } 1218 _node_count = _start_index; 1219 _is_setup = true; 1220 } 1221 get_structure_parent(const xml_tree * pnt)1222 xml_tree *get_structure_parent( const xml_tree *pnt ){ 1223 /// return the nearest StructureElement above this node 1224 /*! 1225 \param pnt a (text) element in the simple tree. 1226 \return the first parent which is an AbstractStructureElement 1227 and NOT a Word 1228 */ 1229 if ( pnt->parent->tag != "w" 1230 && isSubClass( stringToElementType(pnt->parent->tag), 1231 AbstractStructureElement_t ) ){ 1232 return pnt->parent; 1233 } 1234 else { 1235 return get_structure_parent( pnt->parent ); 1236 } 1237 } 1238 search_text_parents(const xml_tree * start,const string & textclass,bool prefer_struct) const1239 map<int,int> TextEngine::search_text_parents( const xml_tree* start, 1240 const string& textclass, 1241 bool prefer_struct ) const{ 1242 /// scan the whole TextEngine for TextContent nodes 1243 /*! 1244 \param start the tree to search 1245 \param textclass the text-class we are interested in 1246 \param prefer_struct If TRUE, set the TextEngine up for returning 1247 Structure nodes like sentences or paragraphs above returning 1248 just Word or String nodes 1249 \return a map containing for every found text_parent the index of 1250 the NEXT value to search. TO DO: very mysty and mystic 1251 */ 1252 map<int,int> result; 1253 const xml_tree *pnt = start; 1254 while ( pnt ){ 1255 if ( _debug ){ 1256 DBG << "bekijk:" << pnt->tag << "-" << pnt->index << endl; 1257 } 1258 if ( pnt->tag == "wref" 1259 || pnt->tag == "original" ){ 1260 // 1261 // DON'T see a wref as a valid textparent. 1262 // The word is connected elsewhere too 1263 // Also an 'original' node is assumed to be part of a correction 1264 // so hope for a 'new' node to be found! 1265 pnt = pnt->next; 1266 continue; 1267 } 1268 map<int,int> deeper = search_text_parents( pnt->link, 1269 textclass, 1270 prefer_struct ); 1271 if ( !deeper.empty() ){ 1272 if ( _debug ){ 1273 DBG << "deeper we found: " << deeper << endl; 1274 } 1275 result.insert( deeper.begin(), deeper.end() ); 1276 } 1277 pnt = pnt->next; 1278 } 1279 if ( result.empty() ){ 1280 // so no deeper text found 1281 // lets see at this level.... 1282 pnt = start; 1283 while ( pnt ){ 1284 if ( pnt->tag == "t" && pnt->textclass == textclass ){ 1285 // OK text in the right textclass 1286 if ( prefer_struct ){ 1287 // search for a suitable parent 1288 xml_tree *par = get_structure_parent( pnt ); 1289 int index = par->index; 1290 int next = INT_MAX; 1291 if ( par->next ){ 1292 next = par->index; 1293 } 1294 result[index] = next; 1295 break; 1296 } 1297 else { 1298 int index = pnt->parent->index; 1299 int next = INT_MAX; 1300 if ( pnt->parent->next ){ 1301 next = pnt->parent->next->index; 1302 } 1303 else if ( pnt->parent->parent->next ){ 1304 next = pnt->parent->parent->next->index; 1305 } 1306 result[index] = next; 1307 break; 1308 } 1309 } 1310 pnt = pnt->next; 1311 } 1312 } 1313 if ( _debug && start && !result.empty() ){ 1314 DBG << "return " << result << " for " << start->parent->tag << endl; 1315 } 1316 return result; 1317 } 1318 enumerate_text_parents(const string & textclass,bool prefer_struct)1319 const map<int,int>& TextEngine::enumerate_text_parents( const string& textclass, 1320 bool prefer_struct ) { 1321 /// Loop over the full input, looking for textnodes in class 'textclass' 1322 /*! 1323 \param textclass the text-class we are interested in 1324 \param prefer_struct If TRUE, set the TextEngine up for returning 1325 Structure nodes like sentences or paragraphs above returning 1326 just Word or String nodes 1327 \return a reference to a map of text parent nodes 1328 1329 this function recurses to the DEEPEST text possible, and enumerates their 1330 parents. It creates a mapping of text parents indices to their successor 1331 */ 1332 if ( _done ){ 1333 throw runtime_error( "enumerate_text_parents() called on a done engine" ); 1334 } 1335 if ( _debug ){ 1336 DBG << "enumerate_text_parents(" << textclass << ")" << endl; 1337 } 1338 // 1339 // we start by creating a tree of all nodes 1340 xml_tree *tree = create_simple_tree(_in_file); 1341 // 1342 // now search that tree for nodes in 'textclass' 1343 // if is a <t>, then remember the index of its parent 1344 // but when 'prefer_struct' is specified, return the direct structure above 1345 // when present. 1346 text_parent_map.clear(); 1347 xml_tree *rec_pnt = tree; 1348 while ( rec_pnt ){ 1349 map<int,int> deeper = search_text_parents( rec_pnt->link, 1350 textclass, 1351 prefer_struct ); 1352 text_parent_map.insert( deeper.begin(), deeper.end() ); 1353 rec_pnt = rec_pnt->next; 1354 } 1355 if ( _debug ){ 1356 DBG << "complete tree: " << endl; 1357 print( DBG, tree ); 1358 DBG << "Search map = " << text_parent_map << endl; 1359 } 1360 for ( auto it = text_parent_map.begin(); 1361 it != text_parent_map.end(); 1362 ++it ){ 1363 auto nit = it; 1364 ++nit; 1365 if ( nit != text_parent_map.end() ){ 1366 it->second = nit->first; 1367 } 1368 } 1369 if ( _debug ){ 1370 DBG << "Reduced Search map = " << text_parent_map << endl; 1371 } 1372 delete tree; 1373 return text_parent_map; 1374 } 1375 next_text_parent()1376 FoliaElement *TextEngine::next_text_parent(){ 1377 /// return the next node to handle 1378 /*! 1379 \return a FoLiAElement pointer to a 'textparent' subtree, or 0 when done 1380 1381 The caller may use this pointer to modify the subtree BELOW that pointer 1382 at will. 1383 1384 next_text_parent should be called until no more candidates are found. 1385 At that moment, the complete input FoLiA is parsed and stored in _out_doc 1386 adn can be saved or handled over for further processing. 1387 1388 */ 1389 if ( _done ){ 1390 if ( _debug ){ 1391 DBG << "next_text_parent(). engine is done" << endl; 1392 } 1393 return 0; 1394 } 1395 if ( !_is_setup ){ 1396 throw runtime_error( "TextEngine: not setup yet!" ); 1397 } 1398 if ( text_parent_map.empty() ){ 1399 if ( _debug ){ 1400 DBG << "next_text_parent(). the parent map is empty." << endl; 1401 } 1402 return 0; 1403 } 1404 1405 int ret = 0; 1406 if ( _external_node != 0 ){ 1407 // so our last action was to output a pointer to a subtree. 1408 // continue with the next node, avoiding the subtree 1409 _external_node = 0; 1410 ret = xmlTextReaderNext(_reader); 1411 } 1412 else { 1413 // so we are the first time here, get first result 1414 ret = xmlTextReaderRead(_reader); 1415 } 1416 if ( ret == 0 ){ 1417 if ( _debug ){ 1418 DBG << "next_text_parent(), DONE" << endl; 1419 } 1420 _done = true; 1421 return 0; 1422 } 1423 while ( ret ){ 1424 int type = xmlTextReaderNodeType(_reader); 1425 if ( _debug ){ 1426 DBG << "MAIN LOOP search next_text_parent(), type=" << type 1427 << " current node=" << _node_count 1428 << " search for node=" << _next_text_node << endl; 1429 } 1430 int new_depth = xmlTextReaderDepth(_reader); 1431 switch ( type ){ 1432 case XML_READER_TYPE_ELEMENT: { 1433 string local_name = (const char*)xmlTextReaderConstLocalName(_reader); 1434 if ( _debug ){ 1435 DBG << "next element: " << local_name << " cnt =" << _node_count << endl; 1436 } 1437 if ( _node_count == _next_text_node ){ 1438 // HIT! 1439 if ( _debug ){ 1440 DBG << "at index=" << _node_count << " WE HIT a next element for: " << local_name << endl; 1441 } 1442 _external_node = handle_match( local_name, new_depth ); 1443 int skips = count_nodes( _external_node ); 1444 // we are to output a tree of skips nodes 1445 _node_count += skips; // so next time we resume with this count 1446 _next_text_node = text_parent_map[_next_text_node]; 1447 // and we have to search for _next_text_node 1448 if ( _debug ){ 1449 DBG << " increment _node_count with: " << skips << " to " 1450 << _node_count << " searching for: " 1451 << _next_text_node << endl; 1452 } 1453 return _external_node; 1454 } 1455 else if ( local_name == "t" 1456 || local_name == "ph" ){ 1457 _node_count += handle_content( local_name, new_depth ); 1458 } 1459 else { 1460 handle_element( local_name, new_depth ); 1461 ++_node_count; 1462 } 1463 } 1464 break; 1465 case XML_READER_TYPE_TEXT: { 1466 add_text( new_depth ); 1467 } 1468 break; 1469 case XML_READER_TYPE_COMMENT: { 1470 add_comment( new_depth ); 1471 } 1472 break; 1473 default: { 1474 add_default_node( new_depth ); 1475 } 1476 break; 1477 } 1478 ret = xmlTextReaderRead(_reader); 1479 } 1480 _done = true; 1481 return 0; 1482 } 1483 1484 } // namespace folia 1485