1 /*
2   Copyright (c) 2006 - 2021
3   CLST  - Radboud University
4   ILK   - Tilburg University
5 
6   This file is part of libfolia
7 
8   libfolia is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   libfolia is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program; if not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ticcutils/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 */
26 #include <cassert>
27 #include <iostream>
28 #include <iomanip>
29 #include <fstream>
30 #include <cstring>
31 #include <cstdio>
32 #include <string>
33 #include <stack>
34 #include <stdexcept>
35 #include <algorithm>
36 #include "ticcutils/PrettyPrint.h"
37 #include "ticcutils/FileUtils.h"
38 #include "ticcutils/XMLtools.h"
39 #include "ticcutils/zipper.h"
40 #include "libfolia/folia.h"
41 
42 using namespace std;
43 
/// the file-level fallback LogStream, constructed on cerr with the label
/// "folia-engine:". The DBG macro below uses it when no debug stream is set.
TiCC::LogStream DBG_CERR(cerr,"folia-engine:");

/// direct debugging info to _dbg_file when that is set, otherwise to DBG_CERR
/// \note the expansion names _dbg_file, so DBG can only be used where a
/// _dbg_file is in scope
#define DBG *TiCC::Log((_dbg_file?_dbg_file:&DBG_CERR))
49 
50 namespace folia {
51 
52   using TiCC::operator<<;
53 
xml_tree(int d,int i,const std::string & t,const std::string & c)54   xml_tree::xml_tree( int d,
55 		      int i,
56 		      const std::string& t,
57 		      const std::string& c ):
58     /// create an xml_tree element with the given parameters
59     depth(d),
60     index(i),
61     tag(t),
62     textclass(c),
63     parent(0),
64     link(0),
65     next(0)
66   {}
67 
~xml_tree()68   xml_tree::~xml_tree() {
69     /// delete an xml_tree
70     if ( link ){
71       delete link;
72     }
73     if ( next ){
74       delete next;
75     }
76   }
77 
print(ostream & os,const xml_tree * tree)78   void print( ostream& os, const xml_tree* tree ){
79     //! pretty print an xml_tree
80     /*!
81       \param os the output stream
82       \param tree the tree
83     */
84     const xml_tree *rec_pnt = tree;
85     while ( rec_pnt ){
86       os << setw(10) << rec_pnt->index << string( rec_pnt->depth, ' ' )
87 	 << rec_pnt->tag;
88       if ( rec_pnt->textclass.empty() ){
89 	os << endl;
90       }
91       else {
92 	os << " (" << rec_pnt->textclass << ")" << endl;
93       }
94       print( os, rec_pnt->link );
95       rec_pnt = rec_pnt->next;
96     }
97   }
98 
operator <<(ostream & os,const xml_tree * tree)99   ostream& operator<<( ostream& os, const xml_tree* tree ){
100     /// print an xml_tree
101     os << endl;
102     print( os, tree );
103     return os;
104   }
105 
Engine()106   Engine::Engine():
107     /// default constructor
108     _reader(0),
109     _out_doc(0),
110     _root_node(0),
111     _external_node(0),
112     _current_node(0),
113     _last_added(0),
114     _last_depth(2),
115     _doc_type( TEXT ),
116     _dbg_file(0),
117     _os(0),
118     _ok(false),
119     _done(false),
120     _header_done(false),
121     _finished(false),
122     _debug(false)
123   {
124   }
125 
  Engine::~Engine(){
    /// destructor
    /*!
      releases the xmlTextReader, the associated Document and the output
      stream. When the Document was handed over using doc(true), _out_doc
      is 0 and the caller remains responsible for it.

      \note _dbg_file is NOT released here.
      NOTE(review): confirm who is supposed to own that LogStream.
    */
    xmlFreeTextReader( _reader );
    delete _out_doc;
    delete _os;
  }
132 
doc(bool disconnect)133   Document *Engine::doc( bool disconnect ){
134     /// returns the associated FoLiA document.
135     /*!
136       \param disconnect When true, handle control over to the caller.
137       The caller has to delete it then to avoid memory leaks
138 
139     */
140     Document *result = _out_doc;
141     if ( disconnect ){
142       _out_doc = 0;
143     }
144     return result;
145   }
146 
set_debug(bool d)147   bool Engine::set_debug( bool d ) {
148     /// switch debugging on/off depending on parameter 'd'
149     /*!
150       \param d when true switch debugging to ON, otherwise OFF
151 
152       When debugging is switched ON and NO debug file is associated yet,
153       it is created.
154     */
155     bool res = _debug;
156     if ( d ){
157       if ( !_dbg_file ){
158 	_dbg_file
159 	  = new TiCC::LogStream( cerr, "folia-engine", StampMessage );
160       }
161     }
162     _debug = d;
163     return res;
164   }
165 
set_dbg_stream(TiCC::LogStream * ls)166   void Engine::set_dbg_stream( TiCC::LogStream *ls ){
167     /// switch debugging to another LogStream
168     if ( _dbg_file ){
169       delete _dbg_file;
170     }
171     _dbg_file = ls;
172   }
173 
un_declare(const AnnotationType & at,const string & setname)174   void Engine::un_declare( const AnnotationType& at,
175 			   const string& setname ){
176     /// remove the annotation declaration for the given type and set
177     /*!
178       \param at the AnnotationType
179       \param setname the set so remove
180 
181       \note an AnntotationType can have several set-names assigned to it.
182       When setname is empty ("") ALL set-names are removed
183     */
184     if ( !ok() ){
185       throw logic_error( "declare() called on invalid engine!" );
186     }
187     else if ( _header_done ){
188       throw logic_error( "declare() called on already (partially) saved document!" );
189     }
190     else {
191       _out_doc->un_declare( at, setname );
192     }
193   }
194 
declare(const AnnotationType & at,const string & setname,const string & args)195   void Engine::declare( const AnnotationType& at,
196 			const string& setname,
197 			const string& args ) {
198     /// declare a set for a given annotation type
199     /*!
200       \param at the AnnotationType
201       \param setname The set-name to use
202       \param args additional arguments in string annotation. Can be used to add
203       extra arguments like a processor name or an annotator
204     */
205     KWargs kwargs( args );
206     declare( at, setname, kwargs );
207   }
208 
declare(const AnnotationType & at,const string & setname,const KWargs & args)209   void Engine::declare( const AnnotationType& at,
210 			const string& setname,
211 			const KWargs& args ) {
212     /// declare a set for a given annotation type
213     /*!
214       \param at the AnnotationType
215       \param setname The set-name to use
216       \param args additional arguments as a KWargs attribute-value list.
217       can be used to add extra arguments like a processor name or
218       an annotator
219     */
220     if ( !ok() ){
221       throw logic_error( "declare() called on invalid engine!" );
222     }
223     else if ( _header_done ){
224       throw logic_error( "declare() called on already (partially) saved document!" );
225     }
226     else {
227       _out_doc->declare( at, setname, args );
228     }
229   }
230 
is_declared(const AnnotationType & at,const string & setname) const231   bool Engine::is_declared( const AnnotationType& at,
232 			    const string& setname ) const {
233     /// check if an annotation for the provided type and setname is present
234     /*!
235       \param at the AnnotationType
236       \param setname the set-name to test
237       \return true if declared, false otherwise.
238     */
239     if ( !ok() ){
240       throw logic_error( "is_declared() called on invalid engine!" );
241     }
242     else {
243       return _out_doc->declared( at, setname );
244     }
245   }
246 
is_declared(const AnnotationType & at,const string & setname,const string & annotator,const AnnotatorType & annotator_type,const string & processor) const247   bool Engine::is_declared( const AnnotationType& at,
248 			    const string& setname,
249 			    const string& annotator,
250 			    const AnnotatorType& annotator_type,
251 			    const string& processor ) const {
252     /// check if an annotation for the provided type and setname is present
253     /*!
254       \param at the AnnotationType
255       \param setname the set-name to test
256       \param annotator the name of the annotator to test
257       \param annotator_type the AnnotatorType to test
258       \param processor the desired processor
259       \return true if declared, false otherwise.
260     */
261     if ( !ok() ){
262       throw logic_error( "is_declared() called on invalid engine!" );
263     }
264     else {
265       return _out_doc->declared( at, setname, annotator, annotator_type, processor );
266     }
267   }
268 
is_declared(const AnnotationType & at,const string & setname,const string & annotator,const string & annotator_type,const string & processor) const269   bool Engine::is_declared( const AnnotationType& at,
270 			    const string& setname,
271 			    const string& annotator,
272 			    const string& annotator_type,
273 			    const string& processor ) const {
274     /// check if an annotation for the provided type and setname is present
275     /*!
276       \param at the AnnotationType
277       \param setname the set-name to test
278       \param annotator the name of the annotator to test
279       \param annotator_type the AnnotatorType to test, encoded as a string
280       \param processor the desired processor
281       \return true if declared, false otherwise.
282     */
283     AnnotatorType ant = UNDEFINED;
284     try {
285       ant = TiCC::stringTo<AnnotatorType>(annotator_type);
286     }
287     catch (...){
288       throw logic_error( annotator_type + " is NOT a valid annotator type" );
289     }
290     return is_declared( at, setname, annotator, ant, processor );
291   }
292 
set_metadata(const std::string & att,const std::string & val)293   void Engine::set_metadata( const std::string& att,
294 			     const std::string& val){
295     /// set a metadata value in the associated document
296     /*!
297       \param att the attribute to set
298       \param val the value of the attribute
299     */
300     if ( !ok() ){
301       throw logic_error( "set_metadata() called on invalid engine!" );
302     }
303     else {
304       return _out_doc->set_metadata( att, val );
305     }
306   }
307 
extract_style(const string & value)308   pair<string,string> extract_style( const string& value ){
309     /// parse a string to extract an xml style-sheet value
310     /*!
311       \param value the line to parse
312       \return a pait of strings containing the type and the href values
313     */
314     string type;
315     string href;
316     vector<string> v = TiCC::split( value );
317     if ( v.size() == 2 ){
318       vector<string> w = TiCC::split_at( v[0], "=" );
319       if ( w.size() == 2 && w[0] == "type" ){
320 	type = w[1].substr(1,w[1].length()-2);
321       }
322       w = TiCC::split_at( v[1], "=" );
323       if ( w.size() == 2 && w[0] == "href" ){
324 	href = w[1].substr(1,w[1].length()-2);
325       }
326       return make_pair(type,href);
327     }
328     else {
329       throw XmlError( "couldn't parse xml-style-sheet line: " + value );
330     }
331   }
332 
get_attributes(xmlTextReader * tr)333   KWargs get_attributes( xmlTextReader *tr ){
334     /// extract a KWargs attribute/value list from the TextReader location
335     /*!
336       \param tr the xmlTextReader pointer
337       \return a KWargs list of all attribute/value pairs found
338     */
339     KWargs result;
340     if ( xmlTextReaderHasAttributes(tr) ){
341       xmlTextReaderMoveToFirstAttribute(tr);
342       do {
343 	string att = (const char*)xmlTextReaderConstName(tr);
344 	string val = (const char*)xmlTextReaderConstValue(tr);
345 	result[att] = val;
346       }
347       while ( xmlTextReaderMoveToNextAttribute(tr) );
348     }
349     return result;
350   }
351 
create_text_reader(const string & buf)352   xmlTextReader *create_text_reader( const string& buf ){
353     /// create a new xmlTextRead on a buffer
354     /*!
355       \param buf the input buffer.
356       The buffer may contain a complete (FoLiA-) XML document as a string
357       OR a filename denoting such a document, which may be .bz2 and .gz
358       encoded
359     */
360     if ( TiCC::match_front( buf, "<?xml " ) ){
361       return xmlReaderForMemory( buf.c_str(), buf.size(),
362 				 "input_buffer", 0, XML_PARSER_OPTIONS );
363     }
364     else if ( TiCC::match_back( buf, ".bz2" ) ){
365       string buffer = TiCC::bz2ReadFile( buf );
366       if ( buffer.empty() ){
367 	throw runtime_error( "folia::Engine(), empty file? (" + buf
368 			      + ")" );
369       }
370       //
371       // next step fails for unclear reasons
372       // so we use an intermediate file. Which works, but is clumsy
373       //
374       // return xmlReaderForMemory( buffer.c_str(), buffer.size()+1,
375       //  				 buf.c_str(), 0, XML_PARSER_OPTIONS );
376       TiCC::tmp_stream ts( "folia" );
377       string tmp_file = ts.tmp_name();
378       ofstream& os = ts.os();
379       os << buffer << endl;
380       ts.close();
381       xmlTextReader *result
382 	= xmlReaderForFile( tmp_file.c_str(), 0, XML_PARSER_OPTIONS );
383       return result;
384     }
385     // libxml2 can handle .xml and .xml.gz
386     return xmlReaderForFile( buf.c_str(), 0, XML_PARSER_OPTIONS );
387   }
388 
add_text(int depth)389   void Engine::add_text( int depth ){
390     /// when parsing, add a new XmlText node
391     /*!
392       \param depth the depth (location) in the tree where to add
393     */
394     string value = (const char*)xmlTextReaderConstValue(_reader);
395     string trimmed = TiCC::trim(value);
396     if ( !trimmed.empty() ){
397       throw XmlError( "spurious text " + trimmed + " found." );
398     }
399     if ( _debug ){
400       DBG << "add_text(" << value << ") depth=" << depth << endl;
401     }
402     XmlText *txt = new XmlText();
403     txt->setvalue( value );
404     append_node( txt, depth );
405   }
406 
add_comment(int depth)407   void Engine::add_comment( int depth ){
408     /// when parsing, add a new _XmlComment node
409     /*!
410       \param depth the depth (location) in the tree where to add
411     */
412     if ( _debug ){
413       DBG << "add_comment " << endl;
414     }
415     string tag = "_XmlComment";
416     FoliaElement *t = AbstractElement::createElement( tag, _out_doc );
417     append_node( t, depth );
418   }
419 
add_default_node(int depth)420   void Engine::add_default_node( int depth ){
421     /// when debugging, output a message. Does nothing else
422     if ( _debug ){
423       string local_name = (const char*)xmlTextReaderConstLocalName(_reader);
424       int type = xmlTextReaderNodeType(_reader);
425       DBG << "add_node " << type <<  " name=" << local_name
426 	  << " depth " << _last_depth << " ==> " << depth << endl;
427     }
428   }
429 
check_empty(xmlNode * node)430   void check_empty( xmlNode *node ){
431     /// assure that node == 0 OR just contains whitespace or comment
432     /*!
433       \param node the node to check
434       will throw when node is anything other than xml-comment or whitespace
435     */
436     if ( node ){
437       if ( node->type == XML_COMMENT_NODE ){
438 	check_empty( node->next );
439       }
440       else if ( node->type == XML_TEXT_NODE ){
441 	string txt = TextValue(node);
442 	txt = TiCC::trim(txt);
443 	if ( !txt.empty() ){
444 	  string tg = "<" + TiCC::Name(node->prev) + ">";
445 	  throw XmlError( "found extra text '" + txt + "' after element "
446 			  + tg + ", NOT allowed there." );
447 	}
448       }
449       else {
450 	string tg = "<" + TiCC::Name(node->prev) + ">";
451 	throw XmlError( "found unexpected node '" + TiCC::Name(node)
452 			+ "' after element " + tg + ", NOT allowed there." );
453       }
454     }
455   }
456 
init_doc(const string & file_name,const string & out_name)457   bool Engine::init_doc( const string& file_name,
458 			 const string& out_name ){
459     /// init an associated document for this Engine
460     /*!
461       \param file_name the input file to use for parsing
462       \param out_name when not empty, add an output-file with this name
463 
464       Initializing includes parsing the Document's metadata, style-sheet
465       upto and including the top \<text or \<speech> node
466     */
467     _ok = false;
468     _out_doc = new Document();
469     _out_doc->set_incremental( true );
470     if ( !out_name.empty() ){
471       _os = new ofstream( out_name );
472       _out_name = out_name;
473     }
474     _out_doc->_source_filename = file_name;
475     _reader = create_text_reader( file_name );
476     if ( _reader == 0 ){
477       _ok = false;
478       throw( runtime_error( "folia::Engine(), init failed on '" + file_name
479 			    + "' (File not found)" ) );
480     }
481     int index = 0;
482     while ( xmlTextReaderRead(_reader) > 0 ){
483       int type =  xmlTextReaderNodeType(_reader );
484       string local_name = (const char*)xmlTextReaderConstLocalName(_reader );
485       switch ( type ){
486       case XML_READER_TYPE_ELEMENT:
487 	++index;
488 	if ( local_name == "FoLiA" ){
489 	  // found the root
490 	  const xmlChar *pnt = xmlTextReaderConstPrefix(_reader);
491 	  if ( pnt ){
492 	    _out_doc->_foliaNsIn_prefix = xmlStrdup(pnt );
493 	    ns_prefix = (const char*)pnt;
494 	  }
495 	  pnt = xmlTextReaderConstNamespaceUri(_reader);
496 	  if ( pnt ){
497 	    _out_doc->_foliaNsIn_href = xmlStrdup(pnt);
498 	    string ns = (const char*)_out_doc->_foliaNsIn_href;
499 	    if ( ns != NSFOLIA ){
500 	      _ok = false;
501 	      throw XmlError( "Folia Document should have namespace declaration "
502 			      + NSFOLIA + " but found: " + ns );
503 	    }
504 	  }
505 	  KWargs in_args = get_attributes( _reader );
506 	  string id;
507 	  if ( !in_args.empty() ){
508 	    id = in_args["xml:id"];
509 	  }
510 	  for ( auto it =in_args.begin(); it != in_args.end();  ){
511 	    // remove all xmlns attributes
512 	    if ( it->first.find( "xmlns" ) == 0 ){
513 	      it = in_args.erase( it );
514 	    }
515 	    else {
516 	      ++it;
517 	    }
518 	  }
519 	  if ( !id.empty() ){
520 	    FoliaElement *root = new FoLiA( in_args, _out_doc );
521 	    _out_doc->foliadoc = root;
522 	  }
523 	  else {
524 	    _ok = false;
525 	    throw XmlError( "Engine: invalid FoLiA. missing ID" );
526 	  }
527 	}
528 	else if ( local_name == "metadata" ) {
529 	  xmlNode *node = xmlTextReaderExpand(_reader);
530 	  check_empty( node->next );
531 	  _out_doc->parse_metadata( node );
532 	}
533 	else if ( local_name == "text" ){
534 	  _doc_type = TEXT;
535 	  KWargs args = get_attributes(_reader);
536 	  FoliaElement *text =_out_doc->setTextRoot( args );
537 	  _root_node = text;
538 	  _current_node = text;
539 	  _ok = true;
540 	  _start_index = index;
541 	  _out_doc->save_orig_ann_defaults();
542 	  return _ok;
543 	}
544 	else if ( local_name == "speech" ){
545 	  _doc_type = SPEECH;
546 	  KWargs args = get_attributes(_reader);
547 	  FoliaElement *sp = _out_doc->setSpeechRoot( args );
548 	  _root_node = sp;
549 	  _current_node = sp;
550 	  _ok = true;
551 	  _start_index = index;
552 	  _out_doc->save_orig_ann_defaults();
553 	  return _ok;
554 	}
555 	break;
556       case XML_READER_TYPE_PROCESSING_INSTRUCTION:
557 	// A PI
558 	if ( local_name == "xml-stylesheet" ){
559 	  string sv = (const char*)xmlTextReaderConstValue(_reader);
560 	  pair<string,string> p = extract_style( sv );
561 	  _out_doc->addStyle( p.first, p.second );
562 	}
563 	else {
564 	  cerr << "unhandled PI: " << local_name << endl;
565 	}
566 	break;
567       default:
568 	break;
569       };
570     }
571     _out_doc->save_orig_ann_defaults();
572     _ok = true;
573     return _ok;
574   }
575 
append_node(FoliaElement * t,int depth)576   void Engine::append_node( FoliaElement *t,
577 			    int depth ){
578     /// append a FoliaElement to the associated document
579     /*!
580       \param t the FoliaElement
581       \param depth the location to use for adding
582     */
583     if ( _debug ){
584       DBG << "append_node(" << t << ") current node= " << _current_node << endl;
585       DBG << "append_node(): last node= " << _last_added << endl;
586     }
587     if ( depth == _last_depth ){
588       if ( _debug ){
589 	DBG << "append_node(): EQUAL!" << endl;
590       }
591     }
592     else if ( depth > _last_depth ){
593       if ( _debug ){
594 	DBG << "append_node(): DEEPER!" << endl;
595       }
596       _current_node = _last_added;
597     }
598     else if ( depth < _last_depth  ){
599       if ( _debug ){
600 	DBG << "append_node(): UP!" << endl;
601       }
602       for ( int i=0; i < _last_depth - depth; ++i ){
603 	_current_node = _current_node->parent();
604 	if ( _debug ){
605 	  DBG << "up node = " << _current_node << endl;
606 	}
607       }
608     }
609     _last_depth = depth;
610     _current_node->append( t );
611     if ( _debug ){
612       DBG << "append_node() result = " << _current_node << endl;
613     }
614     _last_added = t;
615   }
616 
handle_match(const string & local_name,int new_depth)617   FoliaElement *Engine::handle_match( const string& local_name,
618 				      int new_depth ){
619     /// expand a matched tag into a FoLiA subtree
620     /*!
621       \param local_name the tag to create
622       \param new_depth the location in the Document to attach to
623       \return an expanded FoLiA subtree
624     */
625     FoliaElement *t = AbstractElement::createElement( local_name, _out_doc );
626     if ( t ){
627       if ( _debug ){
628 	DBG << "created FoliaElement: name=" << local_name << endl;
629       }
630       xmlNode *fd = xmlTextReaderExpand(_reader);
631       t->parseXml( fd );
632       append_node( t, new_depth );
633       _external_node = t;
634       if ( _debug ){
635 	DBG << "expose external node: " << t << endl;
636       }
637       return t;
638     }
639     else if ( !_out_doc->permissive() ){
640       _ok = false;
641       throw XmlError( "folia::engine failed to create node: "
642 		      + local_name );
643     }
644     else {
645       return 0;
646     }
647   }
648 
get_node(const string & tag)649   FoliaElement *Engine::get_node( const string& tag ){
650     /// return the next node in the Engine with 'tag'
651     /*!
652       \param tag the tag or a list of tags we are looking for
653       \return the FoliaElement found.
654 
655       tag may be a single tag like 'lemma' but also a list of '|' separated
656       tags like 'lemma|pos|description'. In the latter case all named tags
657       are tested and the first found is returned
658 
659       The returned FoliaElement is a FoLiA subtree expaned from the
660       xmlTextReader. Further parsing will continue at the next sibbling
661       of the parent.
662     */
663     if ( _done ){
664       if ( _debug ){
665 	DBG << "Engine::get_node(). we are done" << endl;
666       }
667       return 0;
668     }
669     if ( _debug ){
670       DBG << "Engine::get_node(), for tag=" << tag << endl;
671     }
672     int ret = 0;
673     if ( _external_node != 0 ){
674       // so our last action was to output a pointer to a subtree.
675       // continue with the next node, avoiding the subtree
676       _external_node = 0;
677       ret = xmlTextReaderNext(_reader);
678     }
679     else {
680       // so we are the first time here, just get the first node
681       ret = xmlTextReaderRead(_reader);
682     }
683     if ( xmlTextReaderReadState(_reader) < 0 ){
684       throw runtime_error( "get_node() reading failed" );
685     }
686     if ( ret == 0 ){
687       if ( _debug ){
688 	DBG << "get node name, DONE" << endl;
689       }
690       _done = true;
691       return 0;
692     }
693     vector<string> tv = TiCC::split_at( tag, "|" );
694     set<string> tags;
695     for ( const auto& t : tv ){
696       tags.insert(t);
697     }
698     while ( ret ){
699       int type = xmlTextReaderNodeType(_reader);
700       int new_depth = xmlTextReaderDepth(_reader);
701       switch ( type ){
702       case XML_READER_TYPE_ELEMENT: {
703 	string local_name = (const char*)xmlTextReaderConstLocalName(_reader);
704 	if ( _debug ){
705 	  DBG << "get node XML_ELEMENT name=" << local_name
706 	      << " depth " << _last_depth << " ==> " << new_depth << endl;
707 	}
708 	if ( tags.find(local_name) != tags.end() ){
709 	  if ( _debug ){
710 	    DBG << "matched search tag: " << local_name << endl;
711 	  }
712 	  _external_node = handle_match( local_name, new_depth );
713 	  return _external_node;
714 	}
715 	else if ( local_name == "t"
716 		  || local_name == "ph" ){
717 	  handle_content( local_name, new_depth );
718 	}
719 	else {
720 	  handle_element( local_name, new_depth );
721 	}
722       }
723 	break;
724       case XML_READER_TYPE_TEXT: {
725 	add_text( new_depth );
726       }
727 	break;
728       case XML_READER_TYPE_COMMENT: {
729 	add_comment( new_depth );
730       }
731 	break;
732       default: {
733 	add_default_node( new_depth );
734       }
735 	break;
736       }
737       ret = xmlTextReaderRead(_reader);
738     }
739     _done = true;
740     return 0;
741   }
742 
create_simple_tree(const string & in_file) const743   xml_tree *Engine::create_simple_tree( const string& in_file ) const {
744     /// create a lightweight tree for enumerating all XML_ELEMENTS encountered
745     /*!
746       \param in_file The file to create an xmlTextReader on. May be a string
747       buffer containing a complete XML file too
748       \return the light-weight tree with the relevant nodes
749     */
750     xmlTextReader *cur_reader = create_text_reader( in_file );
751     if ( xmlTextReaderReadState(cur_reader) < 0 ){
752       throw runtime_error( "create_simple_tree() init failed" );
753     }
754     if ( _debug ){
755       DBG << "enumerate_nodes()" << endl;
756     }
757     xml_tree *records = 0;
758     xml_tree *rec_pnt = 0;
759     int index = 0;
760     int current_depth = 0;
761     while ( xmlTextReaderRead(cur_reader) > 0 ){
762       int depth = xmlTextReaderDepth(cur_reader);
763       int type = xmlTextReaderNodeType(cur_reader);
764       if ( type == XML_READER_TYPE_ELEMENT
765 	   || type == XML_READER_TYPE_COMMENT ){
766 	string local_name = (const char*)xmlTextReaderConstLocalName(cur_reader);
767 	KWargs atts = get_attributes( cur_reader );
768 	string nsu;
769 	string txt_class;
770 	for ( auto const& v : atts ){
771 	  if ( v.first == "xmlns:xlink" ){
772 	    // only at top level
773 	    continue;
774 	  }
775 	  if ( v.first.find("xmlns") == 0 ){
776 	    nsu = v.second;
777 	  }
778 	  if ( v.first == "textclass"
779 	       || ( local_name == "t" && v.first == "class" ) ){
780 	    txt_class = v.second;
781 	  }
782 	}
783 	if ( nsu.empty() || nsu == NSFOLIA ){
784 	  xml_tree *add_rec = new xml_tree( depth, index, local_name, txt_class );
785 	  if ( _debug ){
786 	    DBG << "new record " << index << " " << local_name << " ("
787 		<< depth << ")" << endl;
788 	  }
789 	  if ( rec_pnt == 0 ){
790 	    records = add_rec;
791 	    rec_pnt = records;
792 	  }
793 	  else if ( depth == current_depth ){
794 	    add_rec->parent = rec_pnt->parent;
795 	    rec_pnt->next = add_rec;
796 	    rec_pnt = rec_pnt->next;
797 	  }
798 	  else if ( depth > current_depth ){
799 	    add_rec->parent = rec_pnt;
800 	    rec_pnt->link = add_rec;
801 	    rec_pnt = rec_pnt->link;
802 	  }
803 	  else { // depth < current_depth
804 	    while ( rec_pnt && rec_pnt->depth > depth ){
805 	      rec_pnt = rec_pnt->parent;
806 	    }
807 	    if ( rec_pnt == 0 ){
808 	      rec_pnt = records;
809 	    }
810 	    while ( rec_pnt->next ){
811 	      rec_pnt = rec_pnt->next;
812 	    }
813 	    add_rec->parent = rec_pnt->parent;
814 	    rec_pnt->next = add_rec;
815 	    rec_pnt = rec_pnt->next;
816 	  }
817 	  current_depth = rec_pnt->depth;
818 	}
819 	else {
820 	  if ( _debug ){
821 	    DBG << "name=" << local_name << " atts=" << atts << endl;
822 	    DBG << "create_simple_tree() node in alien namespace '"
823 		<< nsu << "' is SKIPPED!" << endl;
824 	  }
825 	}
826 	++index;
827       }
828     }
829     if ( xmlTextReaderReadState(cur_reader) < 0 ){
830       throw runtime_error( "create_simple_tree() failed" );
831     }
832     xmlFreeTextReader( cur_reader );
833     return records;
834   }
835 
count_nodes(FoliaElement * fe)836   int count_nodes( FoliaElement *fe ){
837     /// count all 'real' FoliaElements including and below this one
838     /*!
839       \param fe the The element to start at
840       \return the 'size' of the subtree below fe. We need this number to know
841       where to proceed processing
842     */
843     int result = 0;
844     //    cerr << "DEPTH " << fe << endl;
845     if ( fe
846 	 && fe->xmltag() != "_XmlText"
847 	 && fe->element_id() != HeadFeature_t
848 	 && !isAttributeFeature(fe->xmltag()) ){
849       result += 1;
850       if ( fe->size() > 0 ){
851 	//	cerr << "size=" << fe->size() << endl;
852 	for ( size_t i=0; i < fe->size(); ++i ){
853 	  //	  cerr << "i=" << i << endl;
854 	  result += count_nodes( fe->index(i) );
855 	}
856       }
857     }
858     //    cerr << "return DEPTH " << fe << " ="  << result << endl;
859     return result;
860   }
861 
handle_content(const string & t_or_ph,int new_depth)862   int Engine::handle_content( const string& t_or_ph, int new_depth ){
863     /// process a matched 't' or 'ph' tag into a FoLiA subtree
864     /*!
865       \param t_or_ph a t or ph tags
866       \param new_depth the location in the Document to attach to
867       \return the number of FoliaElement nodes added
868     */
869     KWargs atts = get_attributes( _reader );
870     if ( _debug ){
871       DBG << "expanding content of <" << t_or_ph << "> atts=" << atts << endl;
872     }
873     FoliaElement *t = AbstractElement::createElement( t_or_ph, _out_doc );
874     if ( t ){
875       t->setAttributes( atts );
876       // just take as is...
877       xmlNode *fd = xmlTextReaderExpand(_reader);
878       t->parseXml( fd );
879       if ( _debug ){
880 	DBG << "parsed " << t << endl;
881       }
882       append_node( t, new_depth );
883       // skip subtree
884       xmlTextReaderNext(_reader);
885       int type = xmlTextReaderNodeType(_reader);
886       if ( type == XML_READER_TYPE_TEXT ){
887 	string value = (const char*)xmlTextReaderConstValue(_reader);
888 	string trimmed = TiCC::trim(value);
889 	if ( !trimmed.empty() ){
890 	  throw XmlError( "spurious text " + trimmed + " found after node <"
891 			  + t_or_ph + ">" );
892 	}
893       }
894       return count_nodes( t );
895     }
896     else {
897       _ok = false;
898       throw XmlError( "folia::engine failed to create node: " + t_or_ph );
899     }
900   }
901 
handle_element(const string & local_name,int new_depth)902   void Engine::handle_element( const string& local_name,
903 			       int new_depth ){
904     /// process a matched tag into a FoLiA subtree
905     /*!
906       \param local_name the tag
907       \param new_depth the location in the Document to attach to
908     */
909     KWargs atts = get_attributes( _reader );
910     if ( _debug ){
911       DBG << "name=" << local_name << " atts=" << atts << endl;
912     }
913     if ( local_name == "wref" ){
914       string id = atts["id"];
915       if ( id.empty() ){
916 	_ok = false;
917 	throw XmlError( "folia::engine, reference missing an 'id'" );
918       }
919       FoliaElement *ref = (*_out_doc)[id];
920       if ( !ref ){
921 	_ok = false;
922 	throw XmlError( "folia::engine, unresolvable reference: "
923 			+ id );
924       }
925       ref->increfcount();
926       append_node( ref, new_depth );
927     }
928     else {
929       FoliaElement *t = AbstractElement::createElement( local_name, _out_doc );
930       if ( t ){
931 	if ( local_name == "foreign-data" ){
932 	  xmlNode *fd = xmlTextReaderExpand(_reader);
933 	  t->parseXml( fd );
934 	  append_node( t, new_depth );
935 	  // skip subtree
936 	  xmlTextReaderNext(_reader);
937 	}
938 	else {
939 	  string nsu;
940 	  for ( auto const& v : atts ){
941 	    if ( v.first.find("xmlns:") == 0 ){
942 	      nsu = v.second;
943 	      break;
944 	    }
945 	  }
946 
947 	  // We could use std::find_if here, but that is less readable:
948 	  // auto const& a = find_if( atts.begin(), atts.end(),
949 	  // 			   []( const pair<string,string>& av ){
950 	  // 			     return av.first.find("xmlns:") == 0;
951 	  // 			   } );
952 	  // if ( a != atts.end() ){
953 	  //   nsu = a->second;
954 	  // }
955 
956 	  if ( nsu.empty() || nsu == NSFOLIA ){
957 	    if ( local_name == "desc"
958 		 || local_name == "content"
959 		 || local_name == "comment" ){
960 	      if ( xmlTextReaderIsEmptyElement(_reader) ){
961 		if ( _debug ){
962 		  DBG << "Element is empty." << endl;
963 		}
964 	      }
965 	      else {
966 		xmlTextReaderRead(_reader);
967 		const char *val = (const char*)xmlTextReaderConstValue(_reader);
968 		if ( val ) {
969 		  if ( _debug ){
970 		    DBG << "processing a <" << local_name << "> with value '"
971 			<< val << "'" << endl;
972 		  }
973 		  atts["value"] = val;
974 		}
975 		else {
976 		  if ( _debug ){
977 		    DBG << "processing a <" << local_name
978 			<< "> with empty value " << endl;
979 		  }
980 		}
981 	      }
982 	    }
983 	    if ( _debug ){
984 	      DBG << "SET ATTRIBUTES: " << atts << endl;
985 	    }
986 	    t->setAttributes( atts );
987 	    append_node( t, new_depth );
988 	  }
989 	  else {
990 	    if ( _debug ){
991 	      DBG << "a node in an alien namespace'" << nsu << endl;
992 	    }
993 	    // just take as is...
994 	    append_node( t, new_depth );
995 	    xmlNode *fd = xmlTextReaderExpand(_reader);
996 	    t->parseXml( fd );
997 	    // skip subtree
998 	    xmlTextReaderNext(_reader);
999 	  }
1000 	}
1001       }
1002       else {
1003 	_ok = false;
1004 	throw XmlError( "folia::engine failed to create node: "
1005 			+ local_name );
1006       }
1007     }
1008   }
1009 
output_header()1010   bool Engine::output_header(){
1011     /// output the 'header' of the Folia document to the associated output
1012     /// stream
1013 
1014     /// This outputs ALL metadata from the Document upto and including
1015     /// the opening \<text> of \<speech> node
1016     if ( _debug ){
1017       DBG << "Engine::output_header()" << endl;
1018     }
1019     if ( !_os ){
1020       throw logic_error( "folia::Engine::output_header() impossible. No output file specified!" );
1021       return false;
1022     }
1023     if ( _finished ){
1024       return true;
1025     }
1026     else if ( _header_done ){
1027       throw logic_error( "folia::Engine::output_header() is called twice!" );
1028       return false;
1029     }
1030     _header_done = true;
1031     stringstream ss;
1032     _out_doc->save( ss, ns_prefix );
1033     string data = ss.str();
1034     string search_b1;
1035     string search_b2;
1036     string search_e;
1037     if ( _doc_type == TEXT ){
1038       if ( !ns_prefix.empty() ){
1039 	search_b1 = "<" + ns_prefix + ":" + "text>";
1040 	search_b2 = "<" + ns_prefix + ":" + "text ";
1041 	search_e = "</" + ns_prefix + ":" + "text>";
1042       }
1043       else {
1044 	search_b1 = "<text>";
1045 	search_b2 = "<text ";
1046 	search_e = "</text>";
1047       }
1048     }
1049     else {
1050       if ( !ns_prefix.empty() ){
1051 	search_b1 = "<" + ns_prefix + ":" + "speech>";
1052 	search_b2 = "<" + ns_prefix + ":" + "speech ";
1053 	search_e = "</" + ns_prefix + ":" + "speech>";
1054       }
1055       else {
1056 	search_b1 = "<speech>";
1057 	search_b2 = "<speech ";
1058 	search_e = "</speech>";
1059       }
1060     }
1061     string::size_type bpos1 = data.find( search_b1 );
1062     string::size_type bpos2 = data.find( search_b2 );
1063     string::size_type pos1;
1064     if ( bpos1 < bpos2 ){
1065       pos1 = bpos1;
1066     }
1067     else {
1068       pos1 = bpos2;
1069     }
1070     string::size_type pos2;
1071     if ( _root_node->size() == 0 ){
1072       pos2 = data.find( "/>" , pos1 );
1073     }
1074     else {
1075       pos2 = data.find( ">" , pos1 );
1076     }
1077     string head = data.substr( 0, pos2 ) + ">";
1078     if ( _root_node->size() == 0 ){
1079       pos2 += 2;
1080     }
1081     else {
1082       pos2 = data.find( search_e, pos1 );
1083       int add = search_e.size();
1084       pos2 += add;
1085     }
1086     _footer = "  " + search_e + data.substr( pos2 );
1087     *_os << head << endl;
1088     return true;
1089   }
1090 
output_footer()1091   bool Engine::output_footer(){
1092     /// output the remains of the associated Document
1093     /// might call flush() first
1094 
1095     /// further processing in this Engine is illegal
1096     if ( _debug ){
1097       DBG << "Engine::output_footer()" << endl;
1098     }
1099     if ( _finished ){
1100       return true;
1101     }
1102     if ( !_os ){
1103       throw logic_error( "folia::Engine::output_footer() impossible. No output file specified!" );
1104       return false;
1105     }
1106     else if ( flush() ){
1107       *_os << _footer << endl;
1108       _finished = true;
1109       return true;
1110     }
1111     else {
1112       return false;
1113     }
1114   }
1115 
flush()1116   bool Engine::flush() {
1117     /// output all NEW information in the output Document to the output stream
1118 
1119     /// may call output_header() first
1120     if ( _debug ){
1121       DBG << "Engine::flush()" << endl;
1122     }
1123     if ( !_os ){
1124       throw logic_error( "folia::Engine::flush() impossible. No outputfile specified!" );
1125       return false;
1126     }
1127     if ( _finished ){
1128       return true;
1129     }
1130     else if ( !_header_done ){
1131       output_header();
1132     }
1133     stack<FoliaElement*> rem_list;
1134     size_t len = _root_node->size();
1135     for ( size_t i=0; i < len; ++i ){
1136       rem_list.push( _root_node->index(i) );
1137       *_os << "    " << _root_node->index(i)->xmlstring(true,2,false) << endl;
1138     }
1139     while ( !rem_list.empty() ){
1140       // we've kept a stack of elements to remove, as removing at the back
1141       // is the safest and cheapest thing to do
1142       _root_node->remove( rem_list.top() );
1143       destroy( rem_list.top() );
1144       rem_list.pop();
1145     }
1146     return true;
1147   }
1148 
finish()1149   bool Engine::finish() {
1150     /// finalize the Engine bij calling output_footer
1151     if ( _debug ){
1152       DBG << "Engine::finish()" << endl;
1153     }
1154     if ( !_os ){
1155       throw logic_error( "folia::Engine::finish() impossible. No outputfile specified!" );
1156       return false;
1157     }
1158     if ( _finished ){
1159       return true;
1160     }
1161     return output_footer();
1162   }
1163 
save(const string & name,bool do_canon)1164   void Engine::save( const string& name, bool do_canon ){
1165     /// save the associated Document to a file
1166     /*!
1167       \param name the file-name
1168       \param do_canon output in Canonical format
1169     */
1170     if ( _os && name == _out_name ){
1171       throw logic_error( "folia::Engine::save() impossible. Already connected to a stream with the same name (" + name + ")" );
1172     }
1173     _out_doc->save( name, ns_prefix, do_canon );
1174   }
1175 
save(ostream & os,bool do_canon)1176   void Engine::save( ostream& os, bool do_canon ){
1177     /// save the associated Document to a stream
1178     /*!
1179       \param os the stream
1180       \param do_canon output in Canonical format
1181     */
1182     _out_doc->save( os, ns_prefix, do_canon );
1183   }
1184 
1185 
init_doc(const string & i,const string & o)1186   bool TextEngine::init_doc( const string& i, const string& o ){
1187     /// init an associated document for this TextEngine
1188     /*!
1189       \param i the input file to use for parsing
1190       \param o when not empty, add an output-file with this name
1191 
1192       Sets the _in_file property to i and marks _is_setup FALSE
1193       then calls Engine::init_doc to do the real work.
1194     */
1195     _in_file = i;
1196     _is_setup = false;
1197     //    set_debug(true);
1198     return Engine::init_doc( i, o );
1199   }
1200 
setup(const string & textclass,bool prefer_struct)1201   void TextEngine::setup( const string& textclass, bool prefer_struct ){
1202     /// set the TextEngine ready for parsing
1203     /*!
1204       \param textclass Determines which textnodes to search for
1205       \param prefer_struct If TRUE, set the TextEngine up for returning
1206       Structure nodes like sentences or paragraphs above returning
1207       just Word or String nodes
1208     */
1209     string txtc = textclass;
1210     if ( txtc == "current" ){
1211       txtc.clear();
1212     }
1213     text_parent_map = enumerate_text_parents( txtc, prefer_struct );
1214     _next_text_node = _start_index;
1215     if ( !text_parent_map.empty() ){
1216       _next_text_node = text_parent_map.begin()->first;
1217     }
1218     _node_count = _start_index;
1219     _is_setup = true;
1220   }
1221 
get_structure_parent(const xml_tree * pnt)1222   xml_tree *get_structure_parent( const xml_tree *pnt ){
1223     ///  return the nearest StructureElement above this node
1224     /*!
1225       \param pnt a (text) element in the simple tree.
1226       \return the first parent which is an AbstractStructureElement
1227       and NOT a Word
1228     */
1229     if ( pnt->parent->tag != "w"
1230 	 && isSubClass( stringToElementType(pnt->parent->tag),
1231 			AbstractStructureElement_t ) ){
1232       return pnt->parent;
1233     }
1234     else {
1235       return get_structure_parent( pnt->parent );
1236     }
1237   }
1238 
search_text_parents(const xml_tree * start,const string & textclass,bool prefer_struct) const1239   map<int,int> TextEngine::search_text_parents( const xml_tree* start,
1240 						const string& textclass,
1241 						bool prefer_struct ) const{
1242     /// scan the whole TextEngine for TextContent nodes
1243     /*!
1244       \param start the tree to search
1245       \param textclass the text-class we are interested in
1246       \param prefer_struct If TRUE, set the TextEngine up for returning
1247       Structure nodes like sentences or paragraphs above returning
1248       just Word or String nodes
1249       \return a map containing for every found text_parent the index of
1250       the NEXT value to search. TO DO: very mysty and mystic
1251     */
1252     map<int,int> result;
1253     const xml_tree *pnt = start;
1254     while ( pnt ){
1255       if ( _debug ){
1256 	DBG << "bekijk:" << pnt->tag << "-" << pnt->index << endl;
1257       }
1258       if ( pnt->tag == "wref"
1259 	   || pnt->tag == "original" ){
1260 	//
1261 	// DON'T see a wref as a valid textparent.
1262 	// The word is connected elsewhere too
1263 	// Also an 'original' node is assumed to be part of a correction
1264 	// so hope for a 'new' node to be found!
1265 	pnt = pnt->next;
1266 	continue;
1267       }
1268       map<int,int> deeper = search_text_parents( pnt->link,
1269 						 textclass,
1270 						 prefer_struct );
1271       if ( !deeper.empty() ){
1272 	if ( _debug ){
1273 	  DBG << "deeper we found: " << deeper << endl;
1274 	}
1275 	result.insert( deeper.begin(), deeper.end() );
1276       }
1277       pnt = pnt->next;
1278     }
1279     if ( result.empty() ){
1280       // so no deeper text found
1281       // lets see at this level....
1282       pnt = start;
1283       while ( pnt ){
1284 	if ( pnt->tag == "t" && pnt->textclass == textclass ){
1285 	  // OK text in the right textclass
1286 	  if ( prefer_struct ){
1287 	    // search for a suitable parent
1288 	    xml_tree *par = get_structure_parent( pnt );
1289 	    int index = par->index;
1290 	    int next = INT_MAX;
1291 	    if ( par->next ){
1292 	      next = par->index;
1293 	    }
1294 	    result[index] = next;
1295 	    break;
1296 	  }
1297 	  else {
1298 	    int index = pnt->parent->index;
1299 	    int next = INT_MAX;
1300 	    if ( pnt->parent->next ){
1301 	      next = pnt->parent->next->index;
1302 	    }
1303 	    else if ( pnt->parent->parent->next ){
1304 	      next = pnt->parent->parent->next->index;
1305 	    }
1306 	    result[index] = next;
1307 	    break;
1308 	  }
1309 	}
1310 	pnt = pnt->next;
1311       }
1312     }
1313     if ( _debug && start && !result.empty() ){
1314       DBG << "return " << result << " for " << start->parent->tag << endl;
1315     }
1316     return result;
1317   }
1318 
enumerate_text_parents(const string & textclass,bool prefer_struct)1319   const map<int,int>& TextEngine::enumerate_text_parents( const string& textclass,
1320 							  bool prefer_struct ) {
1321     /// Loop over the full input, looking for textnodes in class 'textclass'
1322     /*!
1323       \param textclass the text-class we are interested in
1324       \param prefer_struct If TRUE, set the TextEngine up for returning
1325       Structure nodes like sentences or paragraphs above returning
1326       just Word or String nodes
1327       \return a reference to a map of text parent nodes
1328 
1329       this function recurses to the DEEPEST text possible, and enumerates their
1330       parents. It creates a mapping of text parents indices to their successor
1331     */
1332     if ( _done ){
1333       throw runtime_error( "enumerate_text_parents() called on a done engine" );
1334     }
1335     if ( _debug ){
1336       DBG << "enumerate_text_parents(" << textclass << ")" << endl;
1337     }
1338     //
1339     // we start by creating a tree of all nodes
1340     xml_tree *tree = create_simple_tree(_in_file);
1341     //
1342     // now search that tree for nodes in 'textclass'
1343     // if is a <t>, then remember the index of its parent
1344     // but when 'prefer_struct' is specified, return the direct structure above
1345     // when present.
1346     text_parent_map.clear();
1347     xml_tree *rec_pnt = tree;
1348     while ( rec_pnt ){
1349       map<int,int> deeper = search_text_parents( rec_pnt->link,
1350 						 textclass,
1351 						 prefer_struct );
1352       text_parent_map.insert( deeper.begin(), deeper.end() );
1353       rec_pnt = rec_pnt->next;
1354     }
1355     if ( _debug ){
1356       DBG << "complete tree: " << endl;
1357       print( DBG, tree );
1358       DBG << "Search map = " << text_parent_map << endl;
1359     }
1360     for ( auto it = text_parent_map.begin();
1361 	  it != text_parent_map.end();
1362 	  ++it ){
1363       auto nit = it;
1364       ++nit;
1365       if ( nit != text_parent_map.end() ){
1366 	it->second = nit->first;
1367       }
1368     }
1369     if ( _debug ){
1370       DBG << "Reduced Search map = " << text_parent_map << endl;
1371     }
1372     delete tree;
1373     return text_parent_map;
1374   }
1375 
next_text_parent()1376   FoliaElement *TextEngine::next_text_parent(){
1377     /// return the next node to handle
1378     /*!
1379       \return a FoLiAElement pointer to a 'textparent' subtree, or 0 when done
1380 
1381       The caller may use this pointer to modify the subtree BELOW that pointer
1382       at will.
1383 
1384       next_text_parent should be called until no more candidates are found.
1385       At that moment, the complete input FoLiA is parsed and stored in _out_doc
1386       adn can be saved or handled over for further processing.
1387 
1388     */
1389     if ( _done ){
1390       if ( _debug ){
1391 	DBG << "next_text_parent(). engine is done" << endl;
1392       }
1393       return 0;
1394     }
1395     if ( !_is_setup ){
1396       throw runtime_error( "TextEngine: not setup yet!" );
1397     }
1398     if ( text_parent_map.empty() ){
1399       if ( _debug ){
1400 	DBG << "next_text_parent(). the parent map is empty." << endl;
1401       }
1402       return 0;
1403     }
1404 
1405     int ret = 0;
1406     if ( _external_node != 0 ){
1407       // so our last action was to output a pointer to a subtree.
1408       // continue with the next node, avoiding the subtree
1409       _external_node = 0;
1410       ret = xmlTextReaderNext(_reader);
1411     }
1412     else {
1413       // so we are the first time here, get first result
1414       ret = xmlTextReaderRead(_reader);
1415     }
1416     if ( ret == 0 ){
1417       if ( _debug ){
1418 	DBG << "next_text_parent(), DONE" << endl;
1419       }
1420       _done = true;
1421       return 0;
1422     }
1423     while ( ret ){
1424       int type = xmlTextReaderNodeType(_reader);
1425       if ( _debug ){
1426 	DBG << "MAIN LOOP search next_text_parent(), type=" << type
1427 	    << " current node=" << _node_count
1428 	    << " search for node=" << _next_text_node << endl;
1429       }
1430       int new_depth = xmlTextReaderDepth(_reader);
1431       switch ( type ){
1432       case XML_READER_TYPE_ELEMENT: {
1433 	string local_name = (const char*)xmlTextReaderConstLocalName(_reader);
1434 	if ( _debug ){
1435 	  DBG << "next element: " << local_name << " cnt =" << _node_count << endl;
1436 	}
1437 	if ( _node_count == _next_text_node  ){
1438 	  // HIT!
1439 	  if ( _debug ){
1440 	    DBG << "at index=" << _node_count << " WE HIT a next element for: " << local_name << endl;
1441 	  }
1442 	  _external_node = handle_match( local_name, new_depth );
1443 	  int skips = count_nodes( _external_node );
1444 	  // we are to output a tree of skips nodes
1445 	  _node_count += skips; // so next time we resume with this count
1446 	  _next_text_node = text_parent_map[_next_text_node];
1447 	  // and we have to search for _next_text_node
1448 	  if ( _debug ){
1449 	    DBG << " increment _node_count with: " << skips << " to "
1450 		<< _node_count << " searching for: "
1451 		<< _next_text_node << endl;
1452 	  }
1453 	  return _external_node;
1454 	}
1455 	else if ( local_name == "t"
1456 		  || local_name == "ph" ){
1457 	  _node_count += handle_content( local_name, new_depth );
1458 	}
1459 	else {
1460 	  handle_element( local_name, new_depth );
1461 	  ++_node_count;
1462 	}
1463       }
1464 	break;
1465       case XML_READER_TYPE_TEXT: {
1466 	add_text( new_depth );
1467       }
1468 	break;
1469       case XML_READER_TYPE_COMMENT: {
1470 	add_comment( new_depth );
1471       }
1472 	break;
1473       default: {
1474 	add_default_node( new_depth );
1475       }
1476 	break;
1477       }
1478       ret = xmlTextReaderRead(_reader);
1479     }
1480     _done = true;
1481     return 0;
1482   }
1483 
1484 } // namespace folia
1485