1 /*
2   Copyright (c) 2006 - 2021
3   CLST  - Radboud University
4   ILK   - Tilburg University
5 
6   This file is part of libfolia
7 
8   libfolia is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   libfolia is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program; if not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ticcutils/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 */
26 
27 #include <cassert>
28 #include <cstdlib>
29 #include <iostream>
30 #include <fstream>
31 #include <sstream>
32 #include <string>
33 #include <set>
34 #include <list>
35 #include <vector>
36 #include <map>
37 #include <algorithm>
38 #include <type_traits>
39 #include <stdexcept>
40 #include "ticcutils/PrettyPrint.h"
41 #include "ticcutils/StringOps.h"
42 #include "ticcutils/XMLtools.h"
43 #include "ticcutils/Unicode.h"
44 #include "libfolia/folia.h"
45 #include "libfolia/folia_properties.h"
46 #include "config.h"
47 
48 using namespace std;
49 using namespace icu;
50 using namespace TiCC;
51 
52 namespace folia {
53   using TiCC::operator <<;
54 
VersionName()55   string VersionName() { return PACKAGE_STRING; } ///< Returns the PACKAGE_STRING info of the package
Version()56   string Version() { return VERSION; }  ///< Returns version of the library
57 
element_id() const58   ElementType AbstractElement::element_id() const {
59     /// return the ELEMENT_ID property
60     return _props.ELEMENT_ID;
61   }
62 
occurrences() const63   size_t AbstractElement::occurrences() const {
64     /// return the OCCURENCES property
65     return _props.OCCURRENCES;
66   }
67 
occurrences_per_set() const68   size_t AbstractElement::occurrences_per_set() const {
69     /// return the OCCURRENCES_PER_SET property
70     return _props.OCCURRENCES_PER_SET;
71   }
72 
required_attributes() const73   Attrib AbstractElement::required_attributes() const {
74     /// return the REQUIRED_ATTRIBUTES property
75     return _props.REQUIRED_ATTRIBS;
76   }
77 
optional_attributes() const78   Attrib AbstractElement::optional_attributes() const {
79     /// return the OPTONAL_ATTRIBUTES property
80     return _props.OPTIONAL_ATTRIBS;
81   }
82 
hidden() const83   bool AbstractElement::hidden() const {
84     /// return the HIDDEN property
85     return _props.HIDDEN;
86   }
87 
xmltag() const88   const string& AbstractElement::xmltag() const {
89     /// return the XMLTAG property
90     /*
91       For pre 1.5 documents, it will return the OLD name of that property.
92       e.g. "spanrelation" is translated to the old "complexalignment"
93     */
94     const string& result = _props.XMLTAG;
95     if ( doc() && doc()->version_below(1,6) ){
96       const auto& it = reverse_old.find(result);
97       if ( it != reverse_old.end() ){
98 	return it->second;
99       }
100     }
101     return result;
102   }
103 
default_subset() const104   const string& AbstractElement::default_subset() const {
105     /// return the SUBSET property
106     return _props.SUBSET;
107   }
108 
annotation_type() const109   AnnotationType AbstractElement::annotation_type() const {
110     /// return the ANNOTATIONTYPE property
111     return _props.ANNOTATIONTYPE;
112   }
113 
accepted_data() const114   const set<ElementType>& AbstractElement::accepted_data() const {
115     /// return the ACCEPTED_DATA property
116     return _props.ACCEPTED_DATA;
117   }
118 
required_data() const119   const set<ElementType>& AbstractElement::required_data() const {
120     /// return the REQUIRED_DATA property
121     return _props.REQUIRED_DATA;
122   }
123 
printable() const124   bool AbstractElement::printable() const {
125     /// return the PRINTABLE property
126     return _props.PRINTABLE;
127   }
128 
speakable() const129   bool AbstractElement::speakable() const {
130     /// return the SPEAKABLE property
131     return _props.SPEAKABLE;
132   }
133 
referable() const134   bool AbstractElement::referable() const {
135     /// return the WREFABLE property
136     return _props.WREFABLE;
137   }
138 
is_textcontainer() const139   bool AbstractElement::is_textcontainer() const {
140     /// return the TEXTCONTAINER property
141     return _props.TEXTCONTAINER;
142   }
143 
implicitspace() const144   bool AbstractElement::implicitspace() const {
145     /// return the IMPLICITSPACE property
146     return _props.IMPLICITSPACE;
147   }
148 
is_phoncontainer() const149   bool AbstractElement::is_phoncontainer() const {
150     /// return the PHONCONTAINER property
151     return _props.PHONCONTAINER;
152   }
153 
text_delimiter() const154   const string& AbstractElement::text_delimiter() const {
155     /// return the TEXTDELIMITER property
156     return _props.TEXTDELIMITER;
157   }
158 
xlink() const159   bool AbstractElement::xlink() const {
160     /// return the XLINK property
161     return _props.XLINK;
162   }
163 
auth() const164   bool AbstractElement::auth() const {
165     /// return the AUTH property
166     return _props.AUTH;
167   }
168 
setonly() const169   bool AbstractElement::setonly() const {
170     /// return the SETONLY property
171     return _props.SETONLY;
172   }
173 
auto_generate_id() const174   bool AbstractElement::auto_generate_id() const {
175     /// return the AUTO_GENERATE_ID property
176     return _props.AUTO_GENERATE_ID;
177   }
178 
is_structure(const FoliaElement * el)179   bool is_structure( const FoliaElement *el ){
180     /// test if the object is a Structure Element.
181     /*!
182       \param el the FoliaElement to test
183       \return true when the parameter is an AbstractStructureElement
184       or a derivative of an AbstractStructureElement
185     */
186     return dynamic_cast<const AbstractStructureElement*>( el ) != 0;
187   }
188 
href() const189   const string AllowXlink::href() const {
190     /// return the 'href' value of the object
191     /*!
192      * if the object has an xlink value for 'href' it is returned as a string
193      * otherwise the result is ""
194      */
195     auto it = _xlink.find("href");
196     if ( it != _xlink.end() ){
197       return it->second;
198     }
199     return "";
200   }
201 
set_tag(const string & t)202   const string AbstractElement::set_tag( const string& t ) {
203     /// set a value for the _tags attribute
204     /*!
205      * \param t the new value (may be empty)
206      * \return the old value (can be empty)
207      * thows when the FoliaElement doesn't support the tag attribute
208      */
209     Attrib supported = required_attributes() | optional_attributes();
210     if ( !(TAG & supported) ) {
211       throw ValueError( "settag is not supported for " + classname() );
212     }
213     string r = _tags;
214     _tags = t;
215     return r;
216   }
217 
operator <<(ostream & os,const FoliaElement & ae)218   ostream& operator<<( ostream& os, const FoliaElement& ae ) {
219     /// Output operator for FoliaElements. (for DEBUGGING only)
220     /*!
221      * \param os the output stream
222      * \param ae the FoliaElement
223      */
224     os << " <" << ae.classname();
225     KWargs ats = ae.collectAttributes();
226     if ( !ae.id().empty() ) {
227       os << " xml:id=\"" << ae.id() << '"';
228       ats.erase("xml:id");
229     }
230 
231     for ( const auto& it: ats ) {
232       os << " " << it.first << "=\"" << it.second << '"';
233     }
234     os << " > {";
235     for ( size_t i=0; i < ae.size(); ++i ) {
236       os << "<" << ae.index(i)->classname() << ">,";
237     }
238     os << "}";
239     if ( ae.printable() && ae.classname()[0] == 't' ){
240       os << " \"" << ae.str(ae.textclass()) << "\" (" << ae.textclass() << ")";
241     }
242     return os;
243   }
244 
operator <<(ostream & os,const FoliaElement * ae)245   ostream& operator<<( ostream&os, const FoliaElement *ae ) {
246     /// Output operator for FoliaElements. (for DEBUGGING only)
247     /*!
248      * \param os the output stream
249      * \param ae the FoliaElement
250      */
251     if ( !ae ) {
252       os << "nil";
253     }
254     else
255       os << *ae;
256     return os;
257   }
258 
259   //#define DE_AND_CONSTRUCT_DEBUG
260 
AbstractElement(const properties & p,Document * d)261   AbstractElement::AbstractElement( const properties& p, Document *d ) :
262     /// Constructor for AbstractElements.
263     /*!
264      * \param p a properties block (required)
265      * \param d a parent document
266      */
267     _mydoc(d),
268     _parent(0),
269     _auth( p.AUTH ),
270     _space(true),
271     _annotator_type(UNDEFINED),
272     _refcount(0),
273     _confidence(-1),
274     _preserve_spaces(SPACE_FLAGS::UNSET),
275     _props(p)
276   {
277 #ifdef DE_AND_CONSTRUCT_DEBUG
278     cerr << "created an : " << xmltag() << " adres=" << (void*)this << endl;
279 #endif
280   }
281 
AbstractElement(const properties & p,FoliaElement * el)282   AbstractElement::AbstractElement( const properties& p, FoliaElement *el ) :
283     /// Constructor for AbstractElements.
284     /*!
285      * \param p a properties block (required)
286      * \param el a parent node, to append to
287      */
288     AbstractElement( p, el->doc() )
289   {
290     if ( !el ){
291       throw ValueError( "AbstractElement( p, e ) called with 0 e" );
292     }
293     el->append( this );
294   }
295 
~AbstractElement()296   AbstractElement::~AbstractElement( ) {
297 #ifdef DE_AND_CONSTRUCT_DEBUG
298     cerr << "really delete " << xmltag() << " adres=" << (void*)this << endl;
299 #endif
300   }
301 
destroy()302   void AbstractElement::destroy( ) {
303     /// Pseudo destructor for AbstractElements.
304     /// recursively destroys this nodes and it's children
305     /// Will also remove it from it's parent when no references are left
306 #ifdef DE_AND_CONSTRUCT_DEBUG
307     cerr << "\ndestroy " << xmltag() << " adres=" << (void*)this
308 	 << " id=" << _id << " class= "
309 	 << cls() << " datasize= " << _data.size() << endl;
310     cerr << "REFCOUNT = " << refcount() << endl;
311     cerr << "AT= " << annotation_type() << " (" << _set << ")" << endl;
312 #endif
313     if ( doc() ) {
314       doc()->decrRef( annotation_type(), _set );
315       if ( refcount() > 0 ){
316 	decrefcount();
317 	doc()->keepForDeletion( this );
318 #ifdef DE_AND_CONSTRUCT_DEBUG
319 	cerr << "\t\tstill keeping element id=" << _id << " tag = "
320 	     << xmltag() << " adres=" << (void*)this << " class= " << cls()
321 	     << " datasize= " << _data.size() << endl;
322 #endif
323 	return;
324       }
325       doc()->del_doc_index( _id );
326     }
327     if ( _parent ){
328 #ifdef DE_AND_CONSTRUCT_DEBUG
329       cerr << "STILL A PARENT: " << _parent << endl;
330 #endif
331       _parent->remove( this );
332     }
333     for ( const auto& el : _data ) {
334       el->set_parent(0);
335       el->destroy();
336     }
337     _data.clear();
338 #ifdef DE_AND_CONSTRUCT_DEBUG
339     cerr << "\t\tfinished destroying element id=" << _id << " tag = "
340 	 << xmltag() << " adres=" << (void*)this << " class= " << cls()
341 	 << " datasize= " << _data.size() << endl;
342 #endif
343     delete this;
344   }
345 
destroy(FoliaElement * el)346   void destroy( FoliaElement *el ){
347     if ( el ){
348       el->destroy();
349     }
350   }
351 
foliaNs() const352   xmlNs *AbstractElement::foliaNs() const {
353     /// return the associated xmlNs object.
354     /*!
355      * \return the XML namespace element of the associated FoLiA document
356      * or 0 when no xml document is available
357      */
358     if ( doc() ) {
359       return doc()->foliaNs();
360     }
361     return 0;
362   }
363 
check_set_declaration()364   void AbstractElement::check_set_declaration(){
365     /// check the declation consistency of an object.
366     /// throws an exception on error
367     /*!
368      * When the object has an associated document, the declaration of the
369      * 'set' attribute is checked. Or the default set when no 'set' is provided
370      * Also the presence of an appropiate annotation declaration is checked
371      * for the annotation-type of the object. This might auto-declare
372      * the anntotation-type, when de document allows this.
373      */
374 
375     if ( isSubClass( AbstractCorrectionChild_t ) ){
376       return;
377     }
378 
379     if ( _mydoc ){
380       string def;
381       if ( !_set.empty() ){
382 	if ( !doc()->declared( annotation_type(), _set ) ) {
383 	  throw DeclarationError( "Set '" + _set
384 				  + "' is used but has no declaration " +
385 				  "for " + toString( annotation_type() )
386 				  + "-annotation" );
387 	}
388       }
389       else {
390 	if ( _mydoc->debug > 2 ) {
391 	  cerr << "get def for " <<  annotation_type() << endl;
392 	}
393 	def = doc()->default_set( annotation_type() );
394 	if ( doc()->debug > 2 ) {
395 	  cerr << "got def='" <<  def << "'" << endl;
396 	}
397 	if ( doc()->is_incremental() && def.empty() ){
398 	  // when there is NO default set, AND we are parsing using
399 	  // folia::Engine, we must check if there WAS an empty set originally
400 	  // which is 'obscured' by newly added declarations
401 	  def = doc()->original_default_set( annotation_type() );
402 	  if ( doc()->debug > 2 ) {
403 	    cerr << "from original got def='" <<  def << "'" << endl;
404 	  }
405 	}
406 	if ( !def.empty() ){
407 	  _set = def;
408 	}
409 	else if ( CLASS & required_attributes() ){
410 	  throw XmlError( "unable to assign a default set for tag: " + xmltag() );
411 	}
412       }
413       if ( annotation_type() != AnnotationType::NO_ANN
414 	   && !_mydoc->version_below( 2, 0 ) ){
415 	if ( !_mydoc->declared( annotation_type() ) ){
416 	  if ( _mydoc->autodeclare() ){
417 	    _mydoc->auto_declare( annotation_type(), _set );
418 	  }
419 	  else {
420 	    throw DeclarationError( "Encountered an instance of <"
421 				    + xmltag()
422 				    + "> without a proper "
423 				    + toString(annotation_type())
424 				    + "-annotation" );
425 	  }
426 	}
427 	else if ( _set.empty()
428 		  && !isSubClass( AbstractAnnotationLayer_t )
429 		  && !doc()->declared( annotation_type(), "None" ) ){
430 	  if ( _mydoc->autodeclare() ){
431 	    _mydoc->auto_declare( annotation_type(), _set );
432 	  }
433 	  else {
434 	    throw DeclarationError( "Encountered an instance of <"
435 				    + xmltag()
436 				    + "> without a proper "
437 				    + toString(annotation_type())
438 				    + "-annotation" );
439 	  }
440 	}
441       }
442     }
443   }
444 
445 
setAttributes(KWargs & kwargs)446   void AllowXlink::setAttributes( KWargs& kwargs ) {
447     /// set the objects attributes given a set of Key-Value pairs.
448     /*!
449      * \param kwargs a KWargs set of Key-Value pairs
450      * the given keys are checked agains a range of criteria:
451      *     - if the object supports the attribue
452      *     - if the object provided value is valid
453      *     - if the attribute is declared for the annotation-type
454      */
455     string type = "simple";
456     string val = kwargs.extract( "xlink:type" );
457     if ( !val.empty() ) {
458       type = val;
459     }
460     if ( type != "simple" && type != "locator" ) {
461       throw XmlError( "only xlink:types: 'simple' and 'locator' are supported!" );
462     }
463     _xlink["type"] = type;
464     val = kwargs.extract( "xlink:href" );
465     if ( !val.empty() ) {
466       _xlink["href"] = val;
467     }
468     else if ( type == "locator" ){
469       throw XmlError( "xlink:type='locator' requires an 'xlink:href' attribute" );
470     }
471     val = kwargs.extract( "xlink:role" );
472     if ( !val.empty() ) {
473       _xlink["role"] = val;
474     }
475     val = kwargs.extract( "xlink:title" );
476     if ( !val.empty() ) {
477       _xlink["title"] = val;
478     }
479     val = kwargs.extract( "xlink:label" );
480     if ( !val.empty() ) {
481       if ( type == "simple" ){
482 	throw XmlError( "xlink:type='simple' may not have an 'xlink:label' attribute" );
483       }
484       _xlink["label"] = val;
485     }
486     val = kwargs.extract( "xlink:arcrole" );
487     if ( !val.empty() ) {
488       if ( type == "locator" ){
489 	throw XmlError( "xlink:type='locator' may not have an 'xlink:arcrole' attribute" );
490       }
491       _xlink["arcrole"] = val;
492     }
493     val = kwargs.extract( "xlink:show" );
494     if ( !val.empty() ) {
495       if ( type == "locator" ){
496 	throw XmlError( "xlink:type='locator' may not have an 'xlink:show' attribute" );
497       }
498       _xlink["show"] = val;
499     }
500     val = kwargs.extract( "xlink:actuate" );
501     if ( !val.empty() ) {
502       if ( type == "locator" ){
503 	throw XmlError( "xlink:type='locator' may not have an 'xlink:actuate' attribute" );
504       }
505       _xlink["actuate"] = val;
506     }
507   }
508 
setAttributes(KWargs & kwargs)509   void AbstractElement::setAttributes( KWargs& kwargs ) {
510     /// set the objects attributes given a set of Key-Value pairs.
511     /*!
512      * \param kwargs a KWargs set of Key-Value pairs
513      * the given keys are checked agains a range of criteria:
514      *     - if the object supports the attribue
515      *     - if the object provided value is valid
516      *     - if the attribute is declared for the annotation-type
517      */
518     // for the moment, always look for the 'xml:space' attribute
519     string sval = kwargs.extract( "xml:space" );
520     if ( !sval.empty() ){
521       if ( sval == "preserve" ){
522 	_preserve_spaces = SPACE_FLAGS::PRESERVE;
523       }
524       else if ( sval == "default" ){
525 	_preserve_spaces = SPACE_FLAGS::DEFAULT;
526       }
527       else {
528 	throw runtime_error( "invalid value for attribute xml:space, must be "
529 			     "'default' or 'preserve', found: '" + sval + "'");
530       }
531     }
532     Attrib supported = required_attributes() | optional_attributes();
533     //#define LOG_SET_ATT
534 #ifdef LOG_SET_ATT
535     int db_level = 0;
536     if ( doc() ){
537       db_level = doc()->debug;
538     }
539     if ( element_id() == New_t
540 	 || element_id() == Original_t ) {
541       if ( doc() ){
542 	doc()->setdebug(0);
543       }
544       cerr << "set attributes: '" << kwargs << "' on " << classname() << endl;
545       //      cerr << "required = " <<  toString(required_attributes()) << endl;
546       //      cerr << "optional = " <<  optional_attributes() << endl;
547       //cerr << "supported = " << supported << endl;
548       //      cerr << "ID & supported = " << (ID & supported) << endl;
549       //      cerr << "ID & _required = " << (ID & required_attributes() ) << endl;
550       // cerr << "_id=" << _id << endl;
551       // cerr << "AUTH : " << _auth << endl;
552     }
553 #endif
554     if ( doc() && doc()->debug > 2 ) {
555       cerr << "set attributes: " << kwargs << " on " << classname() << endl;
556     }
557 
558     string val = kwargs.extract( "generate_id" );
559     if ( !val.empty() ) {
560       if ( !doc() ) {
561 	throw runtime_error( "can't generate an ID without a doc" );
562       }
563       if ( (!ID) & supported ) {
564 	throw ValueError( "generate_id: xml:id is not supported for "
565 			  + classname() );
566       }
567       if ( val == "auto()" ){
568 	FoliaElement *par = parent();
569 	if ( par ) {
570 	  _id = par->generateId( xmltag() );
571 	}
572 	else {
573 	  throw ValueError( "generate_id `auto()' not possible without parent" );
574 	}
575       }
576       else {
577 	FoliaElement *e = (*doc())[val];
578 	if ( e ) {
579 	  _id = e->generateId( xmltag() );
580 	}
581 	else {
582 	  throw ValueError("Unable to generate an id from ID= " + val );
583 	}
584       }
585     }
586     else {
587       val = kwargs.extract( "xml:id" );
588       if ( val.empty() ) {
589 	val = kwargs.extract( "_id" ); // for backward compatibility
590       }
591       if ( !val.empty() ) {
592 	if ( (!ID) & supported ) {
593 	  throw ValueError( "xml:id is not supported for " + classname() );
594 	}
595 	else if ( val == "auto()" ){
596 	  FoliaElement *par = parent();
597 	  if ( par ) {
598 	    _id = par->generateId( xmltag() );
599 	  }
600 	  else {
601 	    throw ValueError( "auto-generate of 'xml:id' not possible without parent" );
602 	  }
603 	}
604 	else if ( isNCName( val ) ){
605 	  _id = val;
606 	}
607 	else {
608 	  throw XmlError( "'" + val + "' is not a valid NCName." );
609 	}
610       }
611     }
612 
613     _set.clear();
614     val = kwargs.extract( "set" );
615     if ( !val.empty() ) {
616       if ( !doc() ) {
617 	throw ValueError( "attribute set=" + val + " is used on a node without a document." );
618       }
619       if ( !( (CLASS & supported) || setonly() ) ) {
620 	throw ValueError("attribute 'set' is not supported for " + classname());
621       }
622       else {
623 	string st = doc()->unalias( annotation_type(), val );
624 	if ( st.empty() ){
625 	  _set = val;
626 	}
627 	else {
628 	  _set = st;
629 	}
630       }
631     }
632 
633     check_set_declaration();
634 
635     _class.clear();
636     val = kwargs.extract( "class" );
637     if ( !val.empty() ) {
638       if ( !( CLASS & supported ) ) {
639 	throw ValueError("Class is not supported for " + classname() );
640       }
641       if ( element_id() != TextContent_t && element_id() != PhonContent_t ) {
642 	if ( !doc() ) {
643 	  throw ValueError( "Class=" + val + " is used on a node without a document." );
644 	}
645 	if ( _set.empty() ){
646 	  if ( !doc()->declared( annotation_type(), "None" ) ) {
647 	    cerr << endl << doc()->annotationdefaults() << endl << endl;
648 	    throw ValueError( xmltag() +": An empty set is used but that has no declaration "
649 			      "for " + toString( annotation_type() )
650 			      + "-annotation" );
651 	  }
652 	  _set = "None";
653 	}
654 	doc()->incrRef( annotation_type(), _set );
655       }
656       _class = val;
657     }
658 
659     if ( element_id() != TextContent_t && element_id() != PhonContent_t ) {
660       if ( !_class.empty() && _set.empty() ) {
661 	throw ValueError("Set is required for <" + classname() +
662 			 " class=\"" + _class + "\"> assigned without set."  );
663       }
664     }
665 
666     _annotator.clear();
667     val = kwargs.extract( "annotator" );
668     if ( !val.empty() ) {
669       if ( !(ANNOTATOR & supported) ) {
670 	throw ValueError("attribute 'annotator' is not supported for " + classname() );
671       }
672       else {
673 	_annotator = val;
674       }
675     }
676     else {
677       string def;
678       if ( doc() &&
679 	   (def = doc()->default_annotator( annotation_type(), _set )) != "" ) {
680 	_annotator = def;
681       }
682     }
683 
684     _annotator_type = UNDEFINED;
685     val = kwargs.extract( "annotatortype" );
686     if ( !val.empty() ) {
687       if ( ! (ANNOTATOR & supported) ) {
688 	throw ValueError("Annotatortype is not supported for " + classname() );
689       }
690       else {
691 	_annotator_type = stringTo<AnnotatorType>( val );
692 	if ( _annotator_type == UNDEFINED ) {
693 	  throw ValueError( "annotatortype must be 'auto' or 'manual', got '"
694 			    + val + "'" );
695 	}
696       }
697     }
698     else {
699       if ( doc() ){
700 	AnnotatorType def = doc()->default_annotatortype( annotation_type(), _set );
701 	if ( def != UNDEFINED ) {
702 	  _annotator_type = def;
703 	}
704       }
705     }
706 
707     val = kwargs.extract( "processor" );
708     if ( !val.empty() ){
709       if ( doc() && doc()->debug > 2 ) {
710 	cerr << "set processor= " << val << " on " << classname() << endl;
711       }
712       if ( annotation_type() == AnnotationType::NO_ANN ){
713 	throw ValueError( "Unable to set processor on " + classname() + ". AnnotationType is None!" );
714       }
715       if ( _set.empty() ){
716 	_set = "None";
717       }
718       if ( !(ANNOTATOR & supported) ){
719 	throw ValueError( "attribute 'processor' is not supported for " + classname() );
720       }
721       else {
722 	if ( doc() && doc()->get_processor(val) == 0 ){
723 	  throw ValueError("attribute 'processor' has unknown value: " + val );
724 	}
725 	if ( doc()
726 	     && !doc()->declared( annotation_type(), _set, "", _annotator_type, val ) ){
727 	  if (	!doc()->version_below( 2, 0 )
728 		&& doc()->autodeclare() ) {
729 	    KWargs args;
730 	    args["processor"] = val;
731 	    args["annotatortype"] = _annotator_type;
732 	    doc()->declare( annotation_type(), _set, args );
733 	  }
734 	  else {
735 	    throw DeclarationError( "Processor '" + val
736 				    + "' is used for annotationtype '"
737 				    + toString( annotation_type() )
738 				    + "' with set='" + _set +"'"
739 				    + " but there is no corresponding <annotator>"
740 				    + " referring to it in the annotation"
741 				    + " declaration block." );
742 	  }
743 	}
744 	_processor = val;
745       }
746     }
747     else if ( (ANNOTATOR & supported) && doc() ){
748       string def;
749       try {
750 	def = doc()->default_processor( annotation_type(), _set );
751       }
752       catch ( const NoDefaultError& e ){
753 	if ( doc()->is_incremental() ){
754 	  // when there is NO default processor, AND we are parsing using
755 	  // folia::Engine, we must check if there WAS a processor originally
756 	  // which is 'obscured' by newly added declarations
757 	  def = doc()->original_default_processor( annotation_type() );
758 	  if ( doc()->debug > 2 ) {
759 	    cerr << "from original got default processor='" <<  def << "'" << endl;
760 	  }
761 	}
762 	else {
763 	  throw;
764 	}
765       }
766       _processor = def;
767     }
768 
769     _confidence = -1;
770     val = kwargs.extract( "confidence" );
771     if ( !val.empty() ) {
772       if ( !(CONFIDENCE & supported) ) {
773 	throw ValueError("Confidence is not supported for " + classname() );
774       }
775       else {
776 	try {
777 	  _confidence = stringTo<double>(val);
778 	  if ( _confidence < 0 || _confidence > 1.0 ){
779 	    throw ValueError("Confidence must be a floating point number between 0 and 1, got " + TiCC::toString(_confidence) );
780 	  }
781 	}
782 	catch (...) {
783 	  throw ValueError( "invalid Confidence value: " + val
784 			    + " (not a number?)");
785 	}
786       }
787     }
788 
789     _n = "";
790     val = kwargs.extract( "n" );
791     if ( !val.empty() ) {
792       if ( !(N & supported) ) {
793 	throw ValueError("N attribute is not supported for " + classname() );
794       }
795       else {
796 	_n = val;
797       }
798     }
799     _datetime.clear();
800     val = kwargs.extract( "datetime" );
801     if ( !val.empty() ) {
802       if ( !(DATETIME & supported) ) {
803 	throw ValueError("datetime attribute is not supported for " + classname() );
804       }
805       else {
806 	string time = parseDate( val );
807 	if ( time.empty() ){
808 	  throw ValueError( "invalid datetime, must be in YYYY-MM-DDThh:mm:ss format: " + val );
809 	}
810 	_datetime = time;
811       }
812     }
813     else {
814       string def;
815       if ( doc() &&
816 	   (def = doc()->default_datetime( annotation_type(), _set )) != "" ) {
817 	_datetime = def;
818       }
819     }
820     val = kwargs.extract( "begintime" );
821     if ( !val.empty() ) {
822       if ( !(BEGINTIME & supported) ) {
823 	throw ValueError( "begintime attribute is not supported for " + classname() );
824       }
825       else {
826 	string time = parseTime( val );
827 	if ( time.empty() ) {
828 	  throw ValueError( "invalid begintime, must be in HH:MM:SS.mmm format: " + val );
829 	}
830 	_begintime = time;
831       }
832     }
833     else {
834       _begintime.clear();
835     }
836     val = kwargs.extract( "endtime" );
837     if ( !val.empty() ) {
838       if ( !(ENDTIME & supported) ) {
839 	throw ValueError( "endtime attribute is not supported for " + classname() );
840       }
841       else {
842 	string time = parseTime( val );
843 	if ( time.empty() ) {
844 	  throw ValueError( "invalid endtime, must be in HH:MM:SS.mmm format: " + val );
845 	}
846 	_endtime = time;
847       }
848     }
849     else {
850       _endtime.clear();
851     }
852 
853     val = kwargs.extract( "src" );
854     if ( !val.empty() ) {
855       if ( !(SRC & supported) ) {
856 	throw ValueError( "src attribute is not supported for " + classname() );
857       }
858       else {
859 	_src = val;
860       }
861     }
862     else {
863       _src.clear();
864     }
865     val = kwargs.extract( "tag" );
866     if ( !val.empty() ) {
867       if ( !(TAG & supported) ) {
868 	throw ValueError( "tag attribute is not supported for " + classname() );
869       }
870       else {
871 	_tags = val;
872       }
873     }
874     else {
875       _tags.clear();
876     }
877 
878     if ( SPACE & supported ){
879       _space = true;
880     }
881     val = kwargs.extract( "space" );
882     if ( !val.empty() ) {
883       if ( !(SPACE & supported) ){
884 	throw ValueError( "space attribute is not supported for " + classname() );
885       }
886       else {
887 	if ( val == "no" ) {
888 	  _space = false;
889 	}
890 	else if ( val == "yes" ) {
891 	  _space = true;
892 	}
893 	else {
894 	  throw ValueError( "invalid value for space attribute: '" + val + "'" );
895 	}
896       }
897     }
898 
899     val = kwargs.extract( "metadata" );
900     if ( !val.empty() ) {
901       if ( !(METADATA & supported) ) {
902 	throw ValueError( "Metadata attribute is not supported for " + classname() );
903       }
904       else {
905 	_metadata = val;
906 	if ( doc() && doc()->get_submetadata( _metadata ) == 0 ){
907 	  throw KeyError( "No such metadata defined: " + _metadata );
908 	}
909       }
910     }
911     else {
912       _metadata.clear();
913     }
914     val = kwargs.extract( "speaker" );
915     if ( !val.empty() ) {
916       if ( !(SPEAKER & supported) ) {
917 	throw ValueError( "speaker attribute is not supported for " + classname() );
918       }
919       else {
920 	_speaker = val;
921       }
922     }
923     else {
924       _speaker.clear();
925     }
926 
927     val = kwargs.extract( "textclass" );
928     if ( !val.empty() ) {
929       if ( !(TEXTCLASS & supported) ) {
930 	throw ValueError( "textclass attribute is not supported for " + classname() );
931       }
932       else {
933 	_textclass = val;
934       }
935     }
936     else {
937       _textclass = "current";
938     }
939 
940     val = kwargs.extract( "auth" );
941     if ( !val.empty() ){
942       _auth = stringTo<bool>( val );
943     }
944     if ( doc() && !_id.empty() ) {
945       try {
946 	doc()->add_doc_index( this );
947       }
948       catch ( const DuplicateIDError& e ){
949 	if ( element_id() != WordReference_t ){
950 	  throw;
951 	}
952       }
953     }
954     kwargs.erase("typegroup"); //this is used in explicit form only, we can safely discard it
955     addFeatureNodes( kwargs );
956 #ifdef LOG_SET_ATT
957     if ( doc() ){
958       doc()->setdebug(db_level);
959     }
960 #endif
961   }
962 
addFeatureNodes(const KWargs & kwargs)963   void AbstractElement::addFeatureNodes( const KWargs& kwargs ) {
964     /// add children to the object, based on the set of Key-Value pairs.
965     /*!
966      * \param kwargs a KWargs set of Key-Value pairs
967      * the given keys must be in the AttributeFeatures set.
968      * the values are used as class attribute for the new children
969      * will throw for unexpected attributes, except when in permisive mode
970      */
971     for ( const auto& it: kwargs ) {
972       string tag = it.first;
973       if ( tag == "head" ) {
974 	// "head" is special because the tag is "headfeature"
975 	// this to avoid conflicts with the "head" tag!
976 	tag = "headfeature";
977       }
978       if ( AttributeFeatures.find( tag ) == AttributeFeatures.end() ) {
979 	string message = "unsupported attribute: " + tag + "='" + it.second
980 	  + "' for node with tag '" + classname() + "'";
981 	if ( tag == "id" ){
982 	  message += "\ndid you mean xml:id?";
983 	}
984 	if ( doc() && doc()->permissive() ) {
985 	  cerr << message << endl;
986 	}
987 	else {
988 	  throw XmlError( message );
989 	}
990       }
991       KWargs newa;
992       newa["class"] = it.second;
993       FoliaElement *new_node = createElement( tag, doc() );
994       new_node->setAttributes( newa );
995       append( new_node );
996     }
997   }
998 
toDoubleString(double d)999   string toDoubleString( double d ){
1000     if ( d == 1.0 ){
1001       return "1.0";
1002     }
1003     else if ( d == 0.0 ){
1004       return "0.0";
1005     }
1006     else {
1007       stringstream ss;
1008       ss.precision(6);
1009       ss << d;
1010       return ss.str();
1011     }
1012   }
1013 
collectAttributes() const1014   KWargs AllowXlink::collectAttributes() const {
1015     KWargs attribs;
1016     auto it = _xlink.find("type");
1017     if ( it != _xlink.end() ){
1018       string type = it->second;
1019       if ( type == "simple" || type == "locator" ){
1020 	it = _xlink.find("href");
1021 	if ( it != _xlink.end() ){
1022 	  attribs["xlink:href"] = it->second;
1023 	  attribs["xlink:type"] = type;
1024 	}
1025 	it = _xlink.find("role");
1026 	if ( it != _xlink.end() ){
1027 	  attribs["xlink:role"] = it->second;
1028 	}
1029 	it = _xlink.find("arcrole");
1030 	if ( it != _xlink.end() ){
1031 	  attribs["xlink:arcrole"] = it->second;
1032 	}
1033 	it = _xlink.find("show");
1034 	if ( it != _xlink.end() ){
1035 	  attribs["xlink:show"] = it->second;
1036 	}
1037 	it = _xlink.find("actuate");
1038 	if ( it != _xlink.end() ){
1039 	  attribs["xlink:actuate"] = it->second;
1040 	}
1041 	it = _xlink.find("title");
1042 	if ( it != _xlink.end() ){
1043 	  attribs["xlink:title"] = it->second;
1044 	}
1045 	it = _xlink.find("label");
1046 	if ( it != _xlink.end() ){
1047 	  attribs["xlink:label"] = it->second;
1048 	}
1049       }
1050     }
1051     return attribs;
1052   }
1053 
set_typegroup(KWargs & attribs) const1054   void AbstractElement::set_typegroup( KWargs& attribs ) const {
1055     if ( isSubClass( AbstractStructureElement_t ) ){
1056       attribs["typegroup"] = "structure";
1057     }
1058     else if ( isSubClass(  Feature_t ) ){
1059       attribs["typegroup"] = "feature";
1060     }
1061     else if ( isSubClass( AbstractInlineAnnotation_t ) ){
1062       attribs["typegroup"] = "inline";
1063     }
1064     else if ( isSubClass( AbstractHigherOrderAnnotation_t ) ){
1065       attribs["typegroup"] = "higherorder";
1066     }
1067     else if ( isSubClass(  AbstractSpanRole_t ) ){
1068       attribs["typegroup"] = "spanrole";
1069     }
1070     else if ( isSubClass(  AbstractSpanAnnotation_t ) ){
1071       attribs["typegroup"] = "span";
1072     }
1073     else if ( isSubClass(  AbstractTextMarkup_t ) ){
1074       attribs["typegroup"] = "textmarkup";
1075     }
1076     else if ( isSubClass(  AbstractContentAnnotation_t ) ){
1077       attribs["typegroup"] = "content";
1078     }
1079     else if ( isSubClass(  AbstractAnnotationLayer_t ) ){
1080       attribs["typegroup"] = "layer";
1081     }
1082     else if ( isSubClass(  AbstractSubtokenAnnotation_t ) ){
1083       attribs["typegroup"] = "subtoken";
1084     }
1085     else if ( isSubClass(  AbstractCorrectionChild_t ) ){
1086       attribs["typegroup"] = "correctionchild";
1087     }
1088     else {
1089       cerr << "UNHANDLED " << element_id() << endl;
1090     }
1091   }
1092 
collectAttributes() const1093   KWargs AbstractElement::collectAttributes() const {
1094     /// extract all Attribute-Value pairs from the object
1095     /*!
1096      * \return a KWargs set of Attribute-value pairs
1097      * Might also use declaration defaults and alias declarations to extract
1098      * default values
1099      */
1100     KWargs attribs;
1101     bool Explicit = false;
1102     Attrib supported = required_attributes() | optional_attributes();
1103     if ( doc() && doc()->has_explicit() ){
1104       Explicit = true;
1105       set_typegroup( attribs );
1106     }
1107     if ( !_id.empty() ) {
1108       attribs["xml:id"] = _id;
1109     }
1110     if ( _preserve_spaces == SPACE_FLAGS::PRESERVE ) {
1111       attribs["xml:space"] = "preserve";
1112     }
1113     if ( doc() ){
1114       string default_set = doc()->default_set( annotation_type() );
1115       bool isDefaultSet = (_set == default_set);
1116       if ( Explicit && _set != "None" && !default_set.empty() ){
1117 	if ( _set.empty() ){
1118 	  attribs["set"] = default_set;
1119 	}
1120 	else {
1121 	  attribs["set"] = _set;
1122 	}
1123       }
1124       else if ( _set != "None"
1125 		&& !_set.empty()
1126 		&& !isDefaultSet ){
1127 	string ali = doc()->alias( annotation_type(), _set );
1128 	if ( ali.empty() ){
1129 	  attribs["set"] = _set;
1130 	}
1131 	else {
1132 	  attribs["set"] = ali;
1133 	}
1134       }
1135       if ( !_class.empty() ) {
1136 	attribs["class"] = _class;
1137       }
1138       if ( !_processor.empty() ){
1139 	string tmp;
1140 	try {
1141 	  tmp = doc()->default_processor( annotation_type(), _set );
1142 	  if ( Explicit ){
1143 	    attribs["processor"] = tmp;
1144 	  }
1145 	}
1146 	catch ( const NoDefaultError& ){
1147 	}
1148 	catch ( ... ){
1149 	  throw;
1150 	}
1151 	if ( tmp != _processor ){
1152 	  attribs["processor"] = _processor;
1153 	}
1154       }
1155       else {
1156 	bool isDefaultAnn = true;
1157 	if ( !_annotator.empty() &&
1158 	     _annotator != doc()->default_annotator( annotation_type(), _set ) ) {
1159 	  isDefaultAnn = false;
1160 	  attribs["annotator"] = _annotator;
1161 	}
1162 	if ( _annotator_type != UNDEFINED ){
1163 	  AnnotatorType at = doc()->default_annotatortype( annotation_type(), _set );
1164 	  if ( (!isDefaultSet || !isDefaultAnn)
1165 	       && _annotator_type != at ) {
1166 	    if ( _annotator_type == AUTO ) {
1167 	      attribs["annotatortype"] = "auto";
1168 	    }
1169 	    else if ( _annotator_type == MANUAL ) {
1170 	      attribs["annotatortype"] = "manual";
1171 	    }
1172 	  }
1173 	}
1174       }
1175     }
1176     if ( !_datetime.empty() &&
1177 	 _datetime != doc()->default_datetime( annotation_type(), _set ) ) {
1178       attribs["datetime"] = _datetime;
1179     }
1180     if ( !_begintime.empty() ) {
1181       attribs["begintime"] = _begintime;
1182     }
1183     if ( !_endtime.empty() ) {
1184       attribs["endtime"] = _endtime;
1185     }
1186     if ( !_src.empty() ) {
1187       attribs["src"] = _src;
1188     }
1189     if ( !_tags.empty() ) {
1190       attribs["tag"] = _tags;
1191     }
1192     if ( !_metadata.empty() ) {
1193       attribs["metadata"] = _metadata;
1194     }
1195     if ( !_speaker.empty() ) {
1196       attribs["speaker"] = _speaker;
1197     }
1198     if ( ( TEXTCLASS & supported)
1199 	 && ( !_textclass.empty() &&
1200 	      ( _textclass != "current" || Explicit ) ) ){
1201       attribs["textclass"] = _textclass;
1202     }
1203 
1204     if ( _confidence >= 0 ) {
1205       attribs["confidence"] = toDoubleString(_confidence);
1206     }
1207     if ( !_n.empty() ) {
1208       attribs["n"] = _n;
1209     }
1210     if ( !_auth ) {
1211       attribs["auth"] = "no";
1212     }
1213     if ( SPACE & optional_attributes() ){
1214       if ( !_space ) {
1215 	attribs["space"] = "no";
1216       }
1217     }
1218     return attribs;
1219   }
1220 
xmlstring(bool add_ns) const1221   const string FoliaElement::xmlstring( bool add_ns ) const{
1222     /// serialize a FoLiAElement to a string (XML fragment)
1223     /*!
1224      * \param add_ns Also add the NameSpace declarations
1225      * \return a string representation of the FoLiA XML
1226      */
1227     return xmlstring( false, 0, add_ns );
1228   }
1229 
xmlstring(bool format,int indent,bool add_ns) const1230   const string FoliaElement::xmlstring( bool format,
1231 					int indent,
1232 					bool add_ns ) const{
1233     /// serialize a FoLiAElement to a string (XML fragment)
1234     /*!
1235      * \param format allow output formating
1236      * \param indent number of spaces to indent
1237      * \param add_ns Also add the NameSpace declarations
1238      * \return a string representation of the FoLiA XML
1239      */
1240     xmlNode *n = xml( true, false );
1241     if ( add_ns ){
1242       xmlSetNs( n, xmlNewNs( n, (const xmlChar *)NSFOLIA.c_str(), 0 ) );
1243     }
1244     xmlBuffer *buf = xmlBufferCreate();
1245     //    xmlKeepBlanksDefault(0);
1246     xmlNodeDump( buf, 0, n, indent, (format?1:0) );
1247     string result = (const char*)xmlBufferContent( buf );
1248     xmlBufferFree( buf );
1249     xmlFreeNode( n );
1250     return result;
1251   }
1252 
tagToAtt(const FoliaElement * c)1253   string tagToAtt( const FoliaElement* c ) {
1254     /// helper function. Given an element of type Feature_t, return the tag value
1255     /*!
1256      * \param c some FoLiAElement
1257      * \return the string value of attribute related to the tag of the parameter
1258      * if the element is of type Feature_t is has an asscociated attribute
1259      * otherwise not, and the empty string is returned.
1260      */
1261     string att;
1262     if ( c->isSubClass( Feature_t ) ) {
1263       att = c->xmltag();
1264       if ( att == "feat" ) {
1265 	// "feat" is a Feature_t too. exclude!
1266 	att = "";
1267       }
1268       else if ( att == "headfeature" ) {
1269 	// "head" is special
1270 	att = "head";
1271       }
1272     }
1273     return att;
1274   }
1275 
1276 
CheckText(const FoliaElement * parent,const FoliaElement * child,const string & cls)1277   void CheckText( const FoliaElement *parent,
1278 		  const FoliaElement *child,
1279 		  const string& cls ){
1280     if ( parent
1281 	 && parent->element_id() != Correction_t
1282 	 && parent->hastext( cls ) ){
1283       // check text consistency for parents with text
1284       // but SKIP Corrections
1285       UnicodeString s1 = parent->stricttext( cls );
1286       UnicodeString s2 = child->stricttext( cls );
1287       // cerr << "check parent: " << s1 << endl;
1288       // cerr << "check child: " << s2 << endl;
1289       // no retain tokenization, strict for both
1290       s1 = normalize_spaces( s1 );
1291       s2 = normalize_spaces( s2 );
1292       if ( !s1.isEmpty() && !s2.isEmpty() ){
1293 	bool test_fail;
1294 	if ( child->isSubClass( TextContent_t )
1295 	     || child->isSubClass( AbstractTextMarkup_t )
1296 	     || child->isSubClass( String_t )
1297 	     || child->isSubClass( Word_t ) ){
1298 	  // Words and Strings are 'per definition' PART of their parents
1299 	  test_fail = ( s1.indexOf( s2 ) < 0 ); // aren't they?
1300 	}
1301 	else {
1302 	  // otherwise an exacte match is needed
1303 	  test_fail = ( s1 != s2 );
1304 	}
1305 	if ( test_fail ){
1306 	  throw InconsistentText( "adding text (class="
1307 				  + cls + ") from node: " + child->xmltag()
1308 				  + "(" + child->id() + ")"
1309 				  + " with value\n'" + TiCC::UnicodeToUTF8(s2)
1310 				  + "'\n to element: " + parent->xmltag() +
1311 				  + "(" + parent->id() + ") which already has "
1312 				  + "text in that class and value: \n'"
1313 				  + TiCC::UnicodeToUTF8(s1) + "'\n" );
1314 	}
1315       }
1316     }
1317   }
1318 
CheckText2(const FoliaElement * parent,const FoliaElement * child,const string & cls,bool trim_spaces)1319   void  CheckText2( const FoliaElement *parent,
1320 		    const FoliaElement *child,
1321 		    const string& cls,
1322 		    bool trim_spaces ){
1323     if ( parent
1324 	 && parent->hastext( cls ) ){
1325       // check text consistency for parents with text
1326       // but SKIP Corrections
1327       // no retain tokenization, strict for parent, deeper for child
1328       TextPolicy tp( cls );
1329       tp.set( TEXT_FLAGS::STRICT );
1330       if ( !trim_spaces ) {
1331 	tp.set( TEXT_FLAGS::NO_TRIM_SPACES );
1332       }
1333       UnicodeString s1 = parent->text( tp );
1334       tp.clear( TEXT_FLAGS::STRICT );
1335       UnicodeString s2 = child->text( tp );
1336       s1 = normalize_spaces( s1 );
1337       s2 = normalize_spaces( s2 );
1338       bool test_fail;
1339       if ( child->isSubClass( Word_t )
1340 	   || child->isSubClass( String_t )
1341 	   || child->isSubClass( AbstractTextMarkup_t ) ) {
1342 	// Words, Strings and AbstractTextMarkup are 'per definition' PART of
1343 	// their text parents
1344 	test_fail = ( s1.indexOf( s2 ) < 0 ); // aren't they?
1345       }
1346       else {
1347 	// otherwise an exacte match is needed
1348 	test_fail = ( s1 != s2 );
1349       }
1350       if ( test_fail ){
1351         bool warn_only = false;
1352         if ( trim_spaces ) {
1353 	  //ok, we failed according to the >v2.4.1 rules
1354 	  //but do we also fail under the old rules?
1355 	  try {
1356 	    child->check_text_consistency(false);
1357 	    warn_only = true;
1358 	  } catch ( const InconsistentText& ) {
1359 	    //ignore, we raise the newer error
1360 	  }
1361         }
1362 	string msg = "conflicting text (class="
1363 	  + cls + ") from node: " + child->xmltag()
1364 	  + "(" + child->id() + ")"
1365 	  + " with value\n'" + TiCC::UnicodeToUTF8(s2)
1366 	  + "'\n with parent: " + parent->xmltag() +
1367 	  + "(" + parent->id() + ") which already has "
1368 	  + "text in that class and value: \n'"
1369 	  + TiCC::UnicodeToUTF8(s1) + "'\n";
1370         if ( warn_only ) {
1371 	  msg += "However, according to the older rules (<v2.4.1) the text is consistent. So we are treating this as a warning rather than an error. We do recommend fixing this if this is a document you intend to publish.\n";
1372 	  cerr << "WARNING: inconsistent text: " << msg << endl;
1373 	  parent->doc()->increment_warn_count();
1374         }
1375 	else {
1376 	  throw InconsistentText(msg);
1377         }
1378       }
1379     }
1380   }
1381 
check_append_text_consistency(const FoliaElement * child) const1382   void AbstractElement::check_append_text_consistency( const FoliaElement *child ) const {
1383     /// check the text consistency of a new child against the Element.
1384     /*!
1385      * \param child the new child
1386      *
1387      * When a document is available AND it has the checktext() property
1388      * the text of the child is checked against the text of the parent.
1389      *
1390      * will throw on error.
1391      *
1392      * For Word, String and TextContent children, we assume that their text is
1393      * embedded in the parents text.
1394      *
1395      * For all other cases, the text of the child should match the parents text.
1396      * \note Matching is opaque to spaces, newlines and tabs
1397      */
1398     //    cerr << "VOOR checkappend I am=" << this << endl;
1399     //    cerr << "VOOR checkappend child=" << child << endl;
1400     if ( !doc() || !doc()->checktext() || doc()->fixtext() ){
1401       return;
1402     }
1403     string cls = child->cls();
1404     //    cerr << "HIER 2 " << cls << endl;
1405     if ( child->size() == 0
1406 	 || ( child->is_textcontainer()
1407 	      && !child->hastext( cls ) ) ){
1408       // no use to proceed. not adding real text
1409       return;
1410     }
1411     //    cerr << "HIER 3 " << endl;
1412     const FoliaElement *parent = 0;
1413     if ( child->is_textcontainer() ){
1414       parent = this->parent();
1415     }
1416     else {
1417       parent = this;
1418       cls = child->index(0)->cls();
1419     }
1420     //    cerr << "PARENT? " << parent << endl;
1421     CheckText( parent, child, cls );
1422   }
1423 
check_text_consistency(bool trim_spaces) const1424   void AbstractElement::check_text_consistency( bool trim_spaces ) const {
1425     /// check the text consistency of the combined text of the children
1426     /// against the text of the Element.
1427     /*!
1428      * When a document is available AND it has the checktext() property
1429      * the combined text of ALL the children is checked against the text of
1430      * the parent.
1431      *
1432      * will throw on error
1433      *
1434      * For Word and String children, we only assume that their text is
1435      * embedded in the parents text.
1436      *
1437      * For all other cases, the text should exactly match the parents text.
1438      * \note Matching is opaque to spaces, newlines and tabs
1439      */
1440     if ( !doc() || !doc()->checktext() || !printable() ){
1441       return;
1442     }
1443 
1444     string cls = this->cls();
1445     FoliaElement *parent = this->parent();
1446     CheckText2( parent, this, cls, trim_spaces );
1447   }
1448 
check_text_consistency_while_parsing(bool trim_spaces,bool debug)1449   void AbstractElement::check_text_consistency_while_parsing( bool trim_spaces,
1450 							      bool debug ) {
1451       // this block was moved from parseXml into a separate function
1452       // it remains to be seen how much overlaps with check_text_consistency()
1453       // and whether we can't make do with one function
1454       //
1455       // unlike the other function, this does do some fixing when requested
1456       //
1457 
1458     if ( debug ){
1459       cerr << "DEBUG: BEGIN check_text_consistency_while_parsing("
1460 	   << trim_spaces << ")" << endl;
1461     }
1462     vector<TextContent*> tv = select<TextContent>( false );
1463     // first see which text classes are present
1464     set<string> classes;
1465     for ( const auto& it : tv ){
1466       classes.insert( it->cls() );
1467     }
1468     // check the text for every text class
1469     for ( const auto& st : classes ){
1470       UnicodeString s1, s2;
1471       TextPolicy tp( st );
1472       tp.set_correction_handling(CORRECTION_HANDLING::EITHER);
1473       tp.set( TEXT_FLAGS::STRICT );
1474       tp.set_debug( debug );
1475       if ( !trim_spaces ) {
1476 	tp.set( TEXT_FLAGS::NO_TRIM_SPACES );
1477       }
1478       try {
1479 	s1 = text( tp );  // no retain tokenization, strict
1480       }
1481       catch (...){
1482       }
1483       if ( !s1.isEmpty() ){
1484 	//	  cerr << "S1: " << s1 << endl;
1485 	tp.clear( TEXT_FLAGS::STRICT );
1486 	try {
1487 	  s2 = text( tp ); // no retain tokenization, no strict
1488 	}
1489 	catch (...){
1490 	}
1491 	//	  cerr << "S2: " << s2 << endl;
1492 	s1 = normalize_spaces( s1 );
1493 	s2 = normalize_spaces( s2 );
1494 	if ( !s2.isEmpty() && s1 != s2 ){
1495 	  if ( doc()->fixtext() ){
1496 	    //	      cerr << "FIX: " << s1 << "==>" << s2 << endl;
1497 	    KWargs args;
1498 	    args["value"] = TiCC::UnicodeToUTF8(s2);
1499 	    args["class"] = st;
1500 	    TextContent *node = new TextContent( args, doc() );
1501 	    this->replace( node );
1502 	  }
1503 	  else {
1504 	    bool warn_only = false;
1505 	    if ( trim_spaces ) {
1506 	      //ok, we failed according to the >v2.4.1 rules
1507 	      //but do we also fail under the old rules?
1508 	      try {
1509 		if ( debug ){
1510 		  cerr << "DEBUG: (testing according to older rules now)" << endl;
1511 		}
1512 		this->check_text_consistency_while_parsing(false);
1513 		warn_only = true;
1514 	      }
1515 	      catch ( const InconsistentText& e ) {
1516 		if ( debug ){
1517 		  cerr << "(tested according to older rules (<v2.4.1) as well, but this failed too)" << endl;
1518 		}
1519 		//ignore, we raise the newer error
1520 	      }
1521 	    }
1522 	    string msg = "node " + xmltag() + "(" + id()
1523 	      + ") has a mismatch for the text in set:" + st
1524 	      + "\nthe element text ='" + TiCC::UnicodeToUTF8(s1)
1525 	      + "'\n" + " the deeper text ='" + TiCC::UnicodeToUTF8(s2) + "'";
1526 	    if ( warn_only ) {
1527 	      msg += "\nHOWEVER, according to the older rules (<v2.4.1) the text is consistent. So we are treating this as a warning rather than an error. We do recommend fixing this if this is a document you intend to publish.\n";
1528 	      cerr << "WARNING: inconsistent text: " << msg << endl;
1529 	      doc()->increment_warn_count();
1530 	    }
1531 	    else {
1532 	      if ( debug ){
1533 		cerr << "DEBUG: CONSISTENCYERROR check_text_consistency_while_parsing(" << trim_spaces << ")" << endl;
1534 	      }
1535 	      throw InconsistentText(msg);
1536 	    }
1537 	  }
1538 	}
1539       }
1540     }
1541     if ( debug ){
1542       cerr << "DEBUG: END-OK check_text_consistency_while_parsing("
1543 	   << trim_spaces << ")" << endl;
1544     }
1545   }
1546 
xml(bool recursive,bool kanon) const1547   xmlNode *AbstractElement::xml( bool recursive, bool kanon ) const {
1548     /// convert an Element to an xmlNode
1549     /*!
1550      * \param recursive Convert the children too, creating a xmlNode tree
1551      * \param kanon Output in a canonical form to make comparions easy
1552      * \return am xmlNode object(-tree)
1553      */
1554     xmlNode *e = XmlNewNode( foliaNs(), xmltag() );
1555     KWargs attribs = collectAttributes();
1556     if ( _preserve_spaces == SPACE_FLAGS::PRESERVE ){
1557       // we carry an 'xml:space="preserve" flag?
1558       if ( doc()->preserve_spaces() ){
1559 	// if our ancestor did also, clear it here
1560 	attribs.extract( "xml:space" );
1561       }
1562       else {
1563 	// otherwise leave it, and notify our document
1564 	doc()->set_preserve_spaces(true);
1565       }
1566     }
1567     else if ( doc()->preserve_spaces() ){
1568       // this subtree should go back to "default" then
1569       attribs["xml:space"] = "default";
1570       // and the doc needs to know it
1571       doc()->set_preserve_spaces(false);
1572     }
1573     set<FoliaElement *> attribute_elements;
1574     // nodes that can be represented as attributes are converted to atributes
1575     // and excluded of 'normal' output.
1576 
1577     if ( !doc()->has_explicit() ){
1578       map<string,int> af_map;
1579       // first we search al features that can be serialized to an attribute
1580       // and count them!
1581       for ( const auto& el : _data ) {
1582 	string at = tagToAtt( el );
1583 	if ( !at.empty() ) {
1584 	  ++af_map[at];
1585 	}
1586       }
1587       // ok, now we create attributes for those that only occur once
1588       for ( const auto& el : _data ) {
1589 	string at = tagToAtt( el );
1590 	if ( !at.empty() && af_map[at] == 1 ) {
1591 	  attribs[at] = el->cls();
1592 	  attribute_elements.insert( el );
1593 	}
1594       }
1595     }
1596     addAttributes( e, attribs );
1597     if ( _data.empty() ){
1598       return e; // we are done
1599     }
1600     if ( recursive ) {
1601       // append children:
1602       // we want make sure that text elements are in the right order,
1603       // in front and the 'current' class first
1604       list<FoliaElement *> currenttextelements;
1605       list<FoliaElement *> textelements;
1606       list<FoliaElement *> otherelements;
1607       list<FoliaElement *> commentelements;
1608       multimap<ElementType, FoliaElement *, std::greater<ElementType>> otherelementsMap;
1609       for ( const auto& el : _data ) {
1610 	if ( attribute_elements.find(el) == attribute_elements.end() ) {
1611 	  if ( el->isinstance(TextContent_t) ) {
1612 	    if ( el->cls() == "current" ) {
1613 	      currenttextelements.push_back( el );
1614 	    }
1615 	    else {
1616 	      textelements.push_back( el );
1617 	    }
1618 	  }
1619 	  else {
1620 	    if ( kanon ) {
1621 	      otherelementsMap.insert( make_pair( el->element_id(), el ) );
1622 	    }
1623 	    else {
1624 	      if ( el->isinstance(XmlComment_t)
1625 		   && currenttextelements.empty()
1626 		   && textelements.empty() ) {
1627 		commentelements.push_back( el );
1628 	      }
1629 	      else {
1630 		otherelements.push_back( el );
1631 	      }
1632 	    }
1633 	  }
1634 	}
1635       }
1636       for ( const auto& cel : commentelements ) {
1637 	xmlAddChild( e, cel->xml( recursive, kanon ) );
1638       }
1639       for ( const auto& tel : currenttextelements ) {
1640 	xmlAddChild( e, tel->xml( recursive, false ) );
1641 	// don't change the internal sequences of TextContent elements
1642       }
1643       for ( const auto& tel : textelements ) {
1644 	xmlAddChild( e, tel->xml( recursive, false ) );
1645 	// don't change the internal sequences of TextContent elements
1646       }
1647       if ( !kanon ) {
1648 	for ( const auto& oem : otherelements ) {
1649 	  xmlAddChild( e, oem->xml( recursive, kanon ) );
1650 	}
1651       }
1652       else {
1653 	for ( const auto& oem : otherelementsMap ) {
1654 	  xmlAddChild( e, oem.second->xml( recursive, kanon ) );
1655 	}
1656       }
1657       check_text_consistency();
1658     }
1659     return e;
1660   }
1661 
str(const string & cls) const1662   const string AbstractElement::str( const string& cls ) const {
1663     /// return the text value of this element
1664     /*!
1665      * \param cls The desired textclass
1666      * \return the string value (UTF8 encoded)
1667      *
1668      * if this is a TextContent or it may contain TextContent
1669      * then return the associated text()
1670      *
1671      * if this is a PhonContent or it may contain PhonContent
1672      * then return the associated phon()
1673      *
1674      * otherwise return the empty string
1675      */
1676     UnicodeString us;
1677     try {
1678       us = text(cls);
1679     }
1680     catch( const NoSuchText& ){
1681       try {
1682 	us = phon(cls);
1683       }
1684       catch( const NoSuchPhon&){
1685 	// No TextContent or Phone is allowed
1686       }
1687     }
1688     return TiCC::UnicodeToUTF8( us );
1689   }
1690 
str(const TextPolicy & tp) const1691   const string AbstractElement::str( const TextPolicy& tp ) const {
1692     /// return the text value of this element
1693     /*!
1694      * \param tp the TextPolicy to use
1695      * \return the string value (UTF8 encoded)
1696      *
1697      * if this is a TextContent or it may contain TextContent
1698      * then return the associated text()
1699      *
1700      * if this is a PhonContent or it may contain PhonContent
1701      * then return the associated phon()
1702      *
1703      * otherwise return the empty string
1704      */
1705     UnicodeString us;
1706     try {
1707       us = text( tp );
1708     }
1709     catch( const NoSuchText& ){
1710       try {
1711 	us = phon( tp );
1712       }
1713       catch( const NoSuchPhon&){
1714 	// No TextContent or Phone is allowed
1715       }
1716     }
1717     return TiCC::UnicodeToUTF8( us );
1718   }
1719 
speech_src() const1720   const string AbstractElement::speech_src() const {
1721     /// give the value of the _scr of an element
1722     /*!
1723      * return a (possibly empty) string.
1724      *
1725      * This function recurses upward to the first element which carries _src
1726      */
1727     if ( !_src.empty() ) {
1728       return _src;
1729     }
1730     if ( _parent ) {
1731       return _parent->speech_src();
1732     }
1733     return "";
1734   }
1735 
speech_speaker() const1736   const string AbstractElement::speech_speaker() const {
1737     /// give the value of the _speaker of an element
1738     /*!
1739      * return a (possibly empty) string.
1740      *
1741      * This function recurses upward to the first element which carries _speaker
1742      */
1743     if ( !_speaker.empty() ) {
1744       return _speaker;
1745     }
1746     if ( _parent ) {
1747       return _parent->speech_speaker();
1748     }
1749     return "";
1750   }
1751 
language(const string & st) const1752   const string AbstractElement::language( const string& st ) const {
1753     /// give the language value of an element
1754     /*!
1755      * \param st the setname to us for searching
1756      * The search will start at the object, and recurse upward until
1757      * the document level, where it will return the Documents language
1758      * Might return "" when no match is found
1759      */
1760     set<ElementType> exclude;
1761     vector<LangAnnotation*> v = select<LangAnnotation>( st, exclude, false );
1762     if ( v.size() > 0 ){
1763       return v[0]->cls();
1764     }
1765     else if ( _parent ){
1766       return _parent->language( st );
1767     }
1768     else {
1769       return doc()->language();
1770     }
1771   }
1772 
hastext(const string & cls) const1773   bool FoliaElement::hastext( const string& cls ) const {
1774     /// check if the element has a TextContent with class 'cls'
1775     /*!
1776      * \param cls The desired textclass
1777      * \return true if there is a TextContent available. Otherwise false
1778      */
1779     try {
1780       this->text_content(cls);
1781       return true;
1782     } catch ( const NoSuchText& e ) {
1783       return false;
1784     }
1785   }
1786 
hasphon(const string & cls) const1787   bool FoliaElement::hasphon( const string& cls ) const {
1788     /// check if the element has a PhonContent with class 'cls'
1789     /*!
1790      * \param cls The desired textclass
1791      * \return true if there is a PhonContent available. Otherwise false
1792      */
1793     try {
1794       this->phon_content(cls);
1795       return true;
1796     } catch ( const NoSuchPhon& e ) {
1797       return false;
1798     }
1799   }
1800 
get_delimiter(const TextPolicy & tp) const1801   const string& AbstractElement::get_delimiter( const TextPolicy& tp ) const {
1802     /// get the default delimiter of this object.
1803     /*!
1804      * \param tp the TextPolicy to use
1805      * \return a string representing the delimiter
1806      *
1807      * If the object has a TEXTDELIMITER property thats is returned
1808      * Otherwise, the last child is taken and its delimiter is returned IF
1809      * it is a Structure Element.
1810      * When this test fails, an empty string is returned, UNLESS the element has
1811      * the SPACE attribute AND retaintok is specified
1812      */
1813     bool retaintok  = tp.is_set( TEXT_FLAGS::RETAIN );
1814     if ( tp.debug() ){
1815       cerr << "IN <" << xmltag() << ">:get_delimiter (" << retaintok << ")"
1816 	   << endl;
1817     }
1818     if ( (SPACE & optional_attributes()) ){
1819       if ( ! ( _space || retaintok ) ){
1820 	if ( tp.debug() ){
1821 	  cerr << " space = NO, return: '" << EMPTY_STRING << "'" << endl;
1822 	}
1823 	return EMPTY_STRING;
1824       }
1825     }
1826 
1827     if ( !_data.empty() ){
1828       FoliaElement *last = _data.back();
1829       if ( last &&
1830 	   last->isSubClass(AbstractStructureElement_t)
1831 	   && !last->space() ){
1832 	return EMPTY_STRING;
1833       }
1834     }
1835     if ( text_delimiter() != "NONE" ) {
1836       return text_delimiter();
1837     }
1838     else if ( _data.size() > 0 ) {
1839       // attempt to get a delimiter from the last child
1840       FoliaElement *last = _data.back();
1841       if ( last->isSubClass(AbstractStructureElement_t) ){
1842 	const string& det = last->get_delimiter( tp );
1843 	if ( tp.debug() ){
1844 	  cerr << "out <" << xmltag() << ">:get_delimiter ==> '" << det << "'"
1845 	       << endl;
1846 	}
1847 	return det;
1848       }
1849     }
1850     if ( tp.debug() ){
1851       cerr << "out <" << xmltag() << ">:get_delimiter ==> ''" << endl;
1852     }
1853     return EMPTY_STRING;
1854   }
1855 
bool is_space( const UChar32 kar ){
  /// test if a codepoint is one of the four basic whitespace characters
  /*!
   * \param kar the codepoint to examine
   * \return true for a space, tab, newline or carriage return. false for
   * every other value
   */
  switch ( kar ){
  case 0x0020: // space
  case 0x0009: // tab
  case 0x000a: // newline
  case 0x000d: // carriage return
    return true;
  default:
    return false;
  }
}
1862 
text_container_text(const TextPolicy & tp) const1863   UnicodeString AbstractElement::text_container_text( const TextPolicy& tp ) const {
1864     string desired_class = tp.get_class();
1865     if ( isinstance( TextContent_t )
1866 	 && cls() != desired_class ) {
1867       // take a shortcut for TextContent in wrong class
1868       if ( tp.debug() ){
1869 	cerr << "TextContent shortcut, class=" << cls()
1870 	     << " but looking for: " << desired_class << endl;
1871       }
1872       return "";
1873     }
1874     UnicodeString result;
1875     bool pendingspace = false;
1876     bool trim_spaces = !tp.is_set( TEXT_FLAGS::NO_TRIM_SPACES);
1877     for ( const auto& d : _data ){
1878       if (d->isinstance( XmlText_t)) {
1879 	// 'true' text child
1880 	if (pendingspace) {
1881 	  result += " ";
1882 	  pendingspace = false;
1883 	}
1884 	if ( trim_spaces) {
1885 	  //This implements https://github.com/proycon/folia/issues/88
1886 	  //FoLiA >= v2.5 behaviour (introduced earlier in v2.4.1 but modified thereafter)
1887 	  const int l = result.length();
1888 	  UnicodeString text = d->text( tp );
1889 	  int begin = 0;
1890 	  int linenr = 0;
1891 	  for ( int i = 0; i < text.length(); ++i ) {
1892 	    if ( text[i] == 0x000a
1893 		 || (i == text.length() - 1) ) {
1894 	      //newline or end
1895 	      UnicodeString line;
1896 	      if ( text[i] == 0x000a ) { //newline
1897 		line = UnicodeString(text, begin, i - begin);
1898 	      }
1899 	      else {
1900 		line = UnicodeString(text, begin, text.length() - begin);
1901 	      }
1902 	      begin = i+1;
1903 
1904 	      UnicodeString subresult;
1905 	      if ( _preserve_spaces == SPACE_FLAGS::PRESERVE) {
1906 		if ( line.length() > 0
1907 		     && line[line.length() - 1] == 0x000d) {
1908 		  //carriage return
1909 		  //remove artefacts of any DOS-style line endings (not sure if still
1910 		  //needed here but better safe than sorry)
1911 		  line = UnicodeString(line, 0, line.length() - 1);
1912 		}
1913 		subresult = line;
1914 	      }
1915 	      else {
1916 		subresult = normalize_spaces(trim_space(line));
1917 	      }
1918 
1919 	      if ( (linenr > 0)
1920 		   && (subresult.length() > 0)
1921 		   && (result.length() != l) ) {
1922 		//insert spaces between lines that used to be newline separated
1923 		result.append((UChar32) 0x0020);
1924 	      }
1925 	      else if ( (subresult.length() > 0)
1926 			&& (line.length() > 0)
1927 			&& ( is_space(line[0]) )
1928 			&& this->_preserve_spaces != SPACE_FLAGS::PRESERVE ) {
1929 		//we have leading indentation we may need to collapse or ignore entirely
1930 		//we can't be sure yet what to do so we add a temporary placeholder \1
1931 		//this will later be handled in postprocess_spaces() (converts to a space only if no space preceeds it)
1932 		result.append(0x0001);
1933 	      }
1934 	      result += subresult;
1935 	      linenr++;
1936 	    }
1937 	  }
1938 
1939 	  if ( this->_preserve_spaces != SPACE_FLAGS::PRESERVE
1940 	       && text.length() > 0
1941 	       && result.length() > 0
1942 	       && is_space(text[text.length() - 1])
1943 	       && !is_space(result[result.length() - 1]) ){
1944 	    //this item has trailing spaces but we stripped them
1945 	    //this may be premature so
1946 	    //we reserve to output them later in case there is a next item
1947 	    pendingspace = true;
1948 	  }
1949 	}
1950 	else {
1951 	  //old FoLiA <= v2.4.1 behaviour, we don't trim anything
1952 	  result += d->text( tp );
1953 	}
1954       }
1955       else if ( d->printable() ){
1956 	// this is some TextMarkup I hope
1957 	if (pendingspace) {
1958 	  if (!d->implicitspace()) result += " ";
1959 	  pendingspace = false;
1960 	}
1961 	string tv = d->tag();
1962 	if ( !tv.empty() ){
1963 	  vector<string> tvv = TiCC::split(tv);
1964 	  bool no_match = true;
1965 	  for ( const auto& v : tvv ){
1966 	    TextPolicy::tag_handler match = tp.get_handler( v );
1967 	    if ( match ){
1968 	      no_match = false;
1969 	      UnicodeString tmp_result = match( d, tp );
1970 	      result += tmp_result;
1971 	    }
1972 	  }
1973 	  if ( no_match ){
1974 	    result += d->text( tp );
1975 	  }
1976 	}
1977 	else {
1978 	  result += d->text( tp );
1979 	}
1980 	if ( !result.isEmpty() ){
1981 	  const string& delim = d->get_delimiter( tp );
1982 	  if ( tp.debug() ){
1983 	    cerr << "append delimiter: '" << delim << "'" << endl;
1984 	  }
1985 	  result += TiCC::UnicodeFromUTF8(delim);
1986 	}
1987       }
1988       else {
1989 	// non interesting stuff like <feature>, <comment> etc.
1990       }
1991     }
1992     if (trim_spaces && this->spaces_flag() != SPACE_FLAGS::PRESERVE) {
1993       result = postprocess_spaces(result);
1994     }
1995     if ( tp.debug() ){
1996       cerr << "TEXT(" << tp.get_class() << ") on a textcontainer :" << xmltag()
1997 	   << " returned '" << result << "'" << endl;
1998     }
1999     return result;
2000   }
2001 
private_text(const TextPolicy & tp) const2002   const UnicodeString AbstractElement::private_text( const TextPolicy& tp ) const {
2003     /// get the UnicodeString value of an element
2004     /*!
2005      * \param tp The TextPolicy to use
2006      * \return the Unicode String representation found. Throws when
2007      * no text can be found
2008      */
2009     bool strict = tp.is_set( TEXT_FLAGS::STRICT );
2010     bool show_hidden = tp.is_set( TEXT_FLAGS::HIDDEN );
2011     bool trim = !tp.is_set( TEXT_FLAGS::NO_TRIM_SPACES );
2012     if ( tp.debug() ){
2013       cerr << "TEXT(" << tp.get_class() << ") on node : " << xmltag() << " id="
2014 	   << id() << endl;
2015       cerr << "TextPolicy: " << tp << endl;
2016     }
2017     if ( strict ) {
2018       /// WARNING. Don't call text(tp) here. We will get into an infinite
2019       /// recursion. Can't we do better then calling ourself again, sort of?
2020       TextPolicy tmp = tp;
2021       tmp.clear( TEXT_FLAGS::STRICT );
2022       return text_content(tmp)->text( tmp );
2023     }
2024     else if ( !printable() || ( hidden() && !show_hidden ) ){
2025       throw NoSuchText( "NON printable element: " + xmltag() );
2026     }
2027     else if ( is_textcontainer() ){
2028       return text_container_text( tp );
2029     }
2030     else {
2031       //
2032       UnicodeString result = deeptext( tp );
2033       if ( result.isEmpty() ) {
2034 	TextPolicy tmp = tp;
2035 	tmp.set( TEXT_FLAGS::STRICT );
2036 	if ( !trim ) {
2037 	  tmp.set( TEXT_FLAGS::NO_TRIM_SPACES );
2038 	}
2039 	result = text( tmp );
2040       }
2041       if ( result.isEmpty() ) {
2042 	throw NoSuchText( "on tag " + xmltag() + " nor it's children" );
2043       }
2044       return result;
2045     }
2046   }
2047 
text(const TextPolicy & tp) const2048   const UnicodeString AbstractElement::text( const TextPolicy& tp ) const {
2049     /// get the UnicodeString text value of an element
2050     /*!
2051      * \param tp a TextPolicy
2052      */
2053     if ( tp.debug() ){
2054       cerr << "DEBUG <" << xmltag() << ">.text() Policy=" << tp << endl;
2055     }
2056     return private_text( tp );
2057   }
2058 
text(const string & cls,TEXT_FLAGS flags,bool debug) const2059   const UnicodeString AbstractElement::text( const string& cls,
2060 					     TEXT_FLAGS flags,
2061 					     bool debug ) const {
2062     /// get the UnicodeString text value of an element
2063     /*!
2064      * \param cls the textclass the text should be in
2065      * \param flags the search parameters to use. See TEXT_FLAGS.
2066      * \param debug enables debugging when true
2067      */
2068     TextPolicy tp( cls, flags );
2069     tp.set_debug( debug );
2070     if ( debug ){
2071       cerr << "DEBUG <" << xmltag() << ">.text() Policy=" << tp << endl;
2072     }
2073     return private_text( tp );
2074   }
2075 
setAttributes(KWargs & kwargs)2076   void FoLiA::setAttributes( KWargs& kwargs ){
2077     /// set the attributes of a FoLiA top node
2078     /*!
2079      * \param kwargs an attribute-value list
2080      * the FoLiA top is special, as it may accept special attributes
2081      * which are stored in the associated document, and NOT in the node
2082      */
2083     // we store some attributes in the document itself
2084     doc()->setDocumentProps( kwargs );
2085     // use remaining attributes for the FoLiA node
2086     // probably only the ID
2087     AbstractElement::setAttributes( kwargs );
2088   }
2089 
parseXml(const xmlNode * node)2090   FoliaElement* FoLiA::parseXml( const xmlNode *node ){
2091     ///
2092     /// recursively parse a complete FoLiA tree
2093     /// \param node an xmlNode that MUST be a FoLiA root node
2094     /// \return the parsed tree. Throws on error.
2095     /*!
2096      * the topnode is special, as it also carries the main document properties
2097      *
2098      */
2099     KWargs atts = getAttributes( node );
2100     if ( !doc() ){
2101       throw XmlError( "FoLiA root without Document" );
2102     }
2103     setAttributes( atts );
2104     bool meta_found = false;
2105     xmlNode *p = node->children;
2106     while ( p ){
2107       if ( p->type == XML_ELEMENT_NODE ){
2108 	if ( TiCC::Name(p) == "metadata" &&
2109 	     checkNS( p, NSFOLIA ) ){
2110 	  if ( doc()->debug > 1 ){
2111 	    cerr << "Found metadata" << endl;
2112 	  }
2113 	  doc()->parse_metadata( p );
2114 	  meta_found = true;
2115 	}
2116 	else if ( p && TiCC::getNS(p) == NSFOLIA ){
2117 	  string tag = TiCC::Name( p );
2118 	  if ( !meta_found  && !doc()->version_below(1,6) ){
2119 	    throw XmlError( "Expecting element metadata, got '" + tag + "'" );
2120 	  }
2121 	  FoliaElement *t = AbstractElement::createElement( tag, doc() );
2122 	  if ( t ){
2123 	    if ( doc()->debug > 2 ){
2124 	      cerr << "created " << t << endl;
2125 	    }
2126 	    t = t->parseXml( p );
2127 	    if ( t ){
2128 	      if ( doc()->debug > 2 ){
2129 		cerr << "extend " << this << " met " << tag << endl;
2130 	      }
2131 	      this->append( t );
2132 	    }
2133 	  }
2134 	}
2135       }
2136       else if ( p->type == XML_TEXT_NODE ){
2137 	// This MUST be 'empty space', so only spaces and tabs formatting
2138 	string txt = TextValue(p);
2139 	txt = TiCC::trim(txt);
2140 	if ( !txt.empty() ){
2141 	  if ( p->prev ){
2142 	    string tg = "<" + Name(p->prev) + ">";
2143 	    throw XmlError( "found extra text '" + txt + "' after element "
2144 			    + tg + ", NOT allowed there." );
2145 	  }
2146 	  else {
2147 	    string tg = "<" + Name(p->parent) + ">";
2148 	    throw XmlError( "found extra text '" + txt + "' inside element "
2149 			    + tg + ", NOT allowed there." );
2150 	  }
2151 	}
2152       }
2153       p = p->next;
2154     }
2155     return this;
2156   }
2157 
UnicodeString trim_space( const UnicodeString& in ){
  /// remove leading and trailing spaces. KEEP newlines etc.
  /*!
   * \param in an untrimmed UnicodeString
   * \return an UnicodeString with all leading and trailing spaces (U+0020)
   * removed. An empty string when in holds nothing but spaces.
   * Other 'whitespace' characters like newline and tab are retained!
   */
  const char16_t space = 0x0020;
  const int len = in.length();
  // index of the first character that is not a space
  int first = 0;
  while ( first < len && in[first] == space ){
    ++first;
  }
  // index of the last character that is not a space, -1 when there is none
  int last = len - 1;
  while ( last >= 0 && in[last] == space ){
    --last;
  }
  if ( last < first ){
    // empty, or spaces only
    return UnicodeString();
  }
  return UnicodeString( in, first, last - first + 1 );
}
2192 
UnicodeString postprocess_spaces( const UnicodeString& in ){
  /// Postprocessing for spaces: resolve the temporary \1 placeholders
  /*!
   * \param in the UnicodeString to process
   * \return a copy of in where every \1 is replaced by a space when the
   * character in front of it (in the input) is not whitespace, and is
   * removed otherwise. A \1 at the very start is removed too.
   */
  const char16_t placeholder = 0x0001;
  const int len = in.length();
  // look for the first placeholder. Without one there is nothing to do
  int first_mark = 0;
  while ( first_mark < len && in[first_mark] != placeholder ){
    ++first_mark;
  }
  if ( first_mark == len ){
    return in;
  }
  UnicodeString result;
  for ( int i = 0; i < len; ++i ){
    if ( in[i] != placeholder ){
      result.append( in[i] );
      continue;
    }
    if ( i > 0
         && !is_space( in[i-1] ) ){
      // no whitespace in front: the placeholder becomes a real space
      result.append( static_cast<UChar32>( 0x0020 ) );
    }
    // otherwise the placeholder simply disappears
  }
  return result;
}
2223 
check_end(const UnicodeString & us,bool & only)2224   bool check_end( const UnicodeString& us, bool& only ){
2225     /// check for newline characters at the end
2226     /*!
2227      * \param us the UnicodeString to check for '\n'
2228      * \param only set to true if the whole string consists of only '\n'
2229      * \return true when at least 1 '\n' is found at the end.
2230      */
2231     only = false;
2232     string tmp = TiCC::UnicodeToUTF8( us );
2233     int j = tmp.length()-1;
2234     size_t found_nl = 0;
2235     for ( ; j >=0; --j ){
2236       if ( tmp[j] == '\n' ){
2237 	++found_nl;
2238       }
2239       else {
2240 	break;
2241       }
2242     }
2243     only = found_nl == tmp.length();
2244     return found_nl > 0;
2245   }
2246 
no_space_at_end(FoliaElement * s)2247   bool no_space_at_end( FoliaElement *s ){
2248     /// given a FoliaElement check if the last Word in it has space()
2249     /*!
2250      * \param s a FoliaElement
2251      * \return true if the element contains Word children and the last
2252      * one has space()
2253      */
2254     bool result = false;
2255     //    cerr << "no space? s: " << s << endl;
2256     if ( s ){
2257       vector<Word*> words = s->select<Word>(false);
2258       if ( !words.empty() ){
2259 	Word *last = words.back();
2260 	//	cerr << "no space? last: " << last << endl;
2261 	return !last->space();
2262       }
2263     }
2264     return result;
2265   }
2266 
deeptext(const TextPolicy & tp) const2267   const UnicodeString AbstractElement::deeptext( const TextPolicy& tp ) const {
2268     /// get the UnicodeString text value of underlying elements
2269     /*!
2270      * \param tp the TextPolicy to use
2271      * \return The Unicode Text found.
2272      * Will throw on error.
2273      */
2274     if ( tp.debug() ){
2275       cerr << "deeptext, policy: " << tp << ", on node : " << xmltag() << " id=" << id() << ", cls=" << this->cls() << ")" << endl;
2276       cerr << "deeptext: node has " << _data.size() << " children." << endl;
2277     }
2278     vector<UnicodeString> parts;
2279     vector<UnicodeString> seps;
2280     for ( const auto& child : data() ) {
2281       // try to get text dynamically from printable children
2282       // skipping the TextContent elements
2283       if ( tp.debug() ){
2284 	if ( !child->printable() ) {
2285 	  cerr << "deeptext: node[" << child->xmltag() << "] NOT PRINTABLE! "
2286 	       << endl;
2287 	}
2288       }
2289       if ( child->printable()
2290 	   && ( is_structure( child )
2291 		|| child->isSubClass( AbstractSpanAnnotation_t )
2292 		|| child->isinstance( Correction_t ) )
2293 	   && !child->isinstance( TextContent_t ) ) {
2294 	if ( tp.debug() ){
2295 	  cerr << "deeptext:bekijk node[" << child->xmltag() << "]"<< endl;
2296 	}
2297 	try {
2298 	  UnicodeString tmp = child->text( tp );
2299 	  if ( tp.debug() ){
2300 	    cerr << "deeptext found '" << tmp << "'" << endl;
2301 	  }
2302 	  parts.push_back(tmp);
2303 	  if ( child->isinstance( Sentence_t )
2304 	       && no_space_at_end(child) ){
2305 	    const string& delim = "";
2306 	    if ( tp.debug() ){
2307 	      cerr << "deeptext: no delimiter van "<< child->xmltag() << " on"
2308 		   << " last w of s" << endl;
2309 	    }
2310 	    seps.push_back(TiCC::UnicodeFromUTF8(delim));
2311 	  }
2312 	  else {
2313 	    // get the delimiter
2314 	    const string& delim = child->get_delimiter( tp );
2315 	    if ( tp.debug() ){
2316 	      cerr << "deeptext:delimiter van "<< child->xmltag() << " ='"
2317 		   << delim << "'" << endl;
2318 	    }
2319 	    seps.push_back(TiCC::UnicodeFromUTF8(delim));
2320 	  }
2321 	} catch ( const NoSuchText& e ) {
2322 	  if ( tp.debug() ){
2323 	    cerr << "HELAAS" << endl;
2324 	  }
2325 	}
2326       }
2327     }
2328 
2329     // now construct the result;
2330     UnicodeString result;
2331     for ( size_t i=0; i < parts.size(); ++i ) {
2332       if ( tp.debug() ){
2333 	cerr << "part[" << i << "]='" << parts[i] << "'" << endl;
2334 	cerr << "sep[" << i << "]='" << seps[i] << "'" << endl;
2335       }
2336       bool only_nl = false;
2337       bool end_is_nl = check_end( parts[i], only_nl );
2338       if ( end_is_nl ){
2339 	if ( tp.debug() ){
2340 	  cerr << "a newline after: '" << parts[i] << "'" << endl;
2341 	  if ( i < parts.size()-1 ){
2342 	    cerr << "next sep='" << seps[i+1] << "'" << endl;
2343 	  }
2344 	}
2345 
2346 	if ( only_nl ){
2347 	  // only a newline
2348 	  result = trim_space( result );
2349 	  if ( tp.debug() ){
2350 	    cerr << "OK it is only newline(s)" << endl;
2351 	    cerr << "TRIMMED? '" << result << "'" << endl;
2352 	  }
2353 	}
2354       }
2355       result += parts[i];
2356       if ( !end_is_nl && i < parts.size()-1 ){
2357 	result += seps[i];
2358       }
2359       if ( tp.debug() ){
2360 	cerr << "result='" << result << "'" << endl;
2361       }
2362     }
2363     if ( tp.debug() ){
2364       cerr << "deeptext() for " << xmltag() << " step 3 " << endl;
2365     }
2366     if ( result.isEmpty() ) {
2367       // so no deeper text is found. Well, lets look here then
2368       result = text_content(tp)->text( tp );
2369     }
2370     if ( tp.debug() ){
2371       cerr << "deeptext() for " << xmltag() << " result= '" << result << "'"
2372 	   << endl;
2373     }
2374     if ( result.isEmpty() ) {
2375       throw NoSuchText( xmltag() + ":(class=" + tp.get_class() +"): empty!" );
2376     }
2377     return result;
2378   }
2379 
stricttext(const string & cls) const2380   const UnicodeString FoliaElement::stricttext( const string& cls ) const {
2381     /// get the UnicodeString value of TextContent children only
2382     /*!
2383      * \param cls the textclass
2384      * \return The Unicode Text found.
2385      * Will throw on error.
2386      */
2387     TextPolicy tp( cls, TEXT_FLAGS::STRICT );
2388     return this->text( tp );
2389   }
2390 
toktext(const string & cls) const2391   const UnicodeString FoliaElement::toktext( const string& cls ) const {
2392     /// get the UnicodeString value of TextContent children only, retaining
2393     /// tokenization
2394     /*!
2395      * \param cls the textclass
2396      * \return The Unicode Text found.
2397      * Will throw on error.
2398      */
2399     TextPolicy tp( cls, TEXT_FLAGS::RETAIN );
2400     return this->text( tp );
2401   }
2402 
text_content(const TextPolicy & tp) const2403   const TextContent *AbstractElement::text_content( const TextPolicy& tp ) const {
2404     /// Get the TextContent explicitly associated with this element.
2405     /*!
2406      * \param tp the TextPolicy to use
2407      *
2408      * Returns the TextContent instance rather than the actual text.
2409      * (so it might return itself.. ;)
2410      * Does not recurse into children with the sole exception of Correction
2411      * might throw NoSuchText exception if not found.
2412      */
2413 
2414     if ( tp.debug() ){
2415       cerr << "text_content, policy= " << tp << endl;
2416     }
2417     string desired_class = tp.get_class();
2418     if ( isinstance(TextContent_t) ){
2419       if ( tp.debug() ){
2420 	cerr << "A textcontent!!" << endl;
2421       }
2422       if  ( this->cls() == desired_class ) {
2423 	if ( tp.debug() ){
2424 	  cerr << "return myself..." << endl;
2425 	}
2426 	return dynamic_cast<const TextContent*>(this);
2427       }
2428       else {
2429 	throw NoSuchText( "TextContent::text_content(" + desired_class + ")" );
2430       }
2431     }
2432     bool show_hidden = tp.is_set( TEXT_FLAGS::HIDDEN );
2433     if ( tp.debug() ){
2434       cerr << (!printable()?"NOT":"") << " printable: " << xmltag() << endl;
2435       cerr << (!hidden()?"NOT":"") << " hidden: " << xmltag() << endl;
2436     }
2437     if ( !printable() || ( hidden() && !show_hidden ) ) {
2438       throw NoSuchText( "non-printable element: " +  xmltag() );
2439     }
2440     if ( tp.debug() ){
2441       cerr << "recurse into children...." << endl;
2442     }
2443     for ( const auto& el : data() ) {
2444       if ( el->isinstance(TextContent_t) && (el->cls() == desired_class ) ) {
2445 	return dynamic_cast<TextContent*>(el);
2446       }
2447       else if ( el->element_id() == Correction_t) {
2448 	try {
2449 	  return el->text_content( tp );
2450 	} catch ( const NoSuchText& e ) {
2451 	  // continue search for other Corrections or a TextContent
2452 	}
2453       }
2454     }
2455     throw NoSuchText( xmltag() + "::text_content(" + desired_class + ")" );
2456   }
2457 
text_content(const string & cls,bool debug) const2458   const TextContent *AbstractElement::text_content( const string& cls,
2459 						    bool debug ) const {
2460     /// Get the TextContent explicitly associated with this element.
2461     /*!
2462      * \param cls the textclass to search for
2463      * \param debug enables debugging when true
2464      *
2465      * Returns the TextContent instance rather than the actual text.
2466      * (so it might return itself.. ;)
2467      * Does not recurse into children with the sole exception of Correction
2468      * might throw NoSuchText exception if not found.
2469      */
2470     TextPolicy tp( cls );
2471     tp.set_debug( debug );
2472     return text_content( tp );
2473   }
2474 
phon_content(const TextPolicy & tp) const2475   const PhonContent *AbstractElement::phon_content( const TextPolicy& tp ) const {
2476     /// Get the PhonContent explicitly associated with this element.
2477     /*!
2478      * \param tp the TextPolicy to use
2479      *
2480      * Returns the PhonContent instance rather than the actual text.
2481      * (so it might return iself.. ;)
2482      * Does not recurse into children with the sole exception of Correction
2483      * might throw NoSuchPhon exception if not found.
2484      */
2485     string desired_class = tp.get_class();
2486     if ( isinstance(PhonContent_t) ){
2487       if  ( cls() == desired_class ){
2488 	return dynamic_cast<const PhonContent*>(this);
2489       }
2490       else {
2491 	throw NoSuchPhon( xmltag() + "::phon_content(" + desired_class + ")" );
2492       }
2493     }
2494     bool show_hidden = tp.is_set( TEXT_FLAGS::HIDDEN );
2495     if ( !speakable() || ( hidden() && !show_hidden ) ) {
2496       throw NoSuchPhon( "non-speakable element: " + xmltag() );
2497     }
2498 
2499     for ( const auto& el : _data ) {
2500       if ( el->isinstance(PhonContent_t) && ( el->cls() == desired_class ) ) {
2501 	return dynamic_cast<PhonContent*>(el);
2502       }
2503       else if ( el->element_id() == Correction_t) {
2504 	try {
2505 	  return el->phon_content(tp);
2506 	} catch ( const NoSuchPhon& e ) {
2507 	  // continue search for other Corrections or a TextContent
2508 	}
2509       }
2510     }
2511     throw NoSuchPhon( xmltag() + "::phon_content(" + desired_class + ")" );
2512   }
2513 
phon_content(const string & cls,bool debug) const2514   const PhonContent *AbstractElement::phon_content( const string& cls,
2515 						    bool debug ) const {
2516     /// Get the PhonContent explicitly associated with this element.
2517     /*!
2518      * \param cls the textclass to search for
2519      * \param debug enable debugging when true
2520      *
2521      * Returns the PhonContent instance rather than the actual text.
2522      * (so it might return iself.. ;)
2523      * Does not recurse into children with the sole exception of Correction
2524      * might throw NoSuchPhon exception if not found.
2525      */
2526     TextPolicy tp(cls );
2527     tp.set_debug( debug );
2528     return phon_content( tp );
2529   }
2530 
phon(const TextPolicy & tp) const2531   const UnicodeString AbstractElement::phon( const TextPolicy& tp ) const {
2532     /// get the UnicodeString phon value of an element
2533     /*!
2534      * \param tp the TextPolic to use
2535      */
2536     bool hidden = tp.is_set( TEXT_FLAGS::HIDDEN );
2537     bool strict = tp.is_set( TEXT_FLAGS::STRICT );
2538     if ( tp.debug() ){
2539       cerr << "PHON, Policy= " << tp << " on node : " << xmltag() << " id="
2540 	   << id() << endl;
2541     }
2542     if ( strict ) {
2543       return phon_content(tp)->phon();
2544     }
2545     else if ( !speakable() || ( this->hidden() && !hidden ) ) {
2546       throw NoSuchPhon( "NON speakable element: " + xmltag() );
2547     }
2548     else {
2549       UnicodeString result = deepphon( tp );
2550       if ( result.isEmpty() ) {
2551 	result = phon_content(tp)->phon();
2552       }
2553       if ( result.isEmpty() ) {
2554 	throw NoSuchPhon( "on tag " + xmltag() + " nor it's children" );
2555       }
2556       return result;
2557     }
2558   }
2559 
phon(const string & cls,TEXT_FLAGS flags) const2560   const UnicodeString AbstractElement::phon( const string& cls,
2561 					     TEXT_FLAGS flags ) const {
2562     /// get the UnicodeString phon value of an element
2563     /*!
2564      * \param cls the textclass the text should be in
2565      * \param flags the search parameters to use. See TEXT_FLAGS.
2566      */
2567     TextPolicy tp( cls, flags );
2568     return phon( tp );
2569   }
2570 
deepphon(const TextPolicy & tp) const2571   const UnicodeString AbstractElement::deepphon( const TextPolicy& tp ) const {
2572     /// get the UnicodeString phon value of underlying elements
2573     /*!
2574      * \param tp the TextPolicu to use
2575      * \return The Unicode Text found.
2576      * Will throw on error.
2577      */
2578     if ( tp.debug() ){
2579       cerr << "deepPHON, policy= " << tp << ", on node : " << xmltag()
2580 	   << " id=" << id() << endl;
2581       cerr << "deepphon: node has " << _data.size() << " children." << endl;
2582     }
2583     vector<UnicodeString> parts;
2584     vector<UnicodeString> seps;
2585     for ( const auto& child : _data ) {
2586       // try to get text dynamically from children
2587       // skip PhonContent elements
2588       if ( tp.debug() ){
2589 	if ( !child->speakable() ) {
2590 	  cerr << "deepphon: node[" << child->xmltag() << "] NOT SPEAKABLE! "
2591 	     << endl;
2592 	}
2593       }
2594       if ( child->speakable() && !child->isinstance( PhonContent_t ) ) {
2595 	if ( tp.debug() ){
2596 	  cerr << "deepphon:bekijk node[" << child->xmltag() << "]" << endl;
2597 	}
2598 	try {
2599 	  UnicodeString tmp = child->phon( tp );
2600 	  if ( tp.debug() ){
2601 	    cerr << "deepphon found '" << tmp << "'" << endl;
2602 	  }
2603 	  parts.push_back(tmp);
2604 	  // get the delimiter
2605 	  const string& delim = child->get_delimiter(tp);
2606 	  if ( tp.debug() ){
2607 	    cerr << "deepphon:delimiter van "<< child->xmltag()
2608 		 << " ='" << delim << "'" << endl;
2609 	  }
2610 	  seps.push_back(TiCC::UnicodeFromUTF8(delim));
2611 	} catch ( const NoSuchPhon& e ) {
2612 	  if ( tp.debug() ){
2613 	    cerr << "HELAAS" << endl;
2614 	  }
2615 	}
2616       }
2617     }
2618 
2619     // now construct the result;
2620     UnicodeString result;
2621     for ( size_t i=0; i < parts.size(); ++i ) {
2622       result += parts[i];
2623       if ( i < parts.size()-1 ) {
2624 	result += seps[i];
2625       }
2626     }
2627     if ( tp.debug() ){
2628       cerr << "deepphon() for " << xmltag() << " step 3 " << endl;
2629     }
2630     if ( result.isEmpty() ) {
2631       try {
2632 	result = phon_content(tp)->phon();
2633       }
2634       catch ( ... ) {
2635       }
2636     }
2637     if ( tp.debug() ){
2638       cerr << "deepphontext() for " << xmltag() << " result= '" << result
2639 	   << "'" << endl;
2640     }
2641     if ( result.isEmpty() ) {
2642       throw NoSuchPhon( xmltag() + ":(class=" + tp.get_class() +"): empty!" );
2643     }
2644     return result;
2645   }
2646 
2647 
find_replacables(FoliaElement * par) const2648   vector<FoliaElement *>AbstractElement::find_replacables( FoliaElement *par ) const {
2649     // find all children with the same signature as the parameter
2650     /*!
2651      * \param par the FoliaElement to search
2652      * \return a vector of matching elements
2653      * search in the DIRECT children for nodes with the same tag AND set
2654      * as the element par
2655      */
2656     return par->select( element_id(), sett(), SELECT_FLAGS::LOCAL );
2657   }
2658 
replace(FoliaElement * child)2659   void AbstractElement::replace( FoliaElement *child ) {
2660     /// replace a child element
2661     /*!
2662      * \param child The element to substitute
2663      * This function searches for A child of the same signature (type and set)
2664      * If found, that child is replaced.
2665      * If no such child element exists, this will act the same as append()
2666      */
2667     vector<FoliaElement*> replace = child->find_replacables( this );
2668     if ( replace.empty() ) {
2669       // nothing to replace, simply call append
2670       append( child );
2671     }
2672     else if ( replace.size() > 1 ) {
2673       throw runtime_error( "Unable to replace. Multiple candidates found, unable to choose." );
2674     }
2675     else {
2676       replace[0]->destroy();
2677       append( child );
2678     }
2679   }
2680 
replace(FoliaElement * old,FoliaElement * _new)2681   FoliaElement* AbstractElement::replace( FoliaElement *old,
2682 					  FoliaElement* _new ) {
2683     /// replace in the children old by _new
2684     /*!
2685      * \param old The node to be replacec
2686      * \param _new the new node to add
2687      * \return old
2688      * First old is looked up, if present it is replaced
2689      *
2690      * when not found this function does nothing and returns 0
2691      */
2692     FoliaElement *result = 0;
2693     auto it = find_if( _data.begin(),
2694 		       _data.end(),
2695 		       [&]( FoliaElement *el ){ return el == old; } );
2696     if ( it != _data.end() ){
2697       *it = _new;
2698       result = old;
2699       _new->set_parent(this);
2700     }
2701     return result;
2702   }
2703 
insert_after(FoliaElement * pos,FoliaElement * add)2704   void AbstractElement::insert_after( FoliaElement *pos, FoliaElement *add ){
2705     /// append a node after a certain element
2706     /*!
2707      * \param pos The location after which to insert add
2708      * \param add the element to add
2709      *
2710      * throws when pos is not found
2711      */
2712     auto it = _data.begin();
2713     while ( it != _data.end() ) {
2714       if ( *it == pos ) {
2715 	it = _data.insert( ++it, add );
2716 	break;
2717       }
2718       ++it;
2719     }
2720     if ( it == _data.end() ) {
2721       throw runtime_error( "insert_after(): previous not found" );
2722     }
2723   }
2724 
clear_textcontent(const string & textclass)2725   void FoliaElement::clear_textcontent( const string& textclass ){
2726     for ( size_t i=0; i < size(); ++i ){
2727       FoliaElement *p = index(i);
2728       if ( p->element_id() == TextContent_t ) {
2729 	if ( p->cls() == textclass ){
2730 	  p->destroy();
2731 	  break;
2732 	}
2733       }
2734     }
2735   }
2736 
settext(const string & txt,const string & cls)2737   TextContent *FoliaElement::settext( const string& txt,
2738 				      const string& cls ){
2739     /// append a TextContent child of class txt with value txt
2740     /*!
2741      * \param txt the UTF8 text value
2742      * \param cls the textclass of the new TextContent
2743      * \return the new created TextContent
2744      * may throw on error
2745      *
2746      * when the associated document has the checktext mode, (which is the
2747      * default) both text consistency and the offset are checked.
2748      */
2749     return settext( txt, -1, cls );
2750   }
2751 
setutext(const UnicodeString & txt,const string & cls)2752   TextContent *FoliaElement::setutext( const UnicodeString& txt,
2753 				       const string& cls ){
2754     /// append a TextContent child of class cls with value txt
2755     /*!
2756      * \param txt the Unicode text value
2757      * \param cls the textclass of the new TextContent
2758      * \return the new created TextContent
2759      * may throw on error
2760      *
2761      * when the associated document has the checktext mode, (which is the
2762      * default) both text consistency and the offset are checked.
2763      */
2764     string utf8 = TiCC::UnicodeToUTF8(txt);
2765     return settext( utf8, cls );
2766   }
2767 
settext(const string & txt,int offset,const string & cls)2768   TextContent *FoliaElement::settext( const string& txt,
2769 				      int offset,
2770 				      const string& cls ){
2771     /// append a TextContent child of class cls with value txt
2772     /*!
2773      * \param txt the UTF8 text value
2774      * \param offset offset of the text in the text of the parent,
2775               when offset < 0 it is ignored.
2776      * \param cls the textclass of the new TextContent
2777      * \return the new created TextContent
2778      * may throw on error
2779      *
2780      * when the associated document has the checktext mode, (which is the
2781      * default) both text consistency and the offset are checked.
2782      */
2783     UnicodeString txt_u = TiCC::UnicodeFromUTF8( txt );
2784     if ( doc() && doc()->checktext()
2785 	 && !isSubClass( Morpheme_t ) && !isSubClass( Phoneme_t) ){
2786       UnicodeString deeper_u;
2787       try {
2788 	deeper_u = text( cls );
2789 	// get deep original text: no retain tokenization, no strict
2790       }
2791       catch (...){
2792       }
2793       deeper_u = normalize_spaces( deeper_u );
2794       UnicodeString txt_check_u = normalize_spaces( txt_u );
2795       if ( !deeper_u.isEmpty()
2796 	   && txt_check_u != deeper_u ){
2797 	throw InconsistentText( "settext(cls=" + cls + "): deeper text differs from attempted\ndeeper='" + TiCC::UnicodeToUTF8(deeper_u) + "'\nattempted='" + TiCC::UnicodeToUTF8(txt_u) + "'" );
2798       }
2799     }
2800     KWargs args;
2801     args["value"] = TiCC::UnicodeToUTF8(txt_u);
2802     args["class"] = cls;
2803     if ( offset >= 0 ){
2804       args["offset"] = TiCC::toString(offset);
2805     }
2806     TextContent *node = new TextContent( args, doc() );
2807     replace( node );
2808     return node;
2809   }
2810 
setutext(const UnicodeString & txt,int offset,const string & cls)2811   TextContent *FoliaElement::setutext( const UnicodeString& txt,
2812 				       int offset,
2813 				       const string& cls ){
2814     /// append a TextContent child of class cls with value txt
2815     /*!
2816      * \param txt the Unicode text value
2817      * \param offset offset of the text in the text of the parent,
2818               when offset < 0 it is ignored.
2819      * \param cls the textclass of the new TextContent
2820      * \return the new created TextContent
2821      * may throw on error
2822      *
2823      * when the associated document has the checktext mode, (which is the
2824      * default) both text consistency and the offset are checked.
2825      */
2826     string utf8 = TiCC::UnicodeToUTF8(txt);
2827     return settext( utf8, offset, cls );
2828   }
2829 
description() const2830   const string FoliaElement::description() const {
2831     /// return the string value of the description tag (if present)
2832     /*!
2833      * \return a string
2834      * search for Description nodes in this object.
2835      * When 1 or more are found, return the value of the first one
2836      */
2837     vector<FoliaElement *> v = select( Description_t, SELECT_FLAGS::LOCAL );
2838     if ( v.size() == 0 ) {
2839       return "";
2840     }
2841     return v[0]->description();
2842   }
2843 
acceptable(ElementType t) const2844   bool AbstractElement::acceptable( ElementType t ) const {
2845     /// test if this ElementType is acceptable for the current node
2846     /*!
2847      * \param t the ElementType to test
2848      *
2849      * This function tests if t is in the accepted_data list of the node
2850      * OR if it is a SubClass of one of the accepted types
2851      */
2852 
2853     auto it = accepted_data().find( t );
2854     if ( it == accepted_data().end() ) {
2855       for ( const auto& et : accepted_data() ) {
2856 	if ( folia::isSubClass( t, et ) ) {
2857 	  return true;
2858 	}
2859       }
2860       return false;
2861     }
2862     return true;
2863   }
2864 
addable(const FoliaElement * parent) const2865   bool AbstractElement::addable( const FoliaElement *parent ) const {
2866     /// test if an element might succesfully appended to \em parent
2867     /*!
2868      * \param parent the node to check
2869      * \return true if it doesn't throw
2870      *
2871      * \note It will allways throw an error, instead of returning false
2872      */
2873     if ( !parent->acceptable( element_id() ) ) {
2874       string mess = "Unable to append object of type " + classname()
2875 	+ " to a <" + parent->classname() + ">";
2876       if ( !parent->id().empty() ){
2877 	mess += " (id=" + parent->id() + ")";
2878       }
2879       throw ValueError( mess );
2880     }
2881     if ( occurrences() > 0 ) {
2882       vector<FoliaElement*> v = parent->select( element_id(),
2883 						SELECT_FLAGS::LOCAL );
2884       size_t count = v.size();
2885       if ( count >= occurrences() ) {
2886 	throw DuplicateAnnotationError( "Unable to add another object of type " + classname() + " to " + parent->classname() + ". There are already " + TiCC::toString(count) + " instances of this type, which is the maximum." );
2887       }
2888     }
2889     if ( occurrences_per_set() > 0 &&
2890 	 (CLASS & required_attributes() || setonly() ) ){
2891       vector<FoliaElement*> v = select( element_id(),
2892 					sett(),
2893 					SELECT_FLAGS::LOCAL );
2894       size_t count = v.size();
2895       if ( count >= occurrences_per_set() ) {
2896 	throw DuplicateAnnotationError( "Unable to add another object of type " + classname() + " to " + parent->classname() + ". There are already " + TiCC::toString(count) + " instances of this type and set (" + sett() + "), which is the maximum." );
2897       }
2898     }
2899     if ( _parent &&
2900 	 !( element_id() == WordReference_t
2901 	    || referable() ) ){
2902       throw XmlError( "attempt to reconnect node " + classname() + "("
2903 		      + id()
2904 		      + ") to a " + parent->classname() + " node, id="
2905 		      + parent->id()
2906 		      + ", it was already connected to a "
2907 		      +  parent->classname() + " id=" + parent->id() );
2908     }
2909 #ifdef NOT_WORKING
2910     // this fails. needs attention
2911     if ( c->element_id() == WordReference_t ){
2912       string tval = atts["t"];
2913       if ( !tval.empty() ){
2914       	string tc = ref->textclass();
2915       	string rtval = ref->str(tc);
2916       	if ( tval != rtval ){
2917       	  throw XmlError( "WordReference id=" + id + " has another value for "
2918       			  + "the t attribute than it's reference. ("
2919       			  + tval + " versus " + rtval + ")" );
2920       	}
2921       }
2922     }
2923 #endif
2924     if ( element_id() == TextContent_t
2925 	 && parent->element_id() == Word_t ) {
2926       string val = str(cls());
2927       val = trim( val );
2928       if ( val.empty() ) {
2929      	throw ValueError( "attempt to add an empty <t> to word: " + parent->id() );
2930       }
2931     }
2932     if ( element_id() == TextContent_t ){
2933       string cls = this->cls();
2934       string st = sett();
2935       vector<TextContent*> tmp = parent->select<TextContent>( st, false );
2936       if ( any_of( tmp.begin(),
2937 		   tmp.end(),
2938 		   [cls]( const TextContent *t) { return ( t->cls() == cls);} ) ){
2939 	throw DuplicateAnnotationError( "attempt to add <t> with class="
2940 					+ cls + " to element: " + parent->id()
2941 					+ " which already has a <t> with that class" );
2942 	}
2943     }
2944     if ( is_textcontainer() ||
2945 	 element_id() == Word_t ){
2946       parent->check_append_text_consistency( this );
2947     }
2948     return true;
2949   }
2950 
assignDoc(Document * the_doc)2951   void AbstractElement::assignDoc( Document* the_doc ) {
2952     /// attach a document-less FoliaElement (-tree) to a Document the_doc
2953     /*!
2954      * \param the_doc The Document to attach to
2955      *
2956      * if the node already has a Document assigned , nothing is done.
2957      *
2958      * Otherwise: The annotation type is checked. If not set yet and
2959      * the doc has autodeclare mode set, it is attempted to do so.
2960      * For TextContent and PhonContent, a default is added too
2961      *
2962      * Also the ID is registered in the_doc.
2963      *
2964      * Finaly, all children are also assigned to the_doc
2965      */
2966     if ( !_mydoc ) {
2967       _mydoc = the_doc;
2968       if ( annotation_type() != AnnotationType::NO_ANN
2969 	   && !the_doc->version_below( 2, 0 )
2970 	   && !the_doc->declared( annotation_type() ) ){
2971 	// cerr << "assignDoc: " << this << endl;
2972 	// cerr << "ant: " << annotation_type() << endl;
2973 	// cerr << "set: " << _set << endl;
2974 	// so when appending a document-less child, make sure that
2975 	// an annotation declaration is present or added.
2976 	if ( annotation_type() ==  AnnotationType::TEXT ){
2977 	  if ( _set.empty() ){
2978 	    doc()->declare( AnnotationType::TEXT, DEFAULT_TEXT_SET );
2979 	  }
2980 	  else {
2981 	    doc()->declare( AnnotationType::TEXT, _set );
2982 	  }
2983 	}
2984 	else if ( annotation_type() == AnnotationType::PHON ){
2985 	  if ( _set.empty() ){
2986 	    doc()->declare( AnnotationType::PHON, DEFAULT_PHON_SET );
2987 	  }
2988 	  else {
2989 	    doc()->declare( AnnotationType::PHON, _set );
2990 	  }
2991 	}
2992 	else if ( doc()->autodeclare() ){
2993 	  doc()->auto_declare( annotation_type(), _set );
2994 	}
2995 	else {
2996 	  throw DeclarationError( "Encountered an instance of <"
2997 				  + xmltag()
2998 				  + "> without a proper declaration for "
2999 				  + toString(annotation_type())
3000 				  + "-annotation" );
3001 	}
3002       }
3003       if ( !_set.empty()
3004 	   && (CLASS & required_attributes() )
3005 	   && !_mydoc->declared( annotation_type(), _set ) ) {
3006 	throw DeclarationError( "Set " + _set + " is used in " + xmltag()
3007 				+ "element: " + _id
3008 				+ " but has no declaration "
3009 				+ "for " + toString( annotation_type() )
3010 				+ "-annotation" );
3011       }
3012       if ( !_id.empty() ) {
3013 	_mydoc->add_doc_index( this );
3014       }
3015       // assume that children also might be doc-less
3016       for ( const auto& el : _data ) {
3017 	el->assignDoc( _mydoc );
3018       }
3019     }
3020   }
3021 
checkAtts()3022   bool AbstractElement::checkAtts() {
3023     /// check if all the REQUIRED attributes of the node are set
3024     /*!
3025      * \return true, or throws
3026      */
3027     if ( _id.empty()
3028 	 && (ID & required_attributes() ) ) {
3029       throw ValueError( "attribute 'ID' is required for " + classname() );
3030     }
3031     if ( _set.empty()
3032 	 && (CLASS & required_attributes() ) ) {
3033       throw ValueError( "attribute 'set' is required for " + classname() );
3034     }
3035     if ( _class.empty()
3036 	 && ( CLASS & required_attributes() ) ) {
3037       throw ValueError( "attribute 'class' is required for " + classname() );
3038     }
3039     if ( _annotator.empty()
3040 	 && ( ANNOTATOR & required_attributes() ) ) {
3041       throw ValueError( "attribute 'annotator' is required for " + classname() );
3042     }
3043     if ( _annotator_type == UNDEFINED
3044 	 && ( ANNOTATOR & required_attributes() ) ) {
3045       throw ValueError( "attribute 'Annotatortype' is required for " + classname() );
3046     }
3047     if ( _confidence == -1 &&
3048 	 ( CONFIDENCE & required_attributes() ) ) {
3049       throw ValueError( "attribute 'confidence' is required for " + classname() );
3050     }
3051     if ( _n.empty()
3052 	 && ( N & required_attributes() ) ) {
3053       throw ValueError( "attribute 'n' is required for " + classname() );
3054     }
3055     if ( _datetime.empty()
3056 	 && ( DATETIME & required_attributes() ) ) {
3057       throw ValueError( "attribute 'datetime' is required for " + classname() );
3058     }
3059     if ( _begintime.empty()
3060 	 && ( BEGINTIME & required_attributes() ) ) {
3061       throw ValueError( "attribute 'begintime' is required for " + classname() );
3062     }
3063     if ( _endtime.empty()
3064 	 && ( ENDTIME & required_attributes() ) ) {
3065       throw ValueError( "attribute 'endtime' is required for " + classname() );
3066     }
3067     if ( _src.empty()
3068 	 && ( SRC & required_attributes() ) ) {
3069       throw ValueError( "attribute 'src' is required for " + classname() );
3070     }
3071     if ( _metadata.empty()
3072 	 && ( METADATA & required_attributes() ) ) {
3073       throw ValueError( "attribute 'metadata' is required for " + classname() );
3074     }
3075     if ( _speaker.empty()
3076 	 && ( SPEAKER & required_attributes() ) ) {
3077       throw ValueError( "attribute 'speaker' is required for " + classname() );
3078     }
3079     return true;
3080   }
3081 
classInit()3082   void AbstractElement::classInit(){
3083     // we could call 'init()' directly, but this is more esthetic
3084     // keep in balance with the next function
3085     init(); // virtual init
3086   }
3087 
classInit(const KWargs & a)3088   void AbstractElement::classInit( const KWargs& a ){
3089     // this funcion is needed because calling the virtual function
3090     // setAttributes from the constructor will NOT call the right version
3091     // THIS IS BY DESIGN in C++
3092     init(); // virtual init
3093     KWargs a1 = a;
3094     setAttributes( a1 ); // also virtual!
3095     checkAtts(); // check if all needed attributes are set
3096   }
3097 
append(FoliaElement * child)3098   FoliaElement *AbstractElement::append( FoliaElement *child ){
3099     /// append child to this node
3100     /*!
3101      * \param child the node to add
3102      * \return the appended child
3103      *
3104      * will throw on error
3105      */
3106     if ( !child ){
3107       throw XmlError( "attempt to append an empty node to a " + classname() );
3108     }
3109     bool ok = false;
3110     try {
3111       ok = child->addable( this );
3112     }
3113     catch ( const XmlError& ) {
3114       // don't delete the offending child in case of illegal reconnection
3115       // it will be deleted by the true parent
3116       throw;
3117     }
3118     catch ( const exception& ) {
3119       child->destroy();
3120       throw;
3121     }
3122     if ( ok ) {
3123       if ( doc() ){
3124 	child->assignDoc( doc() );
3125       }
3126       _data.push_back(child);
3127       if ( !child->parent() ) {
3128 	child->set_parent(this);
3129       }
3130       if ( child->referable() ){
3131 	child->increfcount();
3132       }
3133       if ( child->spaces_flag() == SPACE_FLAGS::UNSET ){
3134 	child->set_spaces_flag( _preserve_spaces );
3135       }
3136       return child->postappend();
3137     }
3138     return 0;
3139   }
3140 
postappend()3141   FoliaElement *AbstractElement::postappend( ) {
3142     /// perform some post correction after appending
3143     if ( id().empty() && (ID & required_attributes()) && auto_generate_id() ){
3144       _id = generateId( xmltag() );
3145     }
3146     return this;
3147   }
3148 
remove(FoliaElement * child)3149   void AbstractElement::remove( FoliaElement *child ) {
3150     /// remove a child from a node
3151     /*!
3152      * \param child the element to remove
3153      */
3154 #ifdef DE_AND_CONSTRUCT_DEBUG
3155     cerr << "\nremove " << child->xmltag() << " from " << xmltag()
3156 	 << " adres=" << (void*)this
3157 	 << " id=" << _id << " class= " << endl;
3158 #endif
3159     auto it = std::remove( _data.begin(), _data.end(), child );
3160     _data.erase( it, _data.end() );
3161   }
3162 
index(size_t i) const3163   FoliaElement* AbstractElement::index( size_t i ) const {
3164     /// return the child at index i
3165     /*!
3166      * \param i the index
3167      * \return the child at index i
3168      *
3169      * Will throw when the index is out of range
3170      */
3171     if ( i < _data.size() ) {
3172       return _data[i];
3173     }
3174     throw range_error( "[] index out of range" );
3175   }
3176 
rindex(size_t ri) const3177   FoliaElement* AbstractElement::rindex( size_t ri ) const {
3178     /// return the child at reversed index ri
3179     /*!
3180      * \param ri the index
3181      * \return the child at index ri
3182      *
3183      * Will throw when the index is out of range
3184      */
3185     if ( ri < _data.size() ) {
3186       return _data[_data.size()-1-ri];
3187     }
3188     throw range_error( "[] rindex out of range" );
3189   }
3190 
select(ElementType et,const string & st,const set<ElementType> & exclude,SELECT_FLAGS flag) const3191   vector<FoliaElement*> AbstractElement::select( ElementType et,
3192 						 const string& st,
3193 						 const set<ElementType>& exclude,
3194 						 SELECT_FLAGS flag ) const {
3195     /// The generic 'select()' function on which all other variants are based
3196     ///   it searches a FoLiA node for matchins sibblings.
3197     /*!
3198      * \param et which type of element we are looking for
3199      * \param st when not empty ("") we also must match on the 'sett' of the nodes
3200      * \param exclude a set of ElementType to exclude from searching.
3201      * These are skipped, and NOT recursed into.
3202      * \param flag determines special search stategies:
3203      *     - RECURSE : recurse the whole FoLia from the given node downwards
3204      *                 returning all matching nodes, even within matches
3205      *                 This is the default.
3206      *     - LOCAL   : just look at the direct sibblings of the node
3207      *     - TOP_HIT : like recurse, but do NOT recurse into sibblings
3208      *               of matching node
3209      */
3210     vector<FoliaElement*> res;
3211     for ( const auto& el : _data ) {
3212       if ( el->element_id() == et &&
3213 	   ( st.empty() || el->sett() == st ) ) {
3214 	res.push_back( el );
3215 	if ( flag == SELECT_FLAGS::TOP_HIT ){
3216 	  flag = SELECT_FLAGS::LOCAL;
3217 	}
3218       }
3219       if ( flag != SELECT_FLAGS::LOCAL ){
3220 	// not at this level, search deeper when recurse is true
3221 	if ( exclude.find( el->element_id() ) == exclude.end() ) {
3222 	  vector<FoliaElement*> tmp = el->select( et, st, exclude, flag );
3223 	  res.insert( res.end(), tmp.begin(), tmp.end() );
3224 	}
3225       }
3226     }
3227     return res;
3228   }
3229 
select(ElementType et,const string & st,SELECT_FLAGS flag) const3230   vector<FoliaElement*> AbstractElement::select( ElementType et,
3231 						 const string& st,
3232 						 SELECT_FLAGS flag ) const {
3233     /// wrapper around the the generic select()
3234     /*!
3235      * calls select() with a default ignore set.
3236      */
3237     return select( et, st, default_ignore, flag );
3238   }
3239 
select(ElementType et,const set<ElementType> & exclude,SELECT_FLAGS flag) const3240   vector<FoliaElement*> AbstractElement::select( ElementType et,
3241 						 const set<ElementType>& exclude,
3242 						 SELECT_FLAGS flag ) const {
3243     /// wrapper around the the generic select()
3244     /*!
3245      * calls select() with a default setname.
3246      */
3247     return select( et, "", exclude, flag );
3248   }
3249 
select(ElementType et,SELECT_FLAGS flag) const3250   vector<FoliaElement*> AbstractElement::select( ElementType et,
3251 						 SELECT_FLAGS flag ) const {
3252     /// wrapper around the the generic select()
3253     /*!
3254      * calls select() with a default setname and the default ignore set
3255      */
3256     return select( et, "", default_ignore, flag );
3257   }
3258 
unravel(set<FoliaElement * > & store)3259   void AbstractElement::unravel( set<FoliaElement*>& store ){
3260     /// split the node and all siblings into a set of nodes
3261     /*!
3262      * \param store
3263      * recursively go through this node and its children an collect all
3264      * node pointers in store.
3265      * Erase the _data array of every node
3266      *
3267      * This function is used when erasing a document. Creating a set avoids
3268      * deleting nodes twice
3269      */
3270     resetrefcount();
3271     _parent = 0;
3272     store.insert( this );
3273     auto dit = _data.begin();
3274     while ( dit != _data.end() ){
3275       (*dit)->unravel( store );
3276       dit = _data.erase(dit);
3277     }
3278   }
3279 
  FoliaElement* AbstractElement::parseXml( const xmlNode *node ) {
    /// recursively parse a FoLiA tree starting at node
    /*!
     * \param node an xmlNode representing a FoLiA subtree
     * \return the parsed tree. Throws on error.
     *
     * First the attributes of node are assigned to this element. Then
     * all child nodes are handled, depending on their XML node type:
     *  - nodes in a namespace other than NSFOLIA are skipped
     *  - element and comment nodes are created, parsed recursively and
     *    appended
     *  - entity references are added as XmlText
     *  - text nodes are added as XmlText in text/phon containers.
     *    Elsewhere only whitespace is tolerated: an XmlError is thrown
     *    for any other text.
     * Finally, for printable nodes in a document with checktext or
     * fixtext set, the text consistency is checked. (Morphemes and
     * Phonemes are excluded from that check)
     */
    KWargs att = getAttributes( node );
    // make the xml:space setting in effect for this node explicit.
    // Values other than 0 or 1 add nothing to the attributes.
    int sp = xmlNodeGetSpacePreserve(node);
    if ( sp == 1 ){
      att["xml:space"] = "preserve";
    }
    else if ( sp == 0 ){
      att["xml:space"] = "default";
    }

    setAttributes( att );
    xmlNode *p = node->children;
    while ( p ) {
      string pref;
      string ns = getNS( p, pref );
      if ( !ns.empty() && ns != NSFOLIA ){
	// skip alien nodes
	if ( doc() && doc()->debug > 2 ) {
	  cerr << "skipping non-FoLiA node: " << pref << ":" << Name(p) << endl;
	}
	p = p->next;
	continue;
      }
      if ( p->type == XML_ELEMENT_NODE ) {
	// a regular element: create a node for its tag and parse it
	string tag = Name( p );
	FoliaElement *t = createElement( tag, doc() );
	if ( t ) {
	  if ( doc() && doc()->debug > 2 ) {
	    cerr << "created " << t << endl;
	  }
	  t = t->parseXml( p );
	  if ( t ) {
	    if ( doc() && doc()->debug > 2 ) {
	      cerr << "extend " << this << " met " << t << endl;
	    }
	    append( t );
	  }
	}
	else if ( doc() && !doc()->permissive() ){
	  // no element could be created for this tag. That is fatal,
	  // except for permissive documents and when there is no document
	  throw XmlError( "FoLiA parser terminated" );
	}
      }
      else if ( p->type == XML_COMMENT_NODE ) {
	// comments are stored too, as an _XmlComment element
	string tag = "_XmlComment";
	FoliaElement *t = createElement( tag, doc() );
	if ( t ) {
	  if ( doc() && doc()->debug > 2 ) {
	    cerr << "created " << t << endl;
	  }
	  t = t->parseXml( p );
	  if ( t ) {
	    if ( doc() && doc()->debug > 2 ) {
	      cerr << "extend " << this << " met " << t << endl;
	    }
	    append( t );
	  }
	}
      }
      else if ( p->type == XML_ENTITY_REF_NODE ){
	// the value of an entity reference is added as an XmlText child
	string txt = TextValue( p );
	XmlText *t = add_child<XmlText>( txt );
	if ( doc() && doc()->debug > 2 ) {
	  cerr << "created " << t << "(" << t->text() << ")" << endl;
	  cerr << "extended " << this << " met " << t << endl;
	  cerr << "this.size()= " << size() << " t.size()=" << t->size() << endl;
	}
      }
      else if ( p->type == XML_TEXT_NODE ){
	if ( this->is_textcontainer()
	     || this->is_phoncontainer() ){
	  // non empty text is allowed (or even required) here
	  string txt = TextValue( p );
	  if ( !txt.empty() ) {
	    XmlText *t = add_child<XmlText>( txt );
	    if ( doc() && doc()->debug > 2 ) {
	      cerr << "created " << t << "(" << t->text() << ")" << endl;
	      cerr << "extended " << this << " met " << t << endl;
	      cerr << "this.size()= " << size() << " t.size()=" << t->size() << endl;
	    }
	  }
	}
	else {
	  // This MUST be 'empty space', so only spaces and tabs formatting
	  string tag = "_XmlText";
	  FoliaElement *t = createElement( tag, doc() );
	  if ( t ) {
	    if ( doc() && doc()->debug > 2 ){
	      cerr << "created " << t << endl;
	    }
	    try {
	      t = t->parseXml( p );
	    }
	    catch ( const ValueError& e ){
	      // a text node that cannot be parsed is dropped silently
	      t->destroy();
	      t = 0;
	    }
	  }
	  if ( t ) {
	    // anything left after trimming is real text, which is an error
	    // in this position
	    // NOTE(review): t is not appended yet when the XmlError below
	    // is thrown, and it isn't destroyed here. Looks like a leak,
	    // to be confirmed.
	    string txt = t->str();
	    txt = TiCC::trim(txt);
	    if ( !txt.empty() ){
	      if ( p->prev ){
		string tg = "<" + Name(p->prev) + ">";
		throw XmlError( "found extra text '" + txt + "' after element "
				+ tg + ", NOT allowed there." );
	      }
	      else {
		string tg = "<" + Name(p->parent) + ">";
		throw XmlError( "found extra text '" + txt + "' inside element "
				+ tg + ", NOT allowed there." );
	      }
	    }
	    if ( doc() && doc()->debug > 2 ){
	      cerr << "created " << t << "(" << t->text() << ")" << endl;
	      cerr << "extended " << this << " met " << t << endl;
	      cerr << "this.size()= " << size() << " t.size()=" << t->size() << endl;
	    }
	    // the whitespace-only text is kept as a child
	    append( t );
	  }
	}
      }
      p = p->next;
    }
    // all children are in place now, so the text of this node can be
    // compared to that of its children
    if ( doc() && ( doc()->checktext() || doc()->fixtext() )
	 && this->printable()
	 && !isSubClass( Morpheme_t ) && !isSubClass( Phoneme_t) ){
      check_text_consistency_while_parsing();
    }
    return this;
  }
3415 
setDateTime(const string & s)3416   void AbstractElement::setDateTime( const string& s ) {
3417     /// set the DATETIME value of a node
3418     /*!
3419      * \param s a date/time in ISO.... format. (YYYY-MM-DDThh:mm:ss)
3420      */
3421     Attrib supported = required_attributes() | optional_attributes();
3422     if ( !(DATETIME & supported) ) {
3423       throw ValueError("datetime is not supported for " + classname() );
3424     }
3425     else {
3426       string time = parseDate( s );
3427       if ( time.empty() ) {
3428 	throw ValueError( "invalid datetime, must be in YYYY-MM-DDThh:mm:ss format: " + s );
3429       }
3430       _datetime = time;
3431     }
3432   }
3433 
getDateTime() const3434   const string AbstractElement::getDateTime() const {
3435     /// return the _datetime value
3436     return _datetime;
3437   }
3438 
addPosAnnotation(const KWargs & inargs)3439   PosAnnotation *AllowInlineAnnotation::addPosAnnotation( const KWargs& inargs ) {
3440     /// add a PosAnnotation node given the parameters
3441     /*!
3442      * \param inargs A list of Attribute-Value pairs
3443      * \return the created PosAnnotation node
3444      *
3445      * when the *this node already has a PosAnnotation in the specified set,
3446      * an ALTERNATIVE node is added
3447      */
3448     KWargs args = inargs;
3449     string st;
3450     auto it = args.find("set" );
3451     if ( it != args.end() ) {
3452       st = it->second;
3453     }
3454     string newId = args.extract("generate_id" );
3455     if ( newId.empty() ){
3456       newId = "alt-pos";
3457     }
3458     if ( has_annotation<PosAnnotation>( st ) > 0 ) {
3459       // ok, there is already one, so create an Alternative
3460       KWargs kw;
3461       kw["xml:id"] = generateId( newId );
3462       if ( !doc()->declared( AnnotationType::ALTERNATIVE ) ){
3463 	doc()->declare( AnnotationType::ALTERNATIVE, "" );
3464       }
3465       Alternative *alt = new Alternative( kw, doc() );
3466       append( alt );
3467       return alt->addAnnotation<PosAnnotation>( args );
3468     }
3469     else {
3470       return addAnnotation<PosAnnotation>( args );
3471     }
3472   }
3473 
getPosAnnotations(const string & st,vector<PosAnnotation * > & alts) const3474   PosAnnotation* AllowInlineAnnotation::getPosAnnotations( const string& st,
3475 							   vector<PosAnnotation*>& alts ) const {
3476     /// return the PosAnnotation AND all alternatives
3477     /*!
3478      * \param st the annotation set
3479      * \param alts all the alternatives in set st
3480      * \return the PosAnnotation in set st
3481      *
3482      * \note The return value may be 0, even when there ARE alternatives!
3483      */
3484     PosAnnotation *res = annotation<PosAnnotation>( st ); // may be 0
3485     alts.clear();
3486     // now search for alternatives
3487     vector<Alternative *> alt_nodes = select<Alternative>( AnnoExcludeSet );
3488     for ( const auto& alt : alt_nodes ){
3489       if ( alt->size() > 0 ) { // child elements?
3490 	for ( size_t j=0; j < alt->size(); ++j ) {
3491 	  if ( alt->index(j)->element_id() == PosAnnotation_t &&
3492 	       ( st.empty() || alt->index(j)->sett() == st ) ) {
3493 	    alts.push_back( dynamic_cast<PosAnnotation*>(alt->index(j)) );
3494 	  }
3495 	}
3496       }
3497     }
3498     return res;
3499   }
3500 
addLemmaAnnotation(const KWargs & inargs)3501   LemmaAnnotation *AllowInlineAnnotation::addLemmaAnnotation( const KWargs& inargs ) {
3502     /// add a LemmaAnnotation node given the parameters
3503     /*!
3504      * \param inargs A list of Attribute-Value pairs
3505      * \return the created LemmaAnnotation node
3506      *
3507      * when the *this node already has a LemmaAnnotation in the specified set,
3508      * an ALTERNATIVE node is added
3509      */
3510     KWargs args = inargs;
3511     string st;
3512     auto it = args.find("set" );
3513     if ( it != args.end() ) {
3514       st = it->second;
3515     }
3516     string newId = args.extract("generate_id" );
3517     if ( newId.empty() ){
3518       newId = "alt-lem";
3519     }
3520     if ( has_annotation<LemmaAnnotation>( st ) > 0 ) {
3521       // ok, there is already one, so create an Alternative
3522       KWargs kw;
3523       kw["xml:id"] = generateId( newId );
3524       if ( !doc()->declared( AnnotationType::ALTERNATIVE ) ){
3525 	doc()->declare( AnnotationType::ALTERNATIVE, "" );
3526       }
3527       Alternative *alt = new Alternative( kw, doc() );
3528       append( alt );
3529       return alt->addAnnotation<LemmaAnnotation>( args );
3530     }
3531     else {
3532       return addAnnotation<LemmaAnnotation>( args );
3533     }
3534   }
3535 
getLemmaAnnotations(const string & st,vector<LemmaAnnotation * > & alts) const3536   LemmaAnnotation* AllowInlineAnnotation::getLemmaAnnotations( const string& st,
3537 							       vector<LemmaAnnotation*>& alts ) const {
3538     /// return the LemmaAnnotation AND all alternatives
3539     /*!
3540      * \param st the annotation set
3541      * \param alts all the alternatives in set st
3542      * \return the LemmaAnnotation in set st
3543      *
3544      * \note The return value may be 0, even when there ARE alternatives!
3545      */
3546     alts.clear();
3547     LemmaAnnotation *res = annotation<LemmaAnnotation>( st ); // may be 0 !
3548     // also search alternatives
3549     vector<Alternative *> alt_nodes = select<Alternative>( AnnoExcludeSet );
3550     for ( const auto& alt : alt_nodes ){
3551       if ( alt->size() > 0 ) { // child elements?
3552 	for ( size_t j =0; j < alt->size(); ++j ) {
3553 	  if ( alt->index(j)->element_id() == LemmaAnnotation_t &&
3554 	       ( st.empty() || alt->index(j)->sett() == st ) ) {
3555 	    alts.push_back( dynamic_cast<LemmaAnnotation*>(alt->index(j)) );
3556 	  }
3557 	}
3558       }
3559     }
3560     return res;
3561   }
3562 
addSentence(const KWargs & in_args)3563   Sentence *AbstractElement::addSentence( const KWargs& in_args ) {
3564     /// add a Sentence node given the parameters
3565     /*!
3566      * \param in_args A list of Attribute-Value pairs
3567      * \return the created Sentence
3568      * may throw when the 'xml:id' is nor unique
3569      */
3570     Sentence *res = 0;
3571     KWargs kw = in_args;
3572     if ( !kw.is_present("xml:id") ){
3573       string id = generateId( "s" );
3574       kw["xml:id"] = id;
3575     }
3576     try {
3577       res = new Sentence( kw, doc() );
3578     }
3579     catch( const DuplicateIDError& e ) {
3580       res->destroy();
3581       throw;
3582     }
3583     append( res );
3584     return res;
3585   }
3586 
addWord(const KWargs & in_args)3587   Word *AbstractElement::addWord( const KWargs& in_args ) {
3588     /// add a Word node given the parameters
3589     /*!
3590      * \param in_args A list of Attribute-Value pairs
3591      * \return the created Word
3592      * may throw when the 'xml:id' is nor unique, or when appending fails
3593      */
3594     Word *res = new Word( doc() );
3595     KWargs kw = in_args;
3596     if ( !kw.is_present("xml:id") ){
3597       string id = generateId( "w" );
3598       kw["xml:id"] = id;
3599     }
3600     try {
3601       res->setAttributes( kw );
3602     }
3603     catch( const DuplicateIDError& e ) {
3604       res->destroy();
3605       throw;
3606     }
3607     append( res );
3608     return res;
3609   }
3610 
addWord(const string & s)3611   Word *AbstractElement::addWord( const string& s ){
3612     /// add a Word given the string s
3613     /*!
3614       \param s a string with text OR an encode KWargs list
3615       \return a new Word
3616 
3617       If the string is a KWargs properties list it is used to create a word
3618       with those properties. Otherwise it is assumed that \em s represents
3619       the text value for the Word
3620     */
3621     KWargs args = getArgs(s);
3622     if ( args.empty() ){
3623       args["text"] = s;
3624     }
3625     return addWord( args );
3626   }
3627 
generateId(const string & tag)3628   const string AllowGenerateID::generateId( const string& tag ){
3629     /// generate an new xml:id
3630     /*!
3631      * \param tag an extra string to use in the result
3632      * \return a string with an unique id
3633      *
3634      * The new id is constructed from the elements id, or from a parent id
3635      */
3636     string nodeId = id();
3637     // cerr << "node: " << this << endl;
3638     // cerr << "ID=" << nodeId << endl;
3639     if ( nodeId.empty() ){
3640       // if no ID, look upward.
3641       FoliaElement *par = parent();
3642       if ( !par ){
3643 	throw XmlError( "unable to generate an ID. No StructureElement parent found?" );
3644       }
3645       // cerr << "call on parent:" << par << endl;
3646       return par->generateId( tag );
3647     }
3648     else {
3649       int max = 0;
3650       if ( !tag.empty() ) {
3651 	max = ++id_map[tag];
3652       }
3653       // cerr << "MAX = " << max << endl;
3654       string id = nodeId + '.' + tag + '.' +  TiCC::toString( max );
3655       // cerr << "new id = " << id << endl;
3656       return id;
3657     }
3658   }
3659 
setMaxId(FoliaElement * child)3660   void AllowGenerateID::setMaxId( FoliaElement *child ) {
3661     /// register the child id for later use
3662     /*!
3663       * \param child
3664       * if the child has an id, try to extract the last part as a number
3665       * if so, check the registration of that numer for the childs tag
3666       */
3667     if ( !child->id().empty() && !child->xmltag().empty() ) {
3668       vector<string> parts = TiCC::split_at( child->id(), "." );
3669       if ( !parts.empty() ) {
3670 	string val = parts.back();
3671 	int i;
3672 	try {
3673 	  i = stringTo<int>( val );
3674 	}
3675 	catch ( const exception& ) {
3676 	  // no number, so assume some user defined id
3677 	  return;
3678 	}
3679 	const auto& it = id_map.find( child->xmltag() );
3680 	if ( it == id_map.end() ) {
3681 	  id_map[child->xmltag()] = i;
3682 	}
3683 	else {
3684 	  if ( it->second < i ) {
3685 	    it->second = i;
3686 	  }
3687 	}
3688       }
3689     }
3690   }
3691 
3692   //#define DEBUG_CORRECT 1
3693 
correct(const vector<FoliaElement * > & _original,const vector<FoliaElement * > & _current,const vector<FoliaElement * > & _newv,const vector<FoliaElement * > & _suggestions,const KWargs & args_in)3694   Correction * AllowCorrections::correct( const vector<FoliaElement*>& _original,
3695 					  const vector<FoliaElement*>& _current,
3696 					  const vector<FoliaElement*>& _newv,
3697 					  const vector<FoliaElement*>& _suggestions,
3698 					  const KWargs& args_in ) {
3699     /// generic function to correct a group of FoliaElements into a Correction
3700     /*!
3701      * \param _original a group of nodes to correct and add to the Original
3702      * \param _current a group of nodes to add to the Current
3703      * \param _newv a group of nodes to replace _original, added to New
3704      * \param _suggestions a group of nodes to add to Suggestions
3705      * \param args_in additional arguments
3706      * \return the Correction node. Might throw on problems
3707      */
3708 #ifdef DEBUG_CORRECT
3709     cerr << "correct " << this << endl;
3710     cerr << "original= " << _original << endl;
3711     cerr << "current = " << _current << endl;
3712     cerr << "new     = " << _newv << endl;
3713     cerr << "suggestions     = " << _suggestions << endl;
3714     cerr << "args in     = " << args_in << endl;
3715 #endif
3716     // Apply a correction
3717     Document *doc = this->doc();
3718     Correction *corr = 0;
3719     bool hooked = false;
3720     New *addnew = 0;
3721     KWargs args = args_in;
3722     vector<FoliaElement*> original = _original;
3723     vector<FoliaElement*> _new = _newv;
3724     vector<FoliaElement*> suggestions = _suggestions;
3725     auto it = args.find("new");
3726     if ( it != args.end() ) {
3727       KWargs my_args;
3728       my_args["value"] = it->second;
3729       TextContent *t = new TextContent( my_args, doc );
3730       _new.push_back( t );
3731       args.erase( it );
3732     }
3733     it = args.find("suggestion");
3734     if ( it != args.end() ) {
3735       KWargs my_args;
3736       my_args["value"] = it->second;
3737       TextContent *t = new TextContent( my_args, doc );
3738       suggestions.push_back( t );
3739       args.erase( it );
3740     }
3741     it = args.find("reuse");
3742     if ( it != args.end() ) {
3743       // reuse an existing correction instead of making a new one
3744       try {
3745 	corr = dynamic_cast<Correction*>(doc->index(it->second));
3746       }
3747       catch ( const exception& e ) {
3748 	throw ValueError("reuse= must point to an existing correction id!");
3749       }
3750       if ( !corr->isinstance( Correction_t ) ) {
3751 	throw ValueError("reuse= must point to an existing correction id!");
3752       }
3753       hooked = true;
3754       if ( !_new.empty() && corr->hasCurrent() ) {
3755 	// can't add new if there's current, so first set original to current, and then delete current
3756 
3757 	if ( !_current.empty() ) {
3758 	  throw runtime_error( "Can't set both new= and current= !");
3759 	}
3760 	if ( original.empty() ) {
3761 	  // move the current to Original
3762 	  FoliaElement *cur = corr->getCurrent();
3763 	  original.push_back( cur );
3764 	  corr->remove( cur );
3765 	}
3766       }
3767     }
3768     else {
3769       KWargs args2 = args;
3770       args2.erase("suggestion" );
3771       args2.erase("suggestions" );
3772       string id = generateId( "correction" );
3773       args2["xml:id"] = id;
3774       corr = new Correction( args2, doc );
3775     }
3776 #ifdef DEBUG_CORRECT
3777     cerr << "now corr= " << corr << endl;
3778 #endif
3779     if ( !_current.empty() ) {
3780       if ( !original.empty() || !_new.empty() ) {
3781 	throw runtime_error("When setting current=, original= and new= can not be set!");
3782       }
3783       for ( const auto& cur : _current ) {
3784 	FoliaElement *add = new Current( doc );
3785 	cur->set_parent(0);
3786 	add->append( cur );
3787 	corr->replace( add );
3788 	if ( !hooked ) {
3789 	  for ( size_t i=0; i < size(); ++i ) {
3790 	    if ( index(i) == cur ) {
3791 	      replace( index(i), corr );
3792 	      hooked = true;
3793 	    }
3794 	  }
3795 	}
3796       }
3797 #ifdef DEBUG_CORRECT
3798       cerr << "now corr= " << corr << endl;
3799 #endif
3800     }
3801     if ( !_new.empty() ) {
3802 #ifdef DEBUG_CORRECT
3803       cerr << "there is new! " << endl;
3804 #endif
3805       vector<New*> old_new = corr->select<New>();
3806       if ( !old_new.empty() && old_new[0]->size() == 0 ){
3807 	// there is an EMPTY <new> tag!
3808 	// use it to expand
3809 	addnew = old_new[0];
3810       }
3811       else {
3812 	// create a <new> tag, might throw is there is a non-empty one
3813 	addnew = new New( doc );
3814 	corr->append(addnew);
3815       }
3816       for ( const auto& nw : _new ) {
3817 	nw->set_parent(0);
3818 	addnew->append( nw );
3819       }
3820 #ifdef DEBUG_CORRECT
3821       cerr << "after adding NEW: " << corr->xmlstring() << endl;
3822 #endif
3823       vector<Current*> v = corr->FoliaElement::select<Current>();
3824       //delete current if present
3825       for ( const auto& cur:v ) {
3826 	corr->remove( cur );
3827       }
3828 #ifdef DEBUG_CORRECT
3829       cerr << "after removing CUR: " << corr->xmlstring() << endl;
3830 #endif
3831     }
3832     else if ( !original.empty() ){
3833       vector<New*> old_new = corr->select<New>();
3834       if ( !old_new.empty() && old_new[0]->size() == 0 ){
3835 	// there is aleady an EMPTY <new> tag!
3836       }
3837       else {
3838 	// create a <new> tag, might throw is there is a non-empty one
3839 	New *add_new = new New( doc );
3840 	corr->append(add_new);
3841       }
3842     }
3843     if ( !original.empty() ) {
3844 #ifdef DEBUG_CORRECT
3845       cerr << "there is original! " << endl;
3846 #endif
3847       FoliaElement *add = new Original( doc );
3848       corr->replace(add);
3849 #ifdef DEBUG_CORRECT
3850       cerr << " corr after replacing original " << corr->xmlstring() << endl;
3851       cerr << " new original= " << add << endl;
3852 #endif
3853       for ( const auto& org: original ) {
3854 #ifdef DEBUG_CORRECT
3855 	cerr << " examine org " << org << endl;
3856 #endif
3857 	bool dummyNode = ( org->id() == "dummy" );
3858 	if ( !dummyNode ) {
3859 	  org->set_parent(0);
3860 	  add->append( org );
3861 	}
3862 #ifdef DEBUG_CORRECT
3863 	cerr << " NOW original= " << add << endl;
3864 #endif
3865 	for ( size_t i=0; i < size(); ++i ) {
3866 #ifdef DEBUG_CORRECT
3867 	  cerr << "in loop, bekijk " << index(i) << endl;
3868 #endif
3869 	  if ( index(i) == org ) {
3870 #ifdef DEBUG_CORRECT
3871 	    cerr << "OK hit on ORG :" << org << endl;
3872 #endif
3873 	    if ( !hooked ) {
3874 #ifdef DEBUG_CORRECT
3875 	      cerr << "it isn't hooked!" << endl;
3876 	      FoliaElement *tmp = replace( index(i), corr );
3877 	      cerr << " corr after replace " << corr->xmlstring() << endl;
3878 	      cerr << " replaced " << tmp << endl;
3879 #else
3880 	      replace( index(i), corr );
3881 #endif
3882 	      hooked = true;
3883 	    }
3884 	    else {
3885 #ifdef DEBUG_CORRECT
3886 	      cerr << " corr before remove " << corr << endl;
3887 	      cerr << " remove  " << org << endl;
3888 #endif
3889 	      this->remove( org );
3890 #ifdef DEBUG_CORRECT
3891 	      cerr << " corr after remove " << corr << endl;
3892 #endif
3893 	    }
3894 	  }
3895 	}
3896       }
3897       if ( add->size() == 0 ){
3898 	corr->remove( add );
3899       }
3900     }
3901     else if ( addnew ) {
3902       // original not specified, find automagically:
3903       vector<FoliaElement *> orig;
3904 #ifdef DEBUG_CORRECT
3905       cerr << "start to look for original " << endl;
3906 #endif
3907       for ( size_t i=0; i < len(addnew); ++ i ) {
3908 	FoliaElement *p = addnew->index(i);
3909 #ifdef DEBUG_CORRECT
3910 	cerr << "bekijk " << p << endl;
3911 #endif
3912 	vector<FoliaElement*> v = p->find_replacables( this );
3913 	// for ( const auto& el: v ) {
3914 	//   orig.push_back( el );
3915 	// }
3916 	copy( v.begin(), v.end(), back_inserter(orig) );
3917       }
3918       if ( orig.empty() ) {
3919 	throw runtime_error( "No original= specified and unable to automatically infer");
3920       }
3921       else {
3922 #ifdef DEBUG_CORRECT
3923 	cerr << "we seem to have some originals! " << endl;
3924 #endif
3925 	FoliaElement *add = new Original( doc );
3926 #ifdef DEBUG_CORRECT
3927 	cerr << "corr before adding new original! " << corr << endl;
3928 #endif
3929 	corr->replace(add);
3930 #ifdef DEBUG_CORRECT
3931 	cerr << "corr after adding new original! " << corr << endl;
3932 	cerr << "now parent = " << add->parent() << endl;
3933 #endif
3934 
3935 	for ( const auto& org: orig ) {
3936 #ifdef DEBUG_CORRECT
3937 	  cerr << " examine original : " << org << endl;
3938 	  cerr << "with parent = " << org->parent() << endl;
3939 #endif
3940 	  // first we lookup org in our data and remove it there
3941 	  for ( size_t i=0; i < size(); ++i ) {
3942 #ifdef DEBUG_CORRECT
3943 	    cerr << "in loop, bekijk " << index(i) << endl;
3944 #endif
3945 	    if ( index(i) == org ) {
3946 #ifdef DEBUG_CORRECT
3947 	      cerr << "found original " << endl;
3948 #endif
3949 	      if ( !hooked ) {
3950 #ifdef DEBUG_CORRECT
3951 		cerr << "it isn't hooked!" << endl;
3952 		FoliaElement *tmp = replace( index(i), corr );
3953 		cerr << " corr after replace " << corr << endl;
3954 		cerr << " replaced " << tmp << endl;
3955 #else
3956 		replace( index(i), corr );
3957 #endif
3958 
3959 		hooked = true;
3960 	      }
3961 	      else {
3962 #ifdef DEBUG_CORRECT
3963 		cerr << " corr before remove " << corr << endl;
3964 		cerr << " remove  " << org << endl;
3965 #endif
3966 		this->remove( org );
3967 #ifdef DEBUG_CORRECT
3968 		cerr << " corr after remove " << corr << endl;
3969 #endif
3970 	      }
3971 	    }
3972 	  }
3973 	  // now we conect org to the new original node
3974 	  org->set_parent( 0 );
3975 	  add->append( org );
3976 #ifdef DEBUG_CORRECT
3977 	  cerr << " add after append : " << add << endl;
3978 	  cerr << "parent = " << org->parent() << endl;
3979 #endif
3980 	}
3981 	vector<Current*> v = corr->FoliaElement::select<Current>();
3982 	//delete current if present
3983 	for ( const auto& cur: v ) {
3984 #ifdef DEBUG_CORRECT
3985 	  cerr << " remove cur=" << cur << endl;
3986 #endif
3987 	  this->remove( cur );
3988 	}
3989       }
3990     }
3991 #ifdef DEBUG_CORRECT
3992     cerr << " corr after edits " << corr->xmlstring() << endl;
3993 #endif
3994     if ( addnew ) {
3995       for ( const auto& org : original ) {
3996 #ifdef DEBUG_CORRECT
3997 	cerr << " remove  " << org << endl;
3998 #endif
3999 	bool dummyNode = ( org->id() == "dummy" );
4000 	corr->remove( org );
4001 	if ( dummyNode ){
4002 	  org->destroy();
4003 	}
4004       }
4005     }
4006 #ifdef DEBUG_CORRECT
4007     cerr << " corr after removes " << corr->xmlstring() << endl;
4008 #endif
4009     if ( !suggestions.empty() ) {
4010       if ( !hooked ) {
4011 	append(corr);
4012       }
4013       for ( const auto& sug : suggestions ) {
4014 	if ( sug->isinstance( Suggestion_t ) ) {
4015 	  sug->set_parent(0);
4016 	  corr->append( sug );
4017 	}
4018 	else {
4019 	  FoliaElement *add = new Suggestion( doc );
4020 	  sug->set_parent(0);
4021 	  add->append( sug );
4022 	  corr->append( add );
4023 	}
4024       }
4025     }
4026 
4027     it = args.find("reuse");
4028     if ( it != args.end() ) {
4029       it = args.find("annotator");
4030       if ( it != args.end() ) {
4031 	corr->annotator( it->second );
4032       }
4033       it = args.find("annotatortype");
4034       if ( it != args.end() ){
4035 	corr->annotatortype( stringTo<AnnotatorType>(it->second) );
4036       }
4037       it = args.find("confidence");
4038       if ( it != args.end() ) {
4039 	corr->confidence( stringTo<double>(it->second) );
4040       }
4041     }
4042     return corr;
4043   }
4044 
correct(const string & s)4045   Correction *AllowCorrections::correct( const string& s ) {
4046     /// use an Attribute-Value list to create a Correction
4047     /*!
4048      * \param s a string representation of a Attribute-Value list
4049      * \return the created Correcion
4050      * The parameter is converted to a KWargs list which is handled over
4051      * to correct()
4052      */
4053     vector<FoliaElement*> nil1;
4054     vector<FoliaElement*> nil2;
4055     vector<FoliaElement*> nil3;
4056     vector<FoliaElement*> nil4;
4057     KWargs args = getArgs( s );
4058     //    cerr << xmltag() << "::correct() <== " << this << endl;
4059     Correction *tmp = correct( nil1, nil2, nil3, nil4, args );
4060     //    cerr << xmltag() << "::correct() ==> " << this << endl;
4061     return tmp;
4062   }
4063 
correct(FoliaElement * _old,FoliaElement * _new,const vector<FoliaElement * > & sugg,const KWargs & args)4064   Correction *AllowCorrections::correct( FoliaElement *_old,
4065 					 FoliaElement *_new,
4066 					 const vector<FoliaElement*>& sugg,
4067 					 const KWargs& args ) {
4068     /// create a correction using the parameters
4069     /*!
4070      * \param _old the node to correct
4071      * \param _new the corrected node
4072      * \param sugg a list of possible suggestions
4073      * \param args additonal arguments
4074      * \return the created Correcion
4075      */
4076     vector<FoliaElement *> nv;
4077     nv.push_back( _new );
4078     vector<FoliaElement *> ov;
4079     ov.push_back( _old );
4080     vector<FoliaElement *> nil;
4081     //    cerr << xmltag() << "::correct() <== " << this << endl;
4082     Correction *tmp = correct( ov, nil, nv, sugg, args );
4083     //    cerr << xmltag() << "::correct() ==> " << this << endl;
4084     return tmp;
4085   }
4086 
correct(FoliaElement * _old,FoliaElement * _new,const KWargs & args)4087   Correction *AllowCorrections::correct( FoliaElement* _old,
4088 					 FoliaElement* _new,
4089 					 const KWargs& args ) {
4090     /// create a correction using the parameters
4091     /*!
4092      * \param _old the node to correct
4093      * \param _new the corrected node
4094      * \param args additonal arguments
4095      * \return the created Correcion
4096      */
4097     const vector<FoliaElement*> sugg;
4098     return correct( _old, _new, sugg, args );
4099   }
4100 
feats(const string & s) const4101   vector<string> AbstractElement::feats( const string& s ) const {
4102     /// return all classes of the given subset
4103     /*!
4104      * \param s a subset name
4105      * \return a list of all classes in the subset of the Feature nodes
4106      * The function loops through all children and for Feature_t children
4107      * it check the subset and collects the matching ones
4108      */
4109     vector<string> result;
4110     for ( const auto& el : data() ) {
4111       if ( el->isSubClass( Feature_t ) &&
4112 	   el->subset() == s ) {
4113 	result.push_back( el->cls() );
4114       }
4115     }
4116     return result;
4117   }
4118 
feat(const string & s) const4119   const string AbstractElement::feat( const string& s ) const {
4120     /// return the class of the first matching Feature with subset s
4121     /*!
4122      * \param s a subset name
4123      * \return the first class of the first Feature node in subset s
4124      */
4125     const auto& it = find_if( _data.begin(), _data.end(),
4126 			      [s]( const FoliaElement *e ){
4127 				return ( e->isSubClass( Feature_t )
4128 					 && e->subset() == s ); } );
4129     if ( it == _data.end() ){
4130       return "";
4131     }
4132     else {
4133       return (*it)->cls();
4134     }
4135   }
get_metadata() const4136   const MetaData* AbstractElement::get_metadata() const {
4137     /// Get the MetaData node related to this element
4138     /*!
4139      * \return the _metadata or 0 if not available
4140      * may recurse upwards through the parent nodes
4141      */
4142     if ( !_metadata.empty() && doc() ){
4143       return doc()->get_submetadata(_metadata);
4144     }
4145     else if ( parent() ){
4146       return parent()->get_metadata();
4147     }
4148     else {
4149       return 0;
4150     }
4151   }
4152 
get_metadata(const string & key) const4153   const string AbstractElement::get_metadata( const string& key ) const {
4154     /// Get the metadata value for this key
4155     /*!
4156      * \param key which metadata field do we want?
4157      * \return the metadata value for this key
4158      */
4159     if ( !_metadata.empty() && doc() ){
4160       const MetaData *what = doc()->get_submetadata(_metadata);
4161       if ( what && what->datatype() == "NativeMetaData" && !key.empty() ){
4162 	return what->get_val( key );
4163       }
4164       return "";
4165     }
4166     else if ( parent() ){
4167       return parent()->get_metadata( key );
4168     }
4169     else {
4170       return "";
4171     }
4172   }
4173 
selectSpan() const4174   vector<AbstractSpanAnnotation*> AbstractElement::selectSpan() const {
4175     /// select all SpanAnnotation nodes in the FoliaElement
4176     /*!
4177      * \return a list of SpanAnnotation nodes.
4178      * All possible Span types are collected in this list. (see SpanSet)
4179      */
4180     vector<AbstractSpanAnnotation*> res;
4181     for ( const auto& el : SpanSet ) {
4182       vector<FoliaElement*> tmp = select( el );
4183       transform( tmp.begin(), tmp.end(),
4184 		 back_inserter(res),
4185 		 [&]( FoliaElement *e ){
4186 		   return dynamic_cast<AbstractSpanAnnotation*>( e ); } );
4187     }
4188     return res;
4189   }
4190 
4191 } // namespace folia
4192