1 /*
2   Copyright (c) 2006 - 2021
3   CLST  - Radboud University
4   ILK   - Tilburg University
5 
6   This file is part of libfolia
7 
8   libfolia is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   libfolia is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program; if not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ticcutils/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 */
26 #include <cassert>
27 #include <cstdlib>
28 #include <iostream>
29 #include <fstream>
30 #include <string>
31 #include <algorithm>
32 #include <vector>
33 #include <map>
34 #include <stdexcept>
35 #include "config.h"
36 #include "ticcutils/PrettyPrint.h"
37 #include "ticcutils/XMLtools.h"
38 #include "ticcutils/StringOps.h"
39 #include "ticcutils/Unicode.h"
40 #include "ticcutils/zipper.h"
41 #include "libfolia/folia.h"
42 #include "libfolia/folia_properties.h"
43 #include "libxml/xmlstring.h"
44 
45 using namespace std;
46 using namespace icu;
47 
/// the encoding used for all XML output. No other encoding is supported
const char *output_encoding{ "UTF-8" };
50 
51 namespace folia {
52   using TiCC::operator<<;
53 
operator <<(ostream & os,const Document::at_t & at)54   ostream& operator<<( ostream& os, const Document::at_t& at ){
55     /// output an at_t structure (Debugging only)
56     /*!
57       \param os the output stream
58       \param at the at_t object
59     */
60     os << "<" << at._annotator << "," << TiCC::toString(at._ann_type)
61        << "," << at._date << "," << at._processors << ">";
62     return os;
63   }
64 
Document()65   Document::Document(){
66     /// create and initalize a FoLiA Document.
67     init();
68   }
69 
init_args(const KWargs & kwargs)70   void Document::init_args( const KWargs& kwargs ){
71     /// init some Document properties from a key-value list
72     /*!
73       \param kwargs a list of key-value pairs
74 
75       this function initializes a Document and can set the attributes
76       \e 'debug' and \e 'mode'
77 
78       When the attributes \e 'file' or \e 'string' are found, the value is used
79       to extract a complete FoLiA document from that file or string.
80     */
81     init();
82     KWargs args = kwargs;
83     string value = args.extract( "debug" );
84     if ( !value.empty() ){
85       debug = TiCC::stringTo<int>( value );
86     }
87     value = args.extract( "mode" );
88     if ( !value.empty() ){
89       setmode( value );
90     }
91     value = args.extract( "file" );
92     if ( !value.empty() ){
93       // extract a Document from a file
94       read_from_file( value );
95     }
96     else {
97       value = args.extract( "string" );
98       if ( !value.empty() ){
99 	// extract a Document from a string
100 	read_from_string( value );
101       }
102     }
103     if ( !foliadoc ){
104       // so NO 'file' or 'string' argument.
105       // (read_from_file/read_from_string create a foliadoc OR throw )
106       if ( args.find( "version" ) == args.end() ){
107 	// no version attribute. set it to the current default
108 	args["version"] = folia_version();
109       }
110       // create an 'empty' document using the args, with a FoLiA root node.
111       foliadoc = new FoLiA( args, this );
112     }
113   }
114 
Document(const KWargs & kwargs)115   Document::Document( const KWargs& kwargs ) {
116     /// initialize a Document using an attribute-value list
117     /*!
118       \param kwargs an attribute-value list
119      */
120     init_args( kwargs );
121   }
122 
Document(const string & s)123   Document::Document( const string& s ) {
124     /// initialize a Document using a string (filename or attribute-value list)
125     /*!
126       \param s a string representing a filename OR an attribute value list
127 
128       the string \e s can be a string encoded attribute value list OR just a
129       filename.
130 
131       Some examples:
132 
133       Document doc("my_first.folia.xml") creates a Document doc from
134       the file \e my_first.folia.xml.
135 
136       Document doc( "file='my_first.folia.xml', debug='3', mode='nochecktext'" ) This creates a document from the file \e my_first.folia.xml with a
137       debugging level of 3 and textchecking set to OFF
138 
139       Document doc( "xml:id='test'" ) creates a yet empty document with a
140       document ID with value 'test'
141 
142     */
143     KWargs args = getArgs(s);
144     if ( args.empty() ){
145       args["file"] = s;
146     }
147     init_args( args );
148   }
149 
folia_version()150   string folia_version(){
151     /// return the FoLiA version of this build
152     stringstream ss;
153     ss << MAJOR_VERSION << "." << MINOR_VERSION << "." << SUB_VERSION;
154     return ss.str();
155   }
156 
doc_version() const157   string Document::doc_version() const {
158     /// return the FoLiA version of this Document
159     stringstream ss;
160     ss << _major_version << "." << _minor_version << "." << _sub_version;
161     return ss.str();
162   }
163 
library_version()164   string library_version(){
165     /// return the version of the library
166     return VERSION;
167   }
168 
update_version()169   string Document::update_version(){
170     /// override the document version with the version of the build
171     /*!
172       \return the old value of the documents version
173     */
174     string old = _version_string;
175     _version_string = folia_version();
176     return old;
177   }
178 
init()179   void Document::init(){
180     /// initialize a Document structure with default values
181     _metadata = 0;
182     _foreign_metadata = 0;
183     _provenance = 0;
184     _xmldoc = 0;
185     foliadoc = 0;
186     _foliaNsIn_href = 0;
187     _foliaNsIn_prefix = 0;
188     _foliaNsOut = 0;
189     debug = 0;
190     mode = Mode( CHECKTEXT|AUTODECLARE );
191     _external_document = false;
192     _incremental_parse = false;
193     _preserve_spaces = false;
194     _warn_count = 0;
195     _major_version = 0;
196     _minor_version = 0;
197     _sub_version = 0;
198   }
199 
~Document()200   Document::~Document(){
201     /// Destroy a Document structure including al it's members
202     /*!
203       This also finally deletes FoLiA nodes that were marked for deletion
204       but not yet really destroyed. (because they might still be referenced)
205      */
206     xmlFreeDoc( _xmldoc );
207     xmlFree( (xmlChar*)_foliaNsIn_href );
208     xmlFree( (xmlChar*)_foliaNsIn_prefix );
209     sindex.clear();
210     if ( foliadoc ){
211       foliadoc->destroy();
212     }
213     set<FoliaElement*> bulk;
214     for ( const auto& it : delSet ){
215       it->unravel( bulk );
216     }
217     for ( const auto& it : bulk ){
218       it->destroy();
219     }
220     delete _metadata;
221     delete _foreign_metadata;
222     for ( const auto& it : submetadata ){
223       delete it.second;
224     }
225     delete _provenance;
226   }
227 
setmode(const string & ms) const228   void Document::setmode( const string& ms ) const {
229     /// Sets the  mode attributes of a document
230     /*!
231       \param ms an encoded string of attribute-values pairs giving modes
232       \note mode is mutable, so this even sets mode on CONST documents!
233 
234       The following modes can be set:
235       '(no)permissive' (default is NO),
236       '(no)strip' (default is NO),
237       '(no)canonical' (default is NO),
238       '(no)checktext' (default is checktext),
239       '(no)fixtext' (default is NO),
240       '(no)autodeclare' (default is NO)
241 
242       example:
243 
244       doc.setmode( "strip,nochecktext,autodeclare" );
245     */
246     vector<string> modev = TiCC::split_at( ms, "," );
247     for ( const auto& mod : modev ){
248       if ( mod == "permissive" ){
249 	mode = Mode( (int)mode | PERMISSIVE );
250       }
251       else if ( mod == "nopermissive" ){
252 	mode = Mode( (int)mode & ~PERMISSIVE );
253       }
254       else if ( mod == "strip" ){
255 	mode = Mode( (int)mode | STRIP );
256       }
257       else if ( mod == "nostrip" ){
258 	mode = Mode( (int)mode & ~STRIP );
259       }
260       else if ( mod == "canonical" ){
261 	mode = Mode( (int)mode | CANONICAL );
262       }
263       else if ( mod == "nocanonical" ){
264 	mode = Mode( (int)mode & ~CANONICAL );
265       }
266       else if ( mod == "kanon" ){ // backward compatible
267 	mode = Mode( (int)mode | CANONICAL );
268       }
269       else if ( mod == "nokanon" ){ // backward compatible
270 	mode = Mode( (int)mode & ~CANONICAL );
271       }
272       else if ( mod == "checktext" ){
273 	mode = Mode( int(mode) | CHECKTEXT );
274       }
275       else if ( mod == "nochecktext" ){
276 	mode = Mode( int(mode) & ~CHECKTEXT );
277       }
278       else if ( mod == "fixtext" ){
279 	mode = Mode( int(mode) | FIXTEXT );
280       }
281       else if ( mod == "nofixtext" ){
282 	mode = Mode( int(mode) & ~FIXTEXT );
283       }
284       else if ( mod == "autodeclare" ){
285 	mode = Mode( int(mode) | AUTODECLARE );
286       }
287       else if ( mod == "noautodeclare" ){
288 	mode = Mode( int(mode) & ~AUTODECLARE );
289       }
290       else if ( mod == "explicit" ){
291 	mode = Mode( int(mode) | EXPLICIT );
292       }
293       else if ( mod == "noexplicit" ){
294 	mode = Mode( int(mode) & ~EXPLICIT );
295       }
296       else {
297 	throw invalid_argument( "FoLiA::Document: unsupported mode value: "+ mod );
298       }
299     }
300   }
301 
getmode() const302   string Document::getmode() const {
303     /// returns the curent mode(s) as a string
304     /*!
305       \return a string explaining the modes set
306 
307       example:
308 
309       doc.getmode() might return: "mode=strip,nohecktext,autodeclare,"
310      */
311     string result = "mode=";
312     if ( mode & PERMISSIVE ){
313       result += "permissive,";
314     }
315     if ( mode & STRIP ){
316       result += "strip,";
317     }
318     if ( mode & CHECKTEXT ){
319       result += "checktext,";
320     }
321     else {
322       result += "nochecktext,";
323     }
324     if ( mode & FIXTEXT ){
325       result += "fixtext,";
326     }
327     if ( mode & CANONICAL ){
328       result += "canonical,";
329     }
330     if ( mode & AUTODECLARE ){
331       result += "autodeclare,";
332     }
333     else {
334       result += "noautodeclare,";
335     }
336     if ( mode & EXPLICIT ){
337       result += "explicit,";
338     }
339     return result;
340   }
341 
set_strip(bool new_val) const342   bool Document::set_strip( bool new_val ) const{
343     /// sets the 'strip' mode to on/off
344     /*!
345       \param new_val the boolean to use for on/off
346       \return the previous value
347     */
348     bool old_val = (mode & STRIP);
349     if ( new_val ){
350       mode = Mode( (int)mode | STRIP );
351     }
352     else {
353       mode = Mode( (int)mode & ~STRIP );
354     }
355     return old_val;
356   }
357 
set_permissive(bool new_val) const358   bool Document::set_permissive( bool new_val ) const{
359     /// sets the 'permissive' mode to on/off
360     /*!
361       \param new_val the boolean to use for on/off
362       \return the previous value
363     */
364     bool old_val = (mode & PERMISSIVE);
365     if ( new_val ){
366       mode = Mode( (int)mode | PERMISSIVE );
367     }
368     else {
369       mode = Mode( (int)mode & ~PERMISSIVE );
370     }
371     return old_val;
372   }
373 
set_checktext(bool new_val) const374   bool Document::set_checktext( bool new_val ) const{
375     /// sets the 'checktext' mode to on/off
376     /*!
377       \param new_val the boolean to use for on/off
378       \return the previous value
379     */
380     bool old_val = (mode & CHECKTEXT);
381     if ( new_val ){
382       mode = Mode( (int)mode | CHECKTEXT );
383     }
384     else {
385       mode = Mode( (int)mode & ~CHECKTEXT );
386     }
387     return old_val;
388   }
389 
390 
set_fixtext(bool new_val) const391   bool Document::set_fixtext( bool new_val ) const{
392     /// sets the 'fixtext' mode to on/off
393     /*!
394       \param new_val the boolean to use for on/off
395       \return the previous value
396     */
397     bool old_val = (mode & FIXTEXT);
398     if ( new_val ){
399       mode = Mode( (int)mode | FIXTEXT );
400     }
401     else {
402       mode = Mode( (int)mode & ~FIXTEXT );
403     }
404     return old_val;
405   }
406 
set_canonical(bool new_val) const407   bool Document::set_canonical( bool new_val ) const{
408     /// sets the 'canonical' mode to on/off
409     /*!
410       \param new_val the boolean to use for on/off
411       \return the previous value
412     */
413     bool old_val = (mode & CANONICAL);
414     if ( new_val ){
415       mode = Mode( (int)mode | CANONICAL );
416     }
417     else {
418       mode = Mode( (int)mode & ~CANONICAL );
419     }
420     return old_val;
421   }
422 
set_autodeclare(bool new_val) const423   bool Document::set_autodeclare( bool new_val ) const{
424     /// sets the 'autodeclare' mode to on/off
425     /*!
426       \param new_val the boolean to use for on/off
427       \return the previous value
428     */
429     bool old_val = (mode & AUTODECLARE);
430     if ( new_val ){
431       mode = Mode( (int)mode | AUTODECLARE );
432     }
433     else {
434       mode = Mode( (int)mode & ~AUTODECLARE );
435     }
436     return old_val;
437   }
438 
set_explicit(bool new_val) const439   bool Document::set_explicit( bool new_val ) const{
440     /// sets the 'explicit' mode to on/off
441     /*!
442       \param new_val the boolean to use for on/off
443       \return the previous value
444     */
445     bool old_val = (mode & EXPLICIT);
446     if ( new_val ){
447       mode = Mode( (int)mode | EXPLICIT );
448     }
449     else {
450       mode = Mode( (int)mode & ~EXPLICIT );
451     }
452     return old_val;
453   }
454 
add_doc_index(FoliaElement * el)455   void Document::add_doc_index( FoliaElement* el ){
456     /// add a FoliaElement to the index
457     /*!
458       \param el the FoliaElement to add
459       will throw when \em el->id() is already in the index
460      */
461     const string id = el->id();
462     if ( id.empty() ) {
463       return;
464     }
465     auto it = sindex.find( id );
466     if ( it == sindex.end() ){
467       sindex[id] = el;
468     }
469     else {
470       throw DuplicateIDError( id );
471     }
472   }
473 
del_doc_index(const string & id)474   void Document::del_doc_index( const string& id ){
475     /// remove an id from the index
476     /*!
477       \param id The id to remove
478     */
479     if ( sindex.empty() ){
480       // only when ~Document is in progress
481       return;
482     }
483     if ( id.empty() ) {
484       return;
485     }
486     sindex.erase(id);
487   }
488 
annotation_type_to_string(AnnotationType ann) const489   string Document::annotation_type_to_string( AnnotationType ann ) const {
490     /// return the ANNOTATIONTYPE translated to a string in a Document context.
491     /// takes the version into account, for older labels
492     /*!
493       \param ann the annotationtype
494       \return a string representation of \e ann.
495 
496       Taking into account the version of the Dcocument, translating to
497       old labels for pre 1.6 versions
498     */
499     const string& result = toString( ann );
500     if ( version_below(1,6) ){
501       const auto& it = reverse_old.find(result);
502       if ( it != reverse_old.end() ){
503 	return it->second;
504       }
505     }
506     return result;
507   }
508 
error_sink(void * mydata,xmlError * error)509   static void error_sink(void *mydata, xmlError *error ){
510     /// helper function for libxml2 to catch and display problems in an
511     /// orderly fashion
512     /*!
513       \param a pointer to a struct to hold persisten data. In our case just an
514       int.
515       \param error an xmlEror structure created by a libxml2 function
516 
517       For the first error encountered, a message is sent to stderr. Further
518       errors are just counted. It is up to calling functions to react on a
519       a count > 0
520      */
521     int *cnt = (int*)mydata;
522     if ( *cnt == 0 ){
523       string line = "\n";
524       if ( error->file ){
525 	line += string(error->file) + ":";
526 	if ( error->line > 0 ){
527 	  line += TiCC::toString(error->line) + ":";
528 	}
529       }
530       line += " XML-error: " + string(error->message);
531       cerr << line << endl;
532     }
533     (*cnt)++;
534     return;
535   }
536 
read_from_file(const string & file_name)537   bool Document::read_from_file( const string& file_name ){
538     /// read a FoLiA document from a file
539     /*!
540       \param file_name the name of the file
541       \return true on succes. Will throw otherwise.
542 
543       This function also takes care of files in .bz2 or .gz format when the
544       right extension is given.
545     */
546     ifstream is( file_name );
547     if ( !is.good() ){
548       throw invalid_argument( "file not found: " + file_name );
549     }
550     if ( foliadoc ){
551       throw logic_error( "Document is already initialized" );
552     }
553     _source_filename = file_name;
554     if ( TiCC::match_back( file_name, ".bz2" ) ){
555       string buffer = TiCC::bz2ReadFile( file_name );
556       return read_from_string( buffer );
557     }
558     int cnt = 0;
559     xmlSetStructuredErrorFunc( &cnt, (xmlStructuredErrorFunc)error_sink );
560     _xmldoc = xmlReadFile( file_name.c_str(),
561 			   0,
562 			   XML_PARSER_OPTIONS );
563     if ( _xmldoc ){
564       if ( cnt > 0 ){
565 	throw XmlError( "document is invalid" );
566       }
567       if ( debug ){
568 	cout << "read a doc from " << file_name << endl;
569       }
570       foliadoc = parseXml();
571       if ( !validate_offsets() ){
572 	// cannot happen. validate_offsets() throws on error
573 	throw InconsistentText("MEH");
574       }
575       if ( debug ){
576 	if ( foliadoc ){
577 	  cout << "successful parsed the doc from: " << file_name << endl;
578 	}
579 	else {
580 	  cout << "failed to parse the doc from: " << file_name << endl;
581 	}
582       }
583       xmlFreeDoc( _xmldoc );
584       _xmldoc = 0;
585       return foliadoc != 0;
586     }
587     if ( debug ){
588       cout << "Failed to read a doc from " << file_name << endl;
589     }
590     throw XmlError( "No valid FoLiA read" );
591   }
592 
read_from_string(const string & buffer)593   bool Document::read_from_string( const string& buffer ){
594     /// read a FoLiA Document from a string buffer
595     /*!
596       \param buffer A complete FoLiA document in a string buffer
597       \return true on succes. Will throw otherwise.
598      */
599     if ( foliadoc ){
600       throw logic_error( "Document is already initialized" );
601       return false;
602     }
603     int cnt = 0;
604     xmlSetStructuredErrorFunc( &cnt, (xmlStructuredErrorFunc)error_sink );
605     _xmldoc = xmlReadMemory( buffer.c_str(), buffer.length(), 0, 0,
606 			     XML_PARSER_OPTIONS );
607     if ( _xmldoc ){
608       if ( cnt > 0 ){
609 	throw XmlError( "document is invalid" );
610       }
611       if ( debug ){
612 	cout << "read a doc from string" << endl;
613       }
614       foliadoc = parseXml();
615       if ( !validate_offsets() ){
616 	// cannot happen. validate_offsets() throws on error
617 	throw InconsistentText("MEH");
618       }
619       if ( debug ){
620 	if ( foliadoc ){
621 	  cout << "successful parsed the doc" << endl;
622 	}
623 	else{
624 	  cout << "failed to parse the doc" << endl;
625 	}
626       }
627       xmlFreeDoc( _xmldoc );
628       _xmldoc = 0;
629       return foliadoc != 0;
630     }
631     if ( debug ){
632       throw runtime_error( "Failed to read a doc from a string" );
633     }
634     return false;
635   }
636 
operator <<(ostream & os,const Document * d)637   ostream& operator<<( ostream& os, const Document *d ){
638     /// output a Document to a stream
639     /*!
640       \param os the output stream
641       \param d the document to output
642      */
643     if ( d ){
644       os << d->toXml( "" );
645       // the toXml() string already ends with a newline (i hope....)
646       // but flush the stream
647       os.flush();
648     }
649     else {
650       os << "MISSING DOCUMENT" << endl;
651     }
652     return os;
653   }
654 
save(ostream & os,const string & ns_label,bool canonical) const655   bool Document::save( ostream& os,
656 		       const string& ns_label,
657 		       bool canonical ) const {
658     /// save the Document to a stream
659     /*!
660       \param os the output stream
661       \param ns_label the namespace name to use, the default is "" placing all
662       FoLiA nodes in the default namespace.
663       \param canonical determines to output in canonical order. Default is no.
664     */
665     bool old_k = set_canonical(canonical);
666     os << toXml( ns_label );
667     // the toXml() string already ends with a newline (i hope....)
668     // but flush the stream
669     os.flush();
670     set_canonical(old_k);
671     return os.good();
672   }
673 
save(const string & file_name,const string & ns_label,bool canonical) const674   bool Document::save( const string& file_name,
675 		       const string& ns_label,
676 		       bool canonical ) const {
677     /// save the Document to a file
678     /*!
679       \param file_name the name of the file to create
680       \param ns_label the namespace name to use, the default is "" placing all
681       FoLiA nodes in the default namespace.
682       \param canonical determines to output in canonical order. Default is no.
683 
684       This function also takes care of output to files in .bz2 or .gz format
685       when the right extension is given.
686     */
687     bool old_k = set_canonical(canonical);
688     bool result = false;
689     try {
690       result = toXml( file_name, ns_label );
691     }
692     catch ( const exception& e ){
693       throw runtime_error( "saving to file " + file_name + " failed: " + e.what() );
694     }
695     set_canonical( old_k );
696     return result;
697   }
698 
xmlstring(bool canonical) const699   string Document::xmlstring( bool canonical ) const {
700     /// dump the Document in a string buffer
701     /*!
702       \param canonical determines to output in canonical order. Default is no.
703       \return the complete document in an unformatted string
704     */
705     bool old_k = set_canonical(canonical);
706     xmlDoc *outDoc = to_xmlDoc( "" );
707     set_canonical(old_k);
708     xmlChar *buf; int size;
709     xmlDocDumpFormatMemoryEnc( outDoc, &buf, &size,
710 			       output_encoding, 0 ); // no formatting
711     string result = string( (const char *)buf, size );
712     xmlFree( buf );
713     xmlFreeDoc( outDoc );
714     _foliaNsOut = 0;
715     return result;
716   }
717 
index(const string & id) const718   FoliaElement* Document::index( const string& id ) const {
719     /// search for the element with xml:id id
720     /*!
721       \param id the id we search
722       \return the FoliaElement with this \e id or 0, when not present
723      */
724     const auto& it = sindex.find( id );
725     if ( it == sindex.end() ){
726       return 0;
727     }
728     else {
729       return it->second;
730     }
731   }
732 
operator [](const string & id) const733   FoliaElement* Document::operator []( const string& id ) const {
734     /// search for the element with xml:id id
735     /*!
736       \param id the id we search
737       \return the FoliaElement with this \e id or 0, when not present
738 
739       example:
740 
741       FoliaElement *e = doc["doc.sent.1"];
742       when Document doc has a node with id="doc.sent.1", \e e refer that node
743       otherwise \e e will be set to 0;
744     */
745     return index(id);
746   }
747 
text(const TextPolicy & tp) const748   UnicodeString Document::text( const TextPolicy& tp ) const {
749     /// return the text content of the whole document, restricted by the
750     /// parameters.
751     /*!
752       \param tp The TextPolicy to use
753       \return the complete text matching the criteria as an UnicodeString
754      */
755     return foliadoc->text( tp );
756   }
757 
text(const std::string & cls,bool retaintok,bool strict) const758   UnicodeString Document::text( const std::string& cls,
759 				bool retaintok,
760 				bool strict ) const {
761     /// return the text content of the whole document, restricted by the
762     /// parameters.
763     /*!
764       \param cls The textclass to use fro searching.
765       \param retaintok Should we retain the tokenization. Default NO.
766       \param strict Should we perform a strict search? Default NO.
767       \return the complete text matching the criteria as an UnicodeString
768      */
769     TEXT_FLAGS flags = TEXT_FLAGS::NONE;
770     if ( retaintok ){
771       flags = flags | TEXT_FLAGS::RETAIN;
772     }
773     if ( strict ){
774       flags = flags | TEXT_FLAGS::STRICT;
775     }
776     return foliadoc->text( cls, flags );
777   }
778 
779   static const set<ElementType> quoteSet = { Quote_t };
780   static const set<ElementType> emptySet;
781 
sentences() const782   vector<Sentence*> Document::sentences() const {
783     /// return all Sentences in the Document, except those in Quotes
784     return foliadoc->select<Sentence>( quoteSet );
785   }
786 
sentenceParts() const787   vector<Sentence*> Document::sentenceParts() const {
788     /// return all Sentences in the Document, including those in Quotes
789     vector<Sentence*> sents = foliadoc->select<Sentence>( emptySet );
790     return sents;
791   }
792 
sentences(size_t index) const793   Sentence *Document::sentences( size_t index ) const {
794     /// return the Sentence at position \e index
795     /*!
796       \param index  the index to search for
797       \return The Sentence found.
798       will throw when the index is out of range
799     */
800     vector<Sentence*> v = sentences();
801     if ( index < v.size() ){
802       return v[index];
803     }
804     throw range_error( "sentences() index out of range" );
805   }
806 
rsentences(size_t index) const807   Sentence *Document::rsentences( size_t index ) const {
808     /// return the Sentence at position \e index from the back of the Document
809     /*!
810       \param index  the index to search for
811       \return The Sentence found.
812       will throw when the index is out of range
813     */
814     vector<Sentence*> v = sentences();
815     if ( index < v.size() ){
816       return v[v.size()-1-index];
817     }
818     throw range_error( "rsentences() index out of range" );
819   }
820 
words() const821   vector<Word*> Document::words() const {
822     /// return all the Words in the Document, ignoring those within structure
823     /// annotations
824     /*!
825       \return The Words found.
826     */
827     return foliadoc->select<Word>( default_ignore_structure );
828   }
829 
words(size_t index) const830   Word *Document::words( size_t index ) const {
831     /// return the Word at position \e index, ignoring those within structure
832     /// annotations
833     /*!
834       \param index the index to search for
835       \return The Word found.
836       will throw when the index is out of range
837     */
838     vector<Word*> v = words();
839     if ( index < v.size() ){
840       return v[index];
841     }
842     throw range_error( "words() index out of range" );
843   }
844 
rwords(size_t index) const845   Word *Document::rwords( size_t index ) const {
846     /// return the Word at position \e index from the back of the Document,
847     /// ignoring those within structure annotations
848     /*!
849       \param index the index to search for
850       \return The Word found.
851       will throw when the index is out of range
852     */
853     vector<Word*> v = words();
854     if ( index < v.size() ){
855       return v[v.size()-1-index];
856     }
857     throw range_error( "rwords() index out of range" );
858   }
859 
paragraphs() const860   vector<Paragraph*> Document::paragraphs() const {
861     /// return all Paragraphs in the Document
862     return foliadoc->select<Paragraph>();
863   }
864 
paragraphs(size_t index) const865   Paragraph *Document::paragraphs( size_t index ) const {
866     /// return the Paragraph at position \e index
867     /*!
868       \param index the index to search for
869       \return The Paragraph found.
870       will throw when the index is out of range
871     */
872     vector<Paragraph*> v = paragraphs();
873     if ( index < v.size() ){
874       return v[index];
875     }
876     throw range_error( "paragraphs() index out of range" );
877   }
878 
rparagraphs(size_t index) const879   Paragraph *Document::rparagraphs( size_t index ) const {
880     /// return the Word at position \e index from the back of the Document
881     /*!
882       \param index the index to search for
883       \return The Paragraph found.
884       will throw when the index is out of range
885     */
886     vector<Paragraph*> v = paragraphs();
887     if ( index < v.size() ){
888       return v[v.size()-1-index];
889     }
890     throw range_error( "rparagraphs() index out of range" );
891   }
892 
language() const893   string Document::language() const {
894     /// extract the language from the metadata
895     /*!
896       \return the metadata language value or "" when not set
897     */
898     string result;
899     if ( _metadata ){
900       result = _metadata->get_val("language");
901     }
902     return result;
903   }
904 
metadata_type() const905   string Document::metadata_type() const {
906     /// returns the metadata type
907     /*!
908       \return the metadata type or "native" when not set
909     */
910     if ( _metadata ){
911       return _metadata->type();
912     }
913     else if ( _foreign_metadata ){
914       return _foreign_metadata->type();
915     }
916     return "native";
917   }
918 
metadata_file() const919   string Document::metadata_file() const {
920     /// returns the metadata filename. if any
921     /*!
922       \return the metadata file name.
923     */
924     if ( _metadata ){
925       if ( _metadata->datatype() != "ExternalMetaData" ){
926 	return "";
927       }
928       return _metadata->src();
929     }
930     return "";
931   }
932 
parse_imdi(const xmlNode * node)933   void Document::parse_imdi( const xmlNode *node ){
934     /// set IMDI values. DEPRECATED
935     xmlNode *n = TiCC::xPath( node, "//imdi:Session/imdi:Title" );
936     if ( n ){
937       _metadata->add_av( "title", TiCC::XmlContent( n ) );
938     }
939     n = TiCC::xPath( node, "//imdi:Session/imdi:Date" );
940     if ( n ){
941       _metadata->add_av( "date", TiCC::XmlContent( n ) );
942     }
943     n = TiCC::xPath( node, "//imdi:Source/imdi:Access/imdi:Publisher" );
944     if ( n ){
945       _metadata->add_av( "publisher", TiCC::XmlContent( n ) );
946     }
947     n = TiCC::xPath( node, "//imdi:Source/imdi:Access/imdi:Availability" );
948     if ( n ){
949       _metadata->add_av( "licence", TiCC::XmlContent( n ) );
950     }
951     n = TiCC::xPath( node, "//imdi:Languages/imdi:Language/imdi:ID" );
952     if ( n ){
953       _metadata->add_av( "language", TiCC::XmlContent( n ) );
954     }
955   }
956 
set_metadata(const string & attribute,const string & value)957   void Document::set_metadata( const string& attribute, const string& value ){
958     /// add a metadata attribute/value pair to the Document
959     /*!
960       \param attribute the attribute to set
961       \param value the value of the attribute
962 
963       Will throw if the current metadata is NOT 'native'
964 
965       May create a new NativeMetaData structure.
966      */
967     if ( !_metadata ){
968       _metadata = new NativeMetaData( "native" );
969     }
970     else if ( _metadata->datatype() == "ExternalMetaData" ){
971       throw MetaDataError( "cannot set meta values on ExternalMetaData" );
972     }
973     if ( _metadata->type() == "imdi" ){
974       throw MetaDataError( "cannot set meta values on IMDI MetaData" );
975     }
976     _metadata->add_av( attribute, value );
977   }
978 
get_metadata(const string & attribute) const979   const string Document::get_metadata( const string& attribute ) const {
980     /// return the metadata value for a metadata attribute
981     /*!
982       \param attribute the attribite to lookup
983       \return the requested metadata value. May return "" if no metadata is
984       available or the attribute is not found.
985      */
986     if ( _metadata ){
987       return _metadata->get_val( attribute );
988     }
989     else {
990       return "";
991     }
992   }
993 
get_default_processor() const994   processor *Document::get_default_processor() const {
995     /// return the default processor for this document
996     /*!
997       \return the main processor in the provenance data. can be 0;
998      */
999     if ( _provenance ){
1000       return _provenance->get_top_processor();
1001     }
1002     else {
1003       return 0;
1004     }
1005   }
1006 
get_processor(const string & pid) const1007   processor *Document::get_processor( const string& pid ) const {
1008     /// return the processor with ID=pid
1009     /*!
1010       \param pid the processorID we look for
1011       \return the processor found, or 0
1012     */
1013     if ( _provenance ){
1014       return _provenance->get_processor_by_id( pid );
1015     }
1016     else {
1017       return 0;
1018     }
1019   }
1020 
get_processors_by_name(const string & name) const1021   vector<processor*> Document::get_processors_by_name( const string& name ) const {
1022     /// return all the processor with name=name
1023     /*!
1024       \param name the name of the processors we look for
1025       \return al list of matching processors
1026     */
1027     vector<processor*> result;
1028     if ( _provenance ){
1029       result = _provenance->get_processors_by_name( name );
1030     }
1031     return result;
1032   }
1033 
add_processor(const KWargs & args,processor * parent)1034   processor *Document::add_processor( const KWargs& args,
1035 				      processor *parent ){
1036     /// create new processor and add it to the provenance data
1037     /*!
1038       \param args the argument list for creating the new provessor
1039       \param parent add the new processor as a child to this parent.
1040       When the parent = 0, add to the Documents provenance structure.
1041 
1042       May create a new Provenance structure if not yet available.
1043     */
1044     if ( debug ){
1045       cerr << "ADD_PROCESSOR: " << args << endl;
1046     }
1047     if ( !parent
1048 	 && !_provenance ){
1049       _provenance = new Provenance(this);
1050     }
1051     processor *p = new processor( _provenance, parent, args );
1052     if ( parent ){
1053       parent->_processors.push_back( p );
1054     }
1055     else {
1056       _provenance->processors.push_back( p );
1057     }
1058     return p;
1059   }
1060 
set_foreign_metadata(xmlNode * node)1061   void Document::set_foreign_metadata( xmlNode *node ){
1062     /// create a ForeigMetaData element from 'node'
1063     /*!
1064       \param node the xml node we are parsing
1065 
1066       FoLiA treats foreign metadata by adding a copy of the xml tree under node
1067       to the folia, without further notice.
1068     */
1069     if ( !_foreign_metadata ){
1070       _foreign_metadata = new ForeignMetaData( "foreign" );
1071     }
1072     ForeignData *add = new ForeignData();
1073     if ( TiCC::Name( node ) != "foreign-data" ){
1074       // we need an extra layer then
1075       xmlNode *n = TiCC::XmlNewNode( "foreign-data" );
1076       xmlAddChild( n, xmlCopyNode( node, 1 ) );
1077       add->set_data( n );
1078       _foreign_metadata->add_foreign( n );
1079       xmlFreeNode (n );
1080     }
1081     else {
1082       add->set_data( node );
1083       _foreign_metadata->add_foreign( node );
1084     }
1085   }
1086 
save_orig_ann_defaults()1087   void Document::save_orig_ann_defaults(){
1088     /// make a copy of the _annotationdefaults
1089     /*!
1090       For incremental document creation (using folia::Engine) we need to
1091       'remember' which annotationdefaults there were initially, so before
1092       any new annotations are added with declare().
1093 
1094       But we only need those that would return a default annotation or
1095       default processor.
1096      */
1097     for ( const auto& it : _annotationdefaults ){
1098       if ( it.second.size() == 1 ){
1099 	// so 1 set
1100 	_orig_ann_default_sets.insert( make_pair(it.first,it.second.begin()->first) );
1101 	auto procs = it.second.begin()->second._processors;
1102 	if ( procs.size() == 1 ){
1103 	  _orig_ann_default_procs.insert( make_pair(it.first,*procs.begin()) );
1104 	}
1105       }
1106     }
1107   }
1108 
parse_annotations(const xmlNode * node)1109   void Document::parse_annotations( const xmlNode *node ){
1110     /// parse all annotation declarations from the Xml tree given by node
1111     if ( debug ){
1112       cerr << "parse annotations " << TiCC::Name(node) << endl;
1113     }
1114     xmlNode *n = node->children;
1115     _anno_sort.clear();
1116     while ( n ){
1117       string tag = TiCC::Name( n );
1118       if ( tag.length() > 11 && tag.substr( tag.length() - 11 ) == "-annotation" ){
1119 	string prefix = tag.substr( 0,  tag.length() - 11 );
1120 	AnnotationType at_type
1121 	  = TiCC::stringTo<AnnotationType>( prefix );
1122 	if ( debug ){
1123 	  cerr << "parse " << prefix << "-annotation" << endl;
1124 	}
1125 	KWargs atts = getAttributes( n );
1126 	ElementType et = BASE;
1127 	string set_name = atts.extract("set" );
1128 	if ( set_name.empty() ){
1129 	  if ( version_below( 1, 6 ) ){
1130 	    set_name = "undefined"; // default value
1131 	  }
1132 	  else if ( at_type == AnnotationType::TEXT ){
1133 	    if ( debug ){
1134 	      cerr << "assign default for TEXT: " <<  DEFAULT_TEXT_SET << endl;
1135 	    }
1136 	    set_name = DEFAULT_TEXT_SET;
1137 	  }
1138 	  else if ( at_type == AnnotationType::PHON ){
1139 	    if ( debug ){
1140 	      cerr << "assign default for PHON: " <<  DEFAULT_PHON_SET << endl;
1141 	    }
1142 	    set_name = DEFAULT_PHON_SET;
1143 	  }
1144 	  else {
1145 	    auto et_it = annotationtype_elementtype_map.find( at_type );
1146 	    if ( et_it == annotationtype_elementtype_map.end() ){
1147 	      throw logic_error( "no matching element_type for annotation_type: "
1148 				 + prefix );
1149 	    }
1150 	    et = et_it->second;
1151 	    properties *prop = element_props[et];
1152 	    if ( prop->REQUIRED_ATTRIBS & Attrib::CLASS ) {
1153 	      throw XmlError( "setname may not be empty for " + prefix
1154 			      + "-annotation" );
1155 	    }
1156 	  }
1157 	}
1158 	if ( set_name.empty() ){
1159 	  set_name = "None";
1160 	}
1161 	string format = atts.extract( "format" );
1162 	string annotator = atts.extract( "annotator" );
1163 	string ann_type = atts.extract( "annotatortype" );
1164 	string datetime = parseDate( atts.extract( "datetime" ) );
1165 	string alias = atts.extract( "alias" );
1166 	string gran_val = atts.extract( "groupannotations" );
1167 	if ( !gran_val.empty() ){
1168 	  if ( !isSubClass( et, AbstractSpanAnnotation_t ) ){
1169 	    throw XmlError( "attribute 'groupannotations' not allowed for '"
1170 			    + prefix + "-annotation" );
1171 	  }
1172 	  if ( gran_val == "yes"
1173 	       || gran_val == "true" ){
1174 	    _groupannotations[at_type][set_name] = true;
1175 	  }
1176 	  else {
1177 	    throw XmlError( "invalid value '" + gran_val
1178 			    + "' for attribute groupannotations" );
1179 	  }
1180 	}
1181 	else {
1182 	  _groupannotations[at_type][set_name] = false;
1183 	}
1184 	set<string> processors;
1185 	xmlNode *sub = n->children;
1186 	while ( sub ){
1187 	  string subtag = TiCC::Name( sub );
1188 	  if ( debug ){
1189 	    cerr << "parse subtag:" << subtag << endl;
1190 	  }
1191 	  if ( subtag == "annotator" ){
1192 	    KWargs args = getAttributes( sub );
1193 	    if ( args["processor"].empty() ){
1194 	      throw XmlError( tag + "-annotation: <annotator> misses attribute 'processor'" );
1195 	    }
1196 	    processors.insert( args["processor"] );
1197 	  }
1198 	  sub = sub->next;
1199 	}
1200 	if ( !annotator.empty() && !processors.empty() ){
1201 	  throw XmlError( tag + "-annotation: has both <annotator> node(s) and annotator attribute." );
1202 	}
1203 	declare( at_type, set_name, format, annotator, ann_type, datetime,
1204 		 processors, alias );
1205 	if ( !atts.empty() ){
1206 
1207 	  throw XmlError( "found invalid attribute(s) in <" + prefix
1208 			  + "-declaration> " + atts.toString() );
1209 	}
1210       }
1211       n = n->next;
1212     }
1213     if ( debug ){
1214       cerr << "all group annotations: " << _groupannotations << endl;
1215       cerr << "done with parse_annotation: " << _annotationdefaults << endl;
1216       cerr << "sorting: " << _anno_sort << endl;
1217     }
1218   }
1219 
parse_provenance(const xmlNode * node)1220   void Document::parse_provenance( const xmlNode *node ){
1221     /// parse provenance data from the XmlTree under node
1222     Provenance *result = new Provenance(this);
1223     xmlNode *n = node->children;
1224     while ( n ){
1225       string tag = TiCC::Name( n );
1226       if ( tag == "processor" ){
1227 	result->parse_processor(n);
1228       }
1229       n = n->next;
1230     }
1231     _provenance = result;
1232     //    cerr << "provenance=" << _provenance << endl;
1233   }
1234 
parse_submeta(const xmlNode * node)1235   void Document::parse_submeta( const xmlNode *node ){
1236     /// parse sub metadata from the XmlTree under node
1237     if ( node ){
1238       KWargs node_att = getAttributes( node );
1239       string id = node_att["xml:id"];
1240       if ( id.empty() ){
1241 	throw MetaDataError( "submetadata without xml:id" );
1242       }
1243       //      cerr << "parse submetadata, id=" << id << endl;
1244       string type = node_att["type"];
1245       //      cerr << "parse submetadata, type=" << type << endl;
1246       if ( type.empty() ){
1247 	type = "native";
1248       }
1249       string src = node_att["src"];
1250       if ( !src.empty() ){
1251 	submetadata[id] = new ExternalMetaData( type, src );
1252 	//	cerr << "created External metadata, id=" << id << endl;
1253       }
1254       else if ( type == "native" ){
1255 	submetadata[id] = new NativeMetaData( type );
1256 	//	cerr << "created Native metadata, id=" << id << endl;
1257       }
1258       else {
1259 	submetadata[id] = 0;
1260 	//	cerr << "set metadata to 0, id=" << id << endl;
1261       }
1262       xmlNode *p = node->children;
1263       while ( p ){
1264 	if ( p->type == XML_ELEMENT_NODE ){
1265 	  if ( TiCC::Name(p) == "meta" &&
1266 	       checkNS( p, NSFOLIA ) ){
1267 	    if ( type == "native" ){
1268 	      string txt = TiCC::XmlContent( p );
1269 	      KWargs att = getAttributes( p );
1270 	      string sid = att["id"];
1271 	      if ( !txt.empty() ){
1272 		submetadata[id]->add_av( sid, txt );
1273 		// cerr << "added node to id=" << id
1274 		//      << "(" << sid << "," << txt << ")" << endl;
1275 	      }
1276 	    }
1277 	    else {
1278 	      throw MetaDataError("Encountered a meta element but metadata type is not native!");
1279 	    }
1280 	  }
1281 	  else if ( TiCC::Name(p) == "foreign-data" &&
1282 		    checkNS( p, NSFOLIA ) ){
1283 	    if ( type == "native" ){
1284 	      throw MetaDataError("Encountered a foreign-data element but metadata type is native!");
1285 	    }
1286 	    else if ( submetadata[id] == 0 ){
1287 	      submetadata[id] = new ForeignMetaData( type );
1288 	      //	      cerr << "add new Foreign " << id << endl;
1289 	    }
1290 	    //	    cerr << "in  Foreign " << submetadata[id]->type() << endl;
1291 	    submetadata[id]->add_foreign( p );
1292 	    //	    cerr << "added a foreign id=" << id << endl;
1293 	  }
1294 	}
1295 	p = p->next;
1296       }
1297     }
1298   }
1299 
is_number(const string & s)1300   bool is_number( const string& s ){
1301     /// check that every character in s is a digit
1302     for ( const auto& c : s ){
1303       if ( !isdigit(c) ){
1304 	return false;
1305       }
1306     }
1307     return true;
1308   }
1309 
expand_version_string(const string & vs,int & major,int & minor,int & sub,string & patch)1310   void expand_version_string( const string& vs,
1311 			      int& major,
1312 			      int& minor,
1313 			      int& sub,
1314 			      string& patch ){
1315     /// expand a version string vs into ints major, minor and sub
1316     /*!
1317       \param[in] vs A string holding version information
1318       \param[out] major the major version found
1319       \param[out] minor the minor version found
1320       \param[out] sub the sub version found
1321       \param[out] patch the NON-numeric remainder of vs after parsing
1322 
1323       examples:
1324 
1325       "2.1" ==> major=2, minor=1
1326 
1327       "2.0.3-a" ==> major=2, minor=0, sub=3 patch=-a
1328      */
1329     major = 0;
1330     minor = 0;
1331     sub = 0;
1332     patch.clear();
1333     vector<string> vec = TiCC::split_at( vs, ".", 3 );
1334     for ( size_t i=0; i < vec.size(); ++i ){
1335       if ( i == 0 ){
1336 	int val = 0;
1337 	if ( !TiCC::stringTo( vec[i], val ) ){
1338 	  throw XmlError( "unable to extract major-version from: " + vs );
1339 	}
1340 	major= val;
1341       }
1342       else if ( i == 1 ){
1343 	int val = 0;
1344 	if ( !TiCC::stringTo( vec[i], val ) ){
1345 	  throw XmlError( "unable to extract minor-version from: " + vs );
1346 	}
1347 	minor = val;
1348       }
1349       else if ( i == 2 ){
1350 	if ( is_number( vec[i] ) ){
1351 	  TiCC::stringTo( vec[i], sub );
1352 	}
1353 	else {
1354 	  vector<string> v2 = TiCC::split_at( vec[i], "-", 2 );
1355 	  if ( v2.size() != 2 ){
1356 	    throw XmlError( "invalid sub-version or patch-version in: " + vs );
1357 	  }
1358 	  else {
1359 	    int val = 0;
1360 	    if ( !TiCC::stringTo( v2[0], val ) ){
1361 	      throw XmlError( "unable to extract sub-version from: " + vs );
1362 	    }
1363 	    sub = val;
1364 	    patch = "-" + v2[1]; // include the hyphen
1365 	  }
1366 	}
1367       }
1368     }
1369   }
1370 
check_version(const string & vers)1371   int check_version( const string& vers ){
1372     /// check a version given by 'vers' against the current build
1373     /*!
1374       \param vers a version string (like "2.1.5")
1375       \return 0 when major, minor AND sub version are equal, -1 when the version
1376       is lower and 1 when the version is greater then the current build
1377 
1378      */
1379     int maj = 0;
1380     int min = 0;
1381     int sub = 0;
1382     string patch;
1383     expand_version_string( vers, maj, min, sub, patch );
1384     if ( maj < MAJOR_VERSION ){
1385       return -1;
1386     }
1387     else if ( maj > MAJOR_VERSION ){
1388       return 1;
1389     }
1390     else if ( min < MINOR_VERSION ){
1391       return -1;
1392     }
1393     else if ( min > MINOR_VERSION ){
1394       return 1;
1395     }
1396     else if ( sub < SUB_VERSION ){
1397       return -1;
1398     }
1399     else if ( sub > SUB_VERSION ){
1400       return 1;
1401     }
1402     return 0;
1403   }
1404 
compare_to_build_version() const1405   int Document::compare_to_build_version() const {
1406     /// check the version of the document against the build version
1407     /*!
1408       \return 0 when the versions match, -1 when the document version
1409       is lower and 1 when the version is greater then the current build
1410     */
1411     return check_version( version() );
1412   }
1413 
version_below(int major,int minor) const1414   bool Document::version_below( int major, int minor ) const {
1415     /// check if current document version is strict lower then asked
1416     /*!
1417       \param major the major version we want
1418       \param minor the minor version we want
1419       \return true when the Document's major version is lower than mjor OR
1420       it is equal, but the Document's minor version is lower than minor.
1421     */
1422     if ( _major_version < major ){
1423       return true;
1424     }
1425     else if ( _major_version == major ){
1426       return _minor_version < minor;
1427     }
1428     return false;
1429   }
1430 
adjustTextMode()1431   void Document::adjustTextMode(){
1432     /// set the text checking mode of the Document based on an environment
1433     /// variable and the document version
1434     /*!
1435       When the FOLIA_TEXT_CHECK environment variable is set to YES or NO then
1436       set the CHECKTEXT mode accordingly.
1437 
1438       When the document version is below 1.5 we disable CHECKTEXT except when
1439       FIXTEXT is also set.
1440      */
1441     const char *env = getenv( "FOLIA_TEXT_CHECK" );
1442     if ( env ){
1443       string e = env;
1444       delete env;
1445       cerr << "DETECTED FOLIA_TEXT_CHECK environment variable, value ='"
1446 	   << e << "'"<< endl;
1447       if ( e == "NO" ){
1448 	mode = Mode( int(mode) & ~CHECKTEXT );
1449 	cerr << "FOLIA_TEXT_CHECK disabled" << endl;
1450       }
1451       else if ( e == "YES" ){
1452 	mode = Mode( int(mode) | CHECKTEXT );
1453 	cerr << "FOLIA_TEXT_CHECK enabled" << endl;
1454       }
1455       else {
1456 	cerr << "FOLIA_TEXT_CHECK unchanged:" << (checktext()?"YES":"NO")
1457 	     << endl;
1458       }
1459     }
1460     if ( !( mode & FIXTEXT) && version_below( 1, 5 ) ){
1461       // don't check text consistency for older documents
1462       mode = Mode( int(mode) & ~CHECKTEXT );
1463     }
1464   }
1465 
setDocumentProps(KWargs & kwargs)1466   void Document::setDocumentProps( KWargs& kwargs ){
1467     /// set general properties based on an attribute-value list
1468     /*!
1469       \param kwargs the arguments. Normally these are parsed attributes from
1470       \<FoLiA\> node.
1471       Even with an empty kwarg list, at least the version of the document is
1472       set. We use a special value (1.4.987) to signal that is was not
1473       specified.
1474      */
1475     string value = kwargs.extract( "version" );
1476     if ( !value.empty() ){
1477       _version_string = value;
1478       //      cerr << "So we found version " << _version_string << endl;
1479     }
1480     else {
1481       // assign a 'random' version, but PRE 1.5
1482       _version_string = "1.4.987";
1483       //      cerr << "NO VERSION version " << _version_string << endl;
1484     }
1485     expand_version_string( _version_string,
1486 			   _major_version,
1487 			   _minor_version,
1488 			   _sub_version,
1489 			   _patch_version );
1490     if ( check_version( _version_string ) > 0 ){
1491       cerr << "WARNING!!! the Document "
1492 	   << (_source_filename.empty()?"":"'")
1493 	   << _source_filename
1494 	   << (_source_filename.empty()?"":"' ")
1495 	   << "is created for newer FoLiA version than this library ("
1496 	   << _version_string << " vs " << folia_version()
1497 	   << ")\n\t Any possible subsequent failures in parsing or processing may probably be attributed to this." << endl
1498 	   << "\t Please upgrade libfolia!" << endl;
1499       increment_warn_count();
1500     }
1501 
1502     adjustTextMode();
1503     value = kwargs.extract( "external" );
1504     if ( !value.empty() ){
1505       _external_document = TiCC::stringTo<bool>( value );
1506     }
1507     else {
1508       _external_document = false;
1509     }
1510     bool happy = false;
1511     value = kwargs.extract( "_id" ); // for backward compatibility
1512     if ( value.empty() ){
1513       value = kwargs.extract( "xml:id" );
1514     }
1515     if ( !value.empty() ){
1516       if ( isNCName( value ) ){
1517 	_id = value;
1518       }
1519       else {
1520 	throw XmlError( "'" + value + "' is not a valid NCName." );
1521       }
1522       happy = true;
1523       kwargs["xml:id"] = value;
1524     }
1525     if ( !foliadoc && !happy ){
1526       throw runtime_error( "No Document ID specified" );
1527     }
1528     kwargs.erase( "generator" ); // also delete this unused att-val
1529     kwargs.erase( "form" ); //silently discard form attribute (for normal vs explicit form), we should be able to read either fine
1530   }
1531 
resolveExternals()1532   void Document::resolveExternals(){
1533     /// resolve all external references
1534     /*!
1535       external references are stored during parsing in the _externals array
1536      */
1537     if ( !_externals.empty() ){
1538       for ( const auto& ext : _externals ){
1539 	ext->resolve_external();
1540       }
1541     }
1542   }
1543 
parse_metadata(const xmlNode * node)1544   void Document::parse_metadata( const xmlNode *node ){
1545     /// parse metadata information from the XmlTree under node
1546     KWargs atts = getAttributes( node );
1547     string type = TiCC::lowercase(atts["type"]);
1548     if ( type.empty() ){
1549       type = "native";
1550     }
1551     string src = atts["src"];
1552     if ( !src.empty() ){
1553       _metadata = new ExternalMetaData( type, src );
1554     }
1555     else if ( type == "native" || type == "imdi" ){
1556       _metadata = new NativeMetaData( type );
1557     }
1558     xmlNode *m = node->children;
1559     xmlNode *a_node = 0;
1560     while ( m ){
1561       if ( TiCC::Name(m)  == "METATRANSCRIPT" ){
1562 	if ( !checkNS( m, NSIMDI ) || type != "imdi" ){
1563 	  throw runtime_error( "imdi != imdi " );
1564 	}
1565 	if ( debug > 1 ){
1566 	  cerr << "found IMDI" << endl;
1567 	}
1568 	if ( !_foreign_metadata ){
1569 	  _foreign_metadata = new ForeignMetaData( "imdi" );
1570 	}
1571 	_foreign_metadata->add_foreign( xmlCopyNode(m,1) );
1572       }
1573       else if ( TiCC::Name( m ) == "annotations" &&
1574 		checkNS( m, NSFOLIA ) ){
1575 	if ( debug > 1 ){
1576 	  cerr << "found annotations" << endl;
1577 	}
1578 	// defer parsing until AFTER provenance data
1579 	a_node = m;
1580       }
1581       else if ( TiCC::Name( m ) == "provenance" &&
1582 		checkNS( m, NSFOLIA ) ){
1583 	if ( debug > 1 ){
1584 	  cerr << "found provenance data" << endl;
1585 	}
1586 	parse_provenance( m );
1587 	//	cerr << _provenance << endl;
1588       }
1589       else if ( TiCC::Name( m ) == "meta" &&
1590 		checkNS( m, NSFOLIA ) ){
1591 	if ( debug > 1 ){
1592 	  cerr << "found meta node:" << getAttributes(m) << endl;
1593 	}
1594 	if ( !_metadata ){
1595 	  if ( type == "external" ){
1596 	    throw runtime_error( "cannot add 'meta' nodes to external metadata" );
1597 
1598 	  }
1599 	  _metadata = new NativeMetaData( "native" );
1600 	}
1601 	KWargs att = getAttributes( m );
1602 	string meta_id = att["id"];
1603 	string val = TiCC::XmlContent( m );
1604 	string get = _metadata->get_val( meta_id );
1605 	if ( !get.empty() ){
1606 	  throw runtime_error( "meta tag with id=" + meta_id
1607 			       + " is defined more then once " );
1608 	}
1609 	_metadata->add_av( meta_id, val );
1610       }
1611       else if ( TiCC::Name(m)  == "foreign-data" &&
1612 		checkNS( m, NSFOLIA ) ){
1613 	FoliaElement *t = AbstractElement::createElement( "foreign-data", this );
1614 	if ( t ){
1615 	  t = t->parseXml( m );
1616 	  if ( t ){
1617 	    if ( !_foreign_metadata ){
1618 	      _foreign_metadata = new ForeignMetaData( type );
1619 	    }
1620 	    _foreign_metadata->add_foreign( m );
1621 	  }
1622 	}
1623       }
1624       else if ( TiCC::Name(m)  == "submetadata" &&
1625 		checkNS( m, NSFOLIA ) ){
1626 	parse_submeta( m );
1627       }
1628       m = m->next;
1629     }
1630     if ( a_node ){
1631       //      cerr << "parse deferred annotations" << endl;
1632       parse_annotations( a_node );
1633     }
1634     if ( !_metadata && type == "imdi" ){
1635       // imdi missing all further info
1636       _metadata = new NativeMetaData( type );
1637     }
1638   }
1639 
addStyle(const string & type,const string & href)1640   void Document::addStyle( const string& type, const string& href ){
1641     /// add style-sheet information
1642     /*!
1643       \param type Which type of sheet
1644       \param href the external link for this sheet
1645       We assure that only one "text/xsl" style-sheet is present. All
1646       other style-sheets are silently added as is.
1647      */
1648     if ( type == "text/xsl" ){
1649       const auto& it = styles.find( type );
1650       if ( it != styles.end() ){
1651 	throw XmlError( "multiple 'text/xsl' style-sheets defined." );
1652       }
1653     }
1654     styles.insert( make_pair( type, href ) );
1655   }
1656 
replaceStyle(const string & type,const string & href)1657   void Document::replaceStyle( const string& type,
1658 			       const string& href ){
1659     /// replace a style-sheet
1660     /*!
1661       \param type Which type of sheet
1662       \param href the external link for this sheet
1663 
1664       \note this is sloppy, as multiple sheets with the same type may exist
1665       (except for 'text/xslt') and we replace the first one only.
1666     */
1667     const auto& it = styles.find( type );
1668     if ( it != styles.end() ){
1669       it->second = href;
1670     }
1671     else {
1672       styles.insert( make_pair( type, href ) );
1673     }
1674   }
1675 
parse_styles()1676   void Document::parse_styles(){
1677     /// retrieve all style-sheets from the current XmlTree
1678     xmlNode *pnt = _xmldoc->children;
1679     while ( pnt ){
1680       if ( pnt->type == XML_PI_NODE && TiCC::Name(pnt) == "xml-stylesheet" ){
1681 	string content = TextValue(pnt);
1682 	string type;
1683 	string href;
1684 	vector<string> v = TiCC::split( content );
1685 	if ( v.size() == 2 ){
1686 	  vector<string> w = TiCC::split_at( v[0], "=" );
1687 	  if ( w.size() == 2 && w[0] == "type" ){
1688 	    type = w[1].substr(1,w[1].length()-2);
1689 	  }
1690 	  w = TiCC::split_at( v[1], "=" );
1691 	  if ( w.size() == 2 && w[0] == "href" ){
1692 	    href = w[1].substr(1,w[1].length()-2);
1693 	  }
1694 	}
1695 	if ( !type.empty() && !href.empty() ){
1696 	  addStyle( type, href );
1697 	}
1698 	else {
1699 	  throw XmlError( "problem parsing line: " + content );
1700 	}
1701       }
1702       pnt = pnt->next;
1703     }
1704   }
1705 
fixupNs(xmlNode * p,xmlNs * ns)1706   void fixupNs( xmlNode *p, xmlNs *ns ){
1707     /// make sure that all XmlNodes in the tree p get namespace ns
1708     /*!
1709       \param p an XmlTree (fragment)
1710       \param ns the Namespace value to set
1711       This function is used when a Document uses PERMISSIVE mode
1712      */
1713     while ( p ){
1714       xmlSetNs( p, ns );
1715       fixupNs( p->children, ns );
1716       p = p->next;
1717     }
1718   }
1719 
validate_offsets() const1720   bool Document::validate_offsets() const {
1721     /// Validate all the offset values as found in all \<t\> and \<ph\> nodes
1722     /*!
1723       During Document parsing, \<t\> and \<ph\> nodes are stored in a buffer
1724       until the whole parsing is done.
1725 
1726       Then we are able to examine those nodes in their context and check the
1727       offsets used.
1728      */
1729     set<TextContent*> t_done;
1730     for ( const auto& txt : t_offset_validation_buffer ){
1731       if ( t_done.find( txt ) != t_done.end() ){
1732 	continue;
1733       }
1734       t_done.insert(txt);
1735       int offset = txt->offset();
1736       if ( offset != -1 ){
1737 	try {
1738 	  txt->get_reference();
1739 	}
1740 	catch( const UnresolvableTextContent& e ){
1741 	  string msg = "Text for " + txt->parent()->xmltag() + "(ID="
1742 	    + txt->parent()->id() + ", textclass='" + txt->cls()
1743 	    + "'), has incorrect offset " + TiCC::toString(offset);
1744 
1745 
1746 	  string ref = txt->ref();
1747 	  if ( !ref.empty() ){
1748 	    msg += " or invalid reference:" + ref;
1749 	  }
1750 	  msg += "\n\toriginal msg=";
1751 	  msg += e.what();
1752 
1753           bool warn = false;
1754           try {
1755 	    txt->get_reference(false); //trim_spaces = false
1756 	    msg += "\nHowever, according to the older rules (<v2.4.1) the offsets are accepted. So we are treating this as a warning rather than an error. We do recommend fixing this if this is a document you intend to publish.";
1757 	    warn = true;
1758           } catch ( const UnresolvableTextContent& ) {
1759 	    msg += "\n(also checked against older rules prior to FoLiA v2.4.1)";
1760           }
1761 
1762           if ( warn ){
1763 	    increment_warn_count();
1764 	    cerr << "WARNING: " << msg << endl;
1765 	  }
1766           else {
1767 	    throw UnresolvableTextContent( msg );
1768 	  }
1769 	}
1770       }
1771     }
1772     set<PhonContent*> p_done;
1773     for ( const auto& phon : p_offset_validation_buffer ){
1774       if ( p_done.find( phon ) != p_done.end() ){
1775 	continue;
1776       }
1777       p_done.insert(phon);
1778       int offset = phon->offset();
1779       if ( offset != -1 ){
1780 	try {
1781 	  phon->get_reference();
1782 	}
1783 	catch( const UnresolvableTextContent& e ){
1784 	  string msg = "Phoneme for " + phon->parent()->xmltag() + ", ID="
1785 	    + phon->parent()->id() + ", textclass='" + phon->cls()
1786 	    + "', has incorrect offset " + TiCC::toString(offset);
1787 
1788 
1789 	  string ref = phon->ref();
1790 	  if ( !ref.empty() ){
1791 	    msg += " or invalid reference:" + ref;
1792 	  }
1793 	  msg += "\n\toriginal msg=";
1794 	  msg += e.what();
1795 
1796           bool warn = false;
1797           try {
1798 	    phon->get_reference(false); //trim_spaces = false
1799 	    msg += "\nHowever, according to the older rules (<v2.4.1) the offsets are accepted. So we are treating this as a warning rather than an error. We do recommend fixing this if this is a document you intend to publish.";
1800 	    warn = true;
1801           } catch ( const UnresolvableTextContent& ) {
1802 	    msg += "\n(also checked against older rules prior to FoLiA v2.4.1)";
1803           }
1804 
1805           if (warn){
1806 	    increment_warn_count();
1807 	    cerr << "WARNING: " << msg << endl;
1808 	  }
1809           else {
1810 	    throw UnresolvableTextContent( msg );
1811 	  }
1812 	}
1813       }
1814     }
1815     return true;
1816   }
1817 
parseXml()1818   FoliaElement* Document::parseXml( ){
1819     /// parse a complete FoLiA tree from the XmlTree we have got in _xmldoc
1820     parse_styles();
1821     xmlNode *root = xmlDocGetRootElement( _xmldoc );
1822     if ( root->ns ){
1823       if ( root->ns->prefix ){
1824 	_foliaNsIn_prefix = xmlStrdup( root->ns->prefix );
1825       }
1826       _foliaNsIn_href = xmlStrdup( root->ns->href );
1827     }
1828     if ( debug > 2 ){
1829       string dum;
1830       cerr << "root = " << TiCC::Name( root ) << endl;
1831       cerr << "in namespace " << TiCC::getNS( root, dum ) << endl;
1832       cerr << "namespace list" << getNS_definitions( root ) << endl;
1833     }
1834     FoliaElement *result = 0;
1835     if ( root  ){
1836       if ( TiCC::Name( root ) == "FoLiA" ){
1837 	string ns = TiCC::getNS( root );
1838 	if ( ns.empty() ){
1839 	  if ( permissive() ){
1840 	    _foliaNsIn_href = xmlCharStrdup( NSFOLIA.c_str() );
1841 	    _foliaNsIn_prefix = 0;
1842 	    xmlNs *defNs = xmlNewNs( root,
1843 				     _foliaNsIn_href, _foliaNsIn_prefix );
1844 	    fixupNs( root, defNs );
1845 	  }
1846 	  else {
1847 	    throw XmlError( "Folia Document should have namespace declaration "
1848 			    + NSFOLIA + " but none found " );
1849 	  }
1850 	}
1851 	else if ( ns != NSFOLIA ){
1852 	  throw XmlError( "Folia Document should have namespace declaration "
1853 			  + NSFOLIA + " but found: " + ns );
1854 	}
1855 	try {
1856 	  FoLiA *folia = new FoLiA( this );
1857 	  result = folia->parseXml( root );
1858 	  resolveExternals();
1859 	}
1860 	catch ( const InconsistentText& e ){
1861 	  throw;
1862 	}
1863 	catch ( const XmlError& e ){
1864 	  throw;
1865 	}
1866 	catch ( const exception& e ){
1867 	  throw XmlError( e.what() );
1868 	}
1869       }
1870       else if ( TiCC::Name( root ) == "DCOI" &&
1871 		checkNS( root, NSDCOI ) ){
1872 	throw XmlError( "DCOI format not supported" );
1873       }
1874       else {
1875 	throw XmlError( "root node must be FoLiA" );
1876       }
1877     }
1878     return result;
1879   }
1880 
auto_declare(AnnotationType type,const string & _setname)1881   void Document::auto_declare( AnnotationType type,
1882 			       const string& _setname ) {
1883     /// create a default declaration for the given AnnotationType
1884     /*!
1885       \param type which default do we want to add
1886       \param _setname which setname to add
1887       If _setname is empty, that is used, except for TEXT and PHON, which
1888       have a default setname which is assigned
1889      */
1890     string setname = _setname;
1891     if ( setname.empty() ) {
1892       if ( type == AnnotationType::TEXT ){
1893 	setname = DEFAULT_TEXT_SET;
1894       }
1895       else if ( type == AnnotationType::PHON ){
1896 	setname = DEFAULT_PHON_SET;
1897       }
1898     }
1899     if ( setname.empty() ){
1900       declare( type, "" );
1901     }
1902     else {
1903       declare( type, setname );
1904     }
1905   }
1906 
declare(AnnotationType type,const string & setname,const string & args)1907   void Document::declare( AnnotationType type,
1908 			  const string& setname,
1909 			  const string& args ){
1910     /// Add an annotation declaration
1911     /*!
1912       \param type The AnnotationType for which to add a setname
1913       \param setname The Set name to add
1914       \param args a string representation of an attribute-value list with
1915       additional parameters
1916     */
1917     KWargs kwargs = getArgs( args );
1918     return declare( type, setname, kwargs );
1919   }
1920 
declare(AnnotationType type,const string & setname,const KWargs & _args)1921   void Document::declare( AnnotationType type,
1922 			  const string& setname,
1923 			  const KWargs& _args ){
1924     /// Add an annotation declaration
1925     /*!
1926       \param type The AnnotationType for which to add a setname
1927       \param setname The Set name to add
1928       \param _args an attribute-value list with additional parameters
1929     */
1930     KWargs args = _args;
1931     if ( debug ){
1932       cerr << "declare( " << folia::toString(type) << "," << setname << ", ["
1933 	   << args << "] )" << endl;
1934     }
1935     string st = setname;
1936     if ( st.empty() ){
1937       if ( version_below( 1, 6 ) ){
1938 	st = "undefined";
1939       }
1940       else {
1941 	string prefix = folia::toString(type);
1942 	auto et_it = annotationtype_elementtype_map.find( type );
1943 	if ( et_it == annotationtype_elementtype_map.end() ){
1944 	  throw logic_error( "no matching element_type for annotation_type: "
1945 			     + prefix );
1946 	}
1947 	auto et = et_it->second;
1948 	properties *prop = element_props[et];
1949 	if ( prop->REQUIRED_ATTRIBS & Attrib::CLASS ) {
1950 	  throw XmlError( "setname may not be empty for " + prefix
1951 			  + "-annotation" );
1952 	}
1953       }
1954       if ( st.empty() ){
1955 	st = "None";
1956       }
1957     }
1958     set<string> processors;
1959     string a = args["annotator"];
1960     string t = args["annotatortype"];
1961     string f = args["format"];
1962     string d = args["datetime"];
1963     string alias = args["alias"];
1964     string processor = args["processor"];
1965     if ( !processor.empty() ){
1966       processors.insert( processor );
1967     }
1968     args.erase("annotator");
1969     args.erase("annotatortype");
1970     args.erase("format");
1971     args.erase("datetime");
1972     args.erase("alias");
1973     args.erase("processor");
1974     if ( args.size() != 0 ){
1975       throw XmlError( "declaration: expected 'annotator', 'annotatortype', 'processor', 'alias' or 'datetime', got '" + args.begin()->first + "'" );
1976     }
1977     declare( type, st, f, a, t, d, processors, alias );
1978   }
1979 
unalias(AnnotationType type,const string & alias) const1980   string Document::unalias( AnnotationType type,
1981 			    const string& alias ) const {
1982     /// resolve an alias for a setname to the full setname
1983     /*!
1984       \param type the AnnotationType
1985       \param alias the alias to resolve
1986       \return the setname belonging to alias for this type, or alias if not
1987       found
1988     */
1989     const auto& ti = _alias_set.find(type);
1990     if ( ti != _alias_set.end() ){
1991       const auto& sti = ti->second.find( alias );
1992       if ( sti != ti->second.end() ){
1993 	return sti->second;
1994       }
1995     }
1996     return alias;
1997   }
1998 
alias(AnnotationType type,const string & setname) const1999   string Document::alias( AnnotationType type,
2000 			  const string& setname ) const {
2001     /// give the alias for a setname
2002     /*!
2003       \param type the AnnotationType
2004       \param setname the alias to resolve
2005       \return the alias belonging setname for this type, or setname if
2006       not found
2007      */
2008     const auto& ti = _set_alias.find(type);
2009     if ( ti != _set_alias.end() ){
2010       const auto& ali = ti->second.find( setname );
2011       if ( ali != ti->second.end() ){
2012 	return ali->second;
2013       }
2014     }
2015     return setname;
2016   }
2017 
declare(AnnotationType type,const string & setname,const string & format,const string & annotator,const string & annotator_type,const string & date_time,const set<string> & _processors,const string & _alias)2018   void Document::declare( AnnotationType type,
2019 			  const string& setname,
2020 			  const string& format,
2021 			  const string& annotator,
2022 			  const string& annotator_type,
2023 			  const string& date_time,
2024 			  const set<string>& _processors,
2025 			  const string& _alias ){
2026     /// Add an annotation declaration
2027     /*!
2028       \param type The AnnotationType for which to add a setname
2029       \param setname The Set name to add
2030       \param format the format to add
2031       \param annotator the name of the annotator
2032       \param annotator_type the type of annotator
2033       \param date_time the date and time to set. The value "now()" will set it
2034       to the current time.
2035       \param _processors a set of processor id's to relate to this declaration
2036       \param _alias an alias value for the setname
2037     */
2038     if ( debug ){
2039       cerr << "declare( " << folia::toString(type) << "," << setname
2040 	   << ", format=" << format << "," << annotator << ","
2041 	   << annotator_type << "," << date_time << "," << _alias << ","
2042 	   << _processors << ") " << endl;
2043     }
2044     AnnotatorType ant = UNDEFINED;
2045     try {
2046       ant = TiCC::stringTo<AnnotatorType>( annotator_type );
2047     }
2048     catch (...) {
2049       throw XmlError( "declare(): illegal value '"
2050 		      + annotator_type + "' for annotator type" );
2051     }
2052     if ( !_alias.empty() ){
2053       string set_ali = alias(type,setname);
2054       if ( !set_ali.empty() ){
2055 	if ( set_ali != setname
2056 	     && set_ali != _alias ){
2057 	  throw XmlError( "setname: '" + setname + "' already has an alias: '"
2058 			  + set_ali );
2059 	}
2060       }
2061       string ali_ali = alias(type,_alias);
2062       string ali_set = unalias(type,_alias);
2063       if ( ali_ali != _alias ){
2064 	throw XmlError( "alias: '" + _alias +
2065 			"' is also in use as a setname for set:'"
2066 			+ ali_set + "'" );
2067       }
2068       if ( ali_set != _alias
2069 	   && ali_set != setname ){
2070 	throw XmlError( "alias: '" + _alias + "' already used for setname: '"
2071 			+ ali_set + "'" );
2072       }
2073     }
2074     if ( !declared( type, setname, annotator, ant, _processors ) ){
2075       set<string> procs = _processors;
2076       if ( !unalias(type,setname).empty()
2077 	   && unalias(type,setname) != setname ){
2078 	throw XmlError( "setname: '" + setname
2079 			+ "' is also in use as an alias" );
2080       }
2081       string d = date_time;
2082       if ( d == "now()" ){
2083 	d = get_ISO_date();
2084       }
2085       if ( procs.empty() ){
2086 	// old style
2087 	_annotationdefaults[type].insert( make_pair( setname,
2088 						     at_t(annotator,ant,d,format,procs) ) );
2089       }
2090       else {
2091 	// new style
2092 	auto set_pos = _annotationdefaults[type].find(setname);
2093 	if ( set_pos == _annotationdefaults[type].end() ){
2094 	  // no processor annotations yet
2095 	  _annotationdefaults[type].insert( make_pair( setname,
2096 						       at_t(annotator,ant,d,format,procs) ) );
2097 
2098 	}
2099 	else {
2100 	  // add to the existing
2101 	  for ( const auto& p : procs ){
2102 	    set_pos->second._processors.insert( p );
2103 	  }
2104 	}
2105       }
2106       if ( debug ){
2107 	cerr << "ADD to sort: " << folia::toString(type) << " ("
2108 	     << setname << ")"  << endl;
2109       }
2110       _anno_sort.push_back(make_pair(type,setname));
2111       _annotationrefs[type][setname] = 0;
2112       if ( !_alias.empty() ){
2113 	_alias_set[type][_alias] = setname;
2114 	_set_alias[type][setname] = _alias;
2115       }
2116       else {
2117 	_alias_set[type][setname] = setname;
2118 	_set_alias[type][setname] = setname;
2119       }
2120     }
2121   }
2122 
un_declare(AnnotationType type,const string & set_name)2123   void Document::un_declare( AnnotationType type,
2124 			     const string& set_name ){
2125     /// remove a declaration for an AnnotationType/setname pair
2126     /*!
2127       \param type the AnnotationType
2128       \param set_name the setname. May be empty ("")
2129 
2130       When \em set_name is "", ALL declarations of \em type are deleted
2131      */
2132     string setname = unalias(type,set_name);
2133     if ( debug ){
2134       cerr << "undeclare: " << folia::toString(type) << "(" << set_name << "."
2135 	   << setname << ")" << endl;
2136     }
2137     if ( _annotationrefs[type][setname] != 0 ){
2138       throw XmlError( "unable to undeclare " + toString(type) + "-type("
2139 		      + setname + ") (references remain)" );
2140     }
2141     auto const adt = _annotationdefaults.find(type);
2142     if ( adt != _annotationdefaults.end() ){
2143       if ( debug ){
2144 	cerr << "matched type=" << folia::toString(type) << endl;
2145       }
2146       auto it = adt->second.begin();
2147       while ( it != adt->second.end() ){
2148 	if ( debug ){
2149 	  cerr << "zoek set:" << setname << endl;
2150 	}
2151 	if ( setname.empty() || it->first == setname ){
2152 	  if ( debug ){
2153 	    cerr << "erase:" << setname << "==" << it->first << endl;
2154 	  }
2155 	  it = adt->second.erase(it);
2156 	}
2157 	else {
2158 	  ++it;
2159 	}
2160       }
2161       if ( debug ){
2162 	cerr << "ANNO-SORT: IN " << _anno_sort << endl;
2163       }
2164       auto it2 = _anno_sort.begin();
2165       while ( it2 != _anno_sort.end() ){
2166 	if ( debug ){
2167 	  cerr << "zoek set:" << setname << endl;
2168 	}
2169 	if ( it2->first == type
2170 	     && ( setname.empty() || it2->second == setname ) ){
2171 	  if ( debug ){
2172 	    cerr << "_annosort:erase:" << setname << "==" << it->first << endl;
2173 	  }
2174 	  it2 = _anno_sort.erase( it2 );
2175 	}
2176 	else {
2177 	  ++it2;
2178 	}
2179       }
2180       if ( debug ){
2181 	cerr << "ANNO-SORT: UIT " << _anno_sort << endl;
2182       }
2183       auto it3 = _alias_set[type].begin();
2184       while ( it3 != _alias_set[type].end() ){
2185 	if ( it3->first == setname || it3->second == setname ){
2186 	  it3 = _alias_set[type].erase( it3 );
2187 	}
2188 	else {
2189 	  ++it3;
2190 	}
2191       }
2192       auto it4 = _set_alias[type].begin();
2193       while ( it4 != _set_alias[type].end() ){
2194 	if ( it4->first == setname || it4->second == setname ){
2195 	  it4 = _set_alias[type].erase( it4 );
2196 	}
2197 	else {
2198 	  ++it4;
2199 	}
2200       }
2201       if ( adt->second.empty() ){
2202 	_annotationdefaults.erase(adt);
2203       }
2204     }
2205   }
2206 
unused_declarations() const2207   multimap<AnnotationType, string> Document::unused_declarations( ) const {
2208     /// search for declarations not referencec in the Document
2209     /*!
2210       \return a list of all AnntotationType/setname pairs that are not used
2211      */
2212     multimap<AnnotationType,string> result;
2213     for ( const auto& tit : _annotationrefs ){
2214       for ( const auto& mit : tit.second ){
2215 	if ( mit.second == 0 ){
2216 	  result.insert( make_pair(tit.first, mit.first ) );
2217 	}
2218       }
2219     }
2220     return result;
2221   }
2222 
setTextRoot(const KWargs & args)2223   Text* Document::setTextRoot( const KWargs& args ) {
2224     /// create a Text element as root for the document
2225     /*!
2226       \param args extra attribute-value pairs as attributes to use
2227       \return the created Text node
2228     */
2229     Text *t = new Text( args );
2230     foliadoc->append( t );
2231     return t;
2232   }
2233 
setTextRoot()2234   Text* Document::setTextRoot() {
2235     /// create a Text element as root for the document
2236     KWargs empty;
2237     return setTextRoot( empty );
2238   }
2239 
setSpeechRoot(const KWargs & args)2240   Speech* Document::setSpeechRoot( const KWargs& args ) {
2241     /// create a Speech element as root for the document
2242     /*!
2243       \param args extra attribute-value pairs as attributes to use
2244       \return the created Speech node
2245     */
2246     Speech *s = new Speech( args );
2247     foliadoc->append( s );
2248     return s;
2249   }
2250 
setSpeechRoot()2251   Speech* Document::setSpeechRoot() {
2252     /// create a Speech element as root for the document
2253     KWargs empty;
2254     return setSpeechRoot( empty );
2255   }
2256 
getRoot()2257   FoliaElement *Document::getRoot(){
2258     /// return the root element, if any
2259     if ( foliadoc && foliadoc->size() > 0 ){
2260       return foliadoc->index(0);
2261     }
2262     else {
2263       return 0;
2264     }
2265   }
2266 
append(FoliaElement * t)2267   FoliaElement* Document::append( FoliaElement *t ){
2268     /// append a root element tot the Document
2269     /*!
2270       \param t a root element to add
2271       \return the added root (also t). Throws on error.
2272 
2273       This function will check if a root is already there.
2274       Is only accepts Speech or Text nodes as root.
2275      */
2276 
2277     FoliaElement *root = getRoot();
2278     if ( root ){
2279       throw XmlError( "cannot append a root element to a Document. Already there." );
2280     }
2281     if ( t->element_id() == Text_t
2282 	 || t->element_id() == Speech_t ) {
2283       foliadoc->append( t );
2284       return t;
2285     }
2286     throw XmlError( "Only can append 'text' or 'speech' as root of a Document." );
2287   }
2288 
declared(const AnnotationType & type,const string & set_name,const string & annotator,const AnnotatorType & annotator_type,const string & processor) const2289   bool Document::declared( const AnnotationType& type,
2290 			   const string& set_name,
2291 			   const string& annotator,
2292 			   const AnnotatorType& annotator_type,
2293 			   const string& processor ) const {
2294     /// check if a given combination of AnnotationType, setname, annotators etc.
2295     /// is declared
2296     /*!
2297       \param type the AnnotationType
2298       \param set_name a setname OR an alias (may be empty)
2299       \param annotator the annotator to check (may be empty)
2300       \param annotator_type the annotator_type to check (may be UNDEFINED)
2301       \param processor the processor to match (may be empty)
2302       \return true when all values match.
2303 
2304       For the type NO_ANN, the result is always true.
2305 
2306       If set_name is empty ("") a match is found when a declarion for \e type
2307       exists
2308 
2309       Otherwise, all values are checked for a match
2310     */
2311     if ( debug ){
2312       cerr << "isdeclared? ( " << folia::toString(type) << "," << set_name << ","
2313 	   << annotator << "," << toString(annotator_type) << "," << processor
2314 	   << ") " << endl;
2315     }
2316     //
2317     // We DO NOT check the date. if all parameters match, it is OK
2318     //
2319     if ( type == AnnotationType::NO_ANN ){
2320       if ( debug ){
2321 	cerr << "\t\t TRUE want NO_ANN" << endl;
2322       }
2323       return true;
2324     }
2325     if ( !processor.empty()
2326 	 && !get_processor( processor ) ){
2327       throw XmlError( folia::toString(type)
2328 		      + "-annotation is referring an undefined processor '"
2329 		      + processor + "'" );
2330     }
2331     string setname = unalias(type,set_name);
2332     const auto& it1 = _annotationdefaults.find(type);
2333     if ( it1 != _annotationdefaults.end() ){
2334       if ( debug ){
2335 	cerr << "OK, found an entry for type: " << folia::toString(type) << endl;
2336       }
2337       if ( setname.empty() ){
2338 	// 'wildcard' for setname
2339 	return true;
2340       }
2341       auto mit2 = it1->second.lower_bound(setname);
2342       while ( mit2 != it1->second.upper_bound(setname) ){
2343 	if ( debug ){
2344 	  cerr << "OK, found an entry for set='" << setname  << "'" << endl;
2345 	  cerr << "content: " << mit2->second << endl;
2346 	}
2347 	if ( mit2->second._annotator == annotator
2348 	     && mit2->second._ann_type == annotator_type
2349 	     && ( (mit2->second._processors.empty() && processor.empty() )
2350 		  || ( mit2->second._processors.find(processor)
2351 		       != mit2->second._processors.end() ) ) ){
2352 	  if ( debug ){
2353 	    cerr << "\t\t declared ==> TRUE" << endl;
2354 	  }
2355 	  return true;
2356 	}
2357 	++mit2;
2358       }
2359     }
2360     if ( debug ){
2361       cerr << "\t\t declared() ==> FALSE" << endl;
2362     }
2363     return false;
2364   }
2365 
declared(const AnnotationType & type,const string & set_name,const string & annotator,const AnnotatorType & annotator_type,const set<string> & processors) const2366   bool Document::declared( const AnnotationType& type,
2367 			   const string& set_name,
2368 			   const string& annotator,
2369 			   const AnnotatorType& annotator_type,
2370 			   const set<string>& processors ) const {
2371     /// check if a given combination of AnnotationType, setname, annotators etc.
2372     /// is declared
2373     /*!
2374       \param type the AnnotationType
2375       \param set_name a setname OR an alias (may be empty)
2376       \param annotator the annotator to check (may be empty)
2377       \param annotator_type the annotator_type to check (may be UNDEFINED)
2378       \param processors a list of processors to match (may be empty)
2379       \return true when all values match.
2380 
2381       For the type NO_ANN, the result is always true.
2382 
2383       If set_name is empty ("") a match is found when a declarion for \e type
2384       exists
2385 
2386       Otherwise, all values are checked for a match for at least 1 of the
2387       processors.
2388     */
2389     if ( processors.empty() ){
2390       return declared( type, set_name, annotator, annotator_type, "" );
2391     }
2392     else {
2393       for ( const auto& s : processors ){
2394 	if ( declared( type, set_name, annotator, annotator_type, s ) ){
2395 	  return true;
2396 	}
2397       }
2398       return false;
2399     }
2400   }
2401 
incrRef(AnnotationType type,const string & s)2402   void Document::incrRef( AnnotationType type,
2403 			  const string& s ){
2404     /// increment the reference count for the AnnotationType/set combination
2405     /*!
2406       \param type the AnnotationType
2407       \param s the setname
2408     */
2409     if ( type != AnnotationType::NO_ANN ){
2410       string st = s;
2411       if ( st.empty() ){
2412 	st = default_set(type);
2413       }
2414       ++_annotationrefs[type][st];
2415       // cerr << "increment " << toString(type) << "(" << st << ") to: "
2416       // 	   << _annotationrefs[type][s] << endl;
2417     }
2418   }
2419 
decrRef(AnnotationType type,const string & s)2420   void Document::decrRef( AnnotationType type,
2421 			  const string& s ){
2422     /// decrement the reference count for the AnnotationType/set combination
2423     /*!
2424       \param type the AnnotationType
2425       \param s the setname
2426     */
2427     if ( type != AnnotationType::NO_ANN
2428 	 && _annotationrefs[type][s] > 0 ){
2429       --_annotationrefs[type][s];
2430       // cerr << "decrement " << toString(type) << "(" << s << ") to: "
2431       // 	   << _annotationrefs[type][s] << endl;
2432     }
2433   }
2434 
declared(const AnnotationType & type,const string & set_name) const2435   bool Document::declared( const AnnotationType& type,
2436 			   const string& set_name ) const {
2437     /// check if a given combination of AnnotationType and setname
2438     /// is declared
2439     /*!
2440       \param type the AnnotationType
2441       \param set_name a setname OR an alias (may be empty)
2442       \return true when there is a match
2443 
2444       For the type NO_ANN, the result is always true.
2445 
2446       If set_name is empty ("") a match is found when a declarion for \e type
2447       exists
2448 
2449     */
2450     if ( debug ){
2451       cerr << "declared(" << folia::toString(type) << ",'"
2452 	   << set_name << "')" << endl;
2453     }
2454     if ( type == AnnotationType::NO_ANN ){
2455       if ( debug ){
2456 	cerr << "always true for NO_ANN" << endl;
2457       }
2458       return true;
2459     }
2460     if ( debug ){
2461       cerr << "Doorzoek: " << _annotationdefaults << endl;
2462     }
2463     const auto& mit1 = _annotationdefaults.find(type);
2464     if ( mit1 != _annotationdefaults.end() ){
2465       if ( debug ){
2466 	cerr << "found some: " << mit1->second << endl;
2467       }
2468       if ( set_name.empty() ){
2469 	// 'wildcard' for setname
2470 	if ( debug ){
2471 	  cerr << "return TRUE" << endl;
2472 	}
2473 	return true;
2474       }
2475       string s_name = unalias(type,set_name);
2476       if ( debug ){
2477 	cerr << "lookup: " << set_name << " (" << s_name << ")" << endl;
2478       }
2479       const auto& mit2 = mit1->second.find(s_name);
2480       if ( debug ){
2481 	if ( mit2 != mit1->second.end() ){
2482 	  cerr << "return TRUE" << endl;
2483 	}
2484 	else {
2485 	  cerr << "return FALSE" << endl;
2486 	}
2487       }
2488       return mit2 != mit1->second.end();
2489     }
2490     if ( debug ){
2491       cerr << "return DIRECTLY FALSE" << endl;
2492     }
2493     return false;
2494   }
2495 
declared(ElementType et,const string & set_name) const2496   bool Document::declared( ElementType et,
2497 			   const string& set_name ) const {
2498     /// check if the AnnotationType belonging to the ElementType and setname
2499     /// is declared
2500     /*!
2501       \param et the ElementType
2502       \param set_name a setname OR an alias (may be empty)
2503       \return true when there is a match
2504 
2505       For the type NO_ANN, the result is always true.
2506 
2507       If set_name is empty ("") a match is found when a declarion for \em type
2508       exists
2509     */
2510     AnnotationType at = element_annotation_map[et];
2511     return declared( at, set_name );
2512   }
2513 
default_set(AnnotationType type) const2514   string Document::default_set( AnnotationType type ) const {
2515     /// return the default setname for the type. If any.
2516     /*!
2517       \param type the AnnotationType
2518       \return the setname. May be empty ("") when there is none defined OR it
2519       is ambiguous.
2520     */
2521     if ( type == AnnotationType::NO_ANN ){
2522       return "";
2523     }
2524     // search a set. it must be unique. Otherwise return ""
2525     if ( debug ){
2526       cerr << "\nzoek voor '" << toString(type) << "' de default set in:\n"
2527 	   <<  _annotationdefaults << endl;
2528     }
2529     string result;
2530     const auto& mit1 = _annotationdefaults.find(type);
2531     if ( mit1 != _annotationdefaults.end() ){
2532       if ( debug ){
2533 	cerr << "vind tussen " <<  mit1->second << endl;
2534       }
2535       if ( mit1->second.size() == 1 ){
2536 	// so it is unique
2537 	result = mit1->second.begin()->first;
2538       }
2539     }
2540     if ( debug ){
2541       cerr << "default_set ==> " << result << endl;
2542     }
2543     return result;
2544   }
2545 
default_annotator(AnnotationType type,const string & setname) const2546   string Document::default_annotator( AnnotationType type,
2547 				      const string& setname ) const {
2548     /// return the default annotator for the type/setname combination.
2549     /*!
2550       \param type the AnnotationType
2551       \param setname the annotation set. An empty string ("") means ANY set.
2552       \return the annotator. May be empty ("") when there is none defined OR it
2553       is ambiguous.
2554     */
2555     if ( type == AnnotationType::NO_ANN ){
2556       return "";
2557     }
2558     const auto& mit1 = _annotationdefaults.find(type);
2559     string result;
2560     if ( mit1 != _annotationdefaults.end() ){
2561       //      cerr << "vind tussen " <<  mit1->second << endl;
2562       if ( setname.empty() ){
2563 	// 'wildcard' search
2564 	if ( mit1->second.size() == 1 ){
2565 	  // so it is unique
2566 	  result = mit1->second.begin()->second._annotator;
2567 	  return result;
2568 	}
2569       }
2570       else {
2571 	if ( mit1->second.count( setname ) == 1 ){
2572 	  // so it is unique
2573 	  const auto& mit2 = mit1->second.find( setname );
2574 	  result = mit2->second._annotator;
2575 	}
2576       }
2577     }
2578     //    cerr << "get default ==> " << result << endl;
2579     return result;
2580   }
2581 
default_annotatortype(AnnotationType type,const string & setname) const2582   AnnotatorType Document::default_annotatortype( AnnotationType type,
2583 						 const string& setname ) const {
2584     /// return the default annotator type for the type/setname combination.
2585     /*!
2586       \param type the AnnotationType
2587       \param setname the AnnotationType. An empty string ("") means ANY set.
2588       \return the annotator. May be empty ("") when there is none defined OR it
2589       is ambiguous.
2590     */
2591     if ( debug ){
2592       cerr << "annotationdefaults= " <<  _annotationdefaults << endl;
2593       cerr << "lookup: " << folia::toString(type) << endl;
2594     }
2595     AnnotatorType result = UNDEFINED;
2596     if ( type == AnnotationType::NO_ANN ){
2597       return result;
2598     }
2599     const auto& mit1 = _annotationdefaults.find(type);
2600     if ( mit1 != _annotationdefaults.end() ){
2601       if ( debug ){
2602 	cerr << "found a hit for type=" << folia::toString( type ) << endl;
2603       }
2604       if ( setname.empty() ){
2605 	// 'wildcard' search
2606 	if ( mit1->second.size() == 1 ){
2607 	  // so it is unique
2608 	  result = mit1->second.begin()->second._ann_type;
2609 	}
2610 	return result;
2611       }
2612       else {
2613 	if ( mit1->second.count( setname ) == 1 ){
2614 	  // so it is unique
2615 	  const auto& mit2 = mit1->second.find( setname );
2616 	  result = mit2->second._ann_type;
2617 	}
2618       }
2619     }
2620     //  cerr << "get default ==> " << result << endl;
2621     return result;
2622   }
2623 
default_datetime(AnnotationType type,const string & setname) const2624   string Document::default_datetime( AnnotationType type,
2625 				     const string& setname ) const {
2626     /// return the default datetime value for the type/setname combination.
2627     /*!
2628       \param type the AnnotationType
2629       \param setname the annotation set.  An empty string ("") means ANY set.
2630       \return the datetime value. May be empty ("") when there is none defined
2631       OR it is ambiguous.
2632     */
2633     const auto& mit1 = _annotationdefaults.find(type);
2634     string result;
2635     if ( mit1 != _annotationdefaults.end() ){
2636       if ( setname.empty() ){
2637 	// 'wildcard' search
2638 	if ( mit1->second.size() == 1 ){
2639 	  // so it is unique
2640 	  result = mit1->second.begin()->second._date;
2641 	}
2642       }
2643       else {
2644 	if ( mit1->second.count( setname ) == 1 ){
2645 	  // so it is unique
2646 	  const auto& mit2 = mit1->second.find( setname );
2647 	  result = mit2->second._date;
2648 	}
2649       }
2650     }
2651     //  cerr << "get default ==> " << result << endl;
2652     return result;
2653   }
2654 
default_processor(AnnotationType type,const string & setname) const2655   string Document::default_processor( AnnotationType type,
2656 				      const string& setname ) const{
2657     /// return the default processor type for the type/setname combination.
2658     /*!
2659       \param type the AnnotationType
2660       \param setname the annotation set.  An empty string ("") means ANY set.
2661       \return the processor. May be empty ("") when there is none defined OR it
2662       is ambiguous.
2663     */
2664     if ( debug ){
2665       cerr << "defaultprocessor(" << toString( type ) << ","
2666 	   << setname << ")" << endl;
2667     }
2668     auto const& it = _annotationdefaults.find(type);
2669     if ( it != _annotationdefaults.end() ){
2670       if ( debug ){
2671 	cerr << "found some defs: " << it->second << endl;
2672 	cerr << "NOW search for set: " << setname << endl;
2673       }
2674       if ( setname.empty() ){
2675 	// 'wildcard' search
2676 	if ( it->second.size() == 1
2677 	     && it->second.begin()->second._processors.size() == 1 ){
2678 	  // so it is unique for setname AND for the number of processors
2679 	  return *it->second.begin()->second._processors.begin();
2680 	}
2681 	else {
2682 	  return "";
2683 	}
2684       }
2685       set<string> results;
2686       auto s_it = it->second.lower_bound(setname);
2687       while ( s_it != it->second.upper_bound(setname) ){
2688 	if ( debug ){
2689 	  cerr << "found sub strings: " << s_it->second << endl;
2690 	}
2691 	results.insert( s_it->second._processors.begin(),
2692 			s_it->second._processors.end() );
2693 	++s_it;
2694       }
2695       if ( results.size() == 1 ){
2696 	// so we found exactly 1 processor
2697 	return *results.begin();
2698       }
2699       else if ( results.size() > 1 ){
2700 	auto const& as = annotationtype_xml_map.find(type);
2701 	if ( as != annotationtype_xml_map.end() ){
2702 	  throw NoDefaultError("No processor specified for <"
2703 			       + as->second +  ">, but the presence of multiple declarations prevent assigning a default");
2704 	}
2705       }
2706     }
2707     return "";
2708   }
2709 
original_default_set(AnnotationType type) const2710   string Document::original_default_set( AnnotationType type ) const {
2711     /// return the default setname for the type in the ORIGINAL definitions.
2712     /*!
2713       \param type the AnnotationType
2714       \return the setname. May be empty ("") when there is none defined OR it
2715       is ambiguous.
2716 
2717       In case of \e incremental Document building, we are allowed to add
2718       annotation declarations at any moment. That might render the default_set
2719       of an AnnotationType undefined. With this function, we still are able to
2720       find the original value and use that e.g. on output.
2721     */
2722     auto const& it = _orig_ann_default_sets.find(type);
2723     if ( it == _orig_ann_default_sets.end() ){
2724       return "";
2725     }
2726     else {
2727       return it->second;
2728     }
2729   }
2730 
original_default_processor(AnnotationType type) const2731   string Document::original_default_processor( AnnotationType type ) const {
2732     /// return the default processor name for the type in the ORIGINAL definitions.
2733     /*!
2734       \param type the AnnotationType
2735       \return the processor name. May be empty ("") when there is none defined
2736       OR it is ambiguous.
2737 
2738       In case of \e incremental Document building, we are allowed to add
2739       annotation declarations at any moment. That might render the default
2740       processor of an AnnotationType undefined. With this function, we still
2741       are able to find the original value and use that e.g. on output.
2742     */
2743     auto const& it = _orig_ann_default_procs.find(type);
2744     if ( it == _orig_ann_default_procs.end() ){
2745       return "";
2746     }
2747     else {
2748       return it->second;
2749     }
2750   }
2751 
get_annotators(AnnotationType type,const string & setname) const2752   vector<string> Document::get_annotators( AnnotationType type,
2753 					   const string& setname ) const {
2754     /// return all the annotators for the type/setname combination.
2755     /*!
2756       \param type the AnnotationType
2757       \param setname the annotation set. An empty string ("") means ANY set.
2758       \return a list of annotators.
2759     */
2760     vector<string> result;
2761     if ( type == AnnotationType::NO_ANN ){
2762       return result;
2763     }
2764     const auto& mit1 = _annotationdefaults.find(type);
2765     if ( mit1 != _annotationdefaults.end() ){
2766       //    cerr << "vond iets voor " << toString(type) << endl;
2767       for ( auto pos = mit1->second.lower_bound(setname);
2768 	    pos != mit1->second.upper_bound(setname);
2769 	    ++pos ){
2770 	copy( pos->second._processors.begin(),
2771 	      pos->second._processors.end(),
2772 	      back_inserter(result) );
2773       }
2774     }
2775     //    cerr << "get default ==> " << result << endl;
2776     return result;
2777 
2778   }
2779 
get_processors(AnnotationType type,const string & setname) const2780   vector<const processor*> Document::get_processors( AnnotationType type,
2781 						     const string& setname ) const {
2782     /// return all the processors for the type/setname combination.
2783     /*!
2784       \param type the AnnotationType
2785       \param setname the annotation set. An empty string ("") means ANY set.
2786       \return a list of processors.
2787     */
2788     vector<const processor*> result;
2789     if ( debug ){
2790       cerr << "getprocessors(" << toString( type ) << ","
2791 	   << setname << ")" << endl;
2792     }
2793     if ( type == AnnotationType::NO_ANN ){
2794       return result;
2795     }
2796     auto const& it = _annotationdefaults.find(type);
2797     if ( it != _annotationdefaults.end() ){
2798       if ( debug ){
2799 	cerr << "found some defs: " << it->second << endl;
2800       }
2801       for ( auto pos = it->second.lower_bound(setname);
2802 	    pos != it->second.upper_bound(setname);
2803 	    ++pos ){
2804 	transform( pos->second._processors.begin(),
2805 		   pos->second._processors.end(),
2806 		   back_inserter(result),
2807 		   [&]( const string& p ){ return get_processor(p); } );
2808       }
2809     }
2810     return result;
2811   }
2812 
add_one_anno(const pair<AnnotationType,string> & pair,xmlNode * node,set<string> & done) const2813   void Document::add_one_anno( const pair<AnnotationType,string>& pair,
2814 			       xmlNode *node,
2815 			       set<string>& done ) const {
2816     /// create an annotation declaration entry under the xmlNode node
2817     /*!
2818       \param pair an AnnotationType/setname pair
2819       \param node the node we want to add to
2820       \param done a set of "labels" to keep track of already handled cases
2821 
2822      */
2823     AnnotationType type = pair.first;
2824     string sett = pair.second;
2825     string label = annotation_type_to_string( type );
2826     if ( done.find(label+sett) != done.end() ){
2827       return;
2828     }
2829     done.insert(label+sett);
2830     label += "-annotation";
2831     const auto& mm = _annotationdefaults.find(type);
2832     auto it = mm->second.lower_bound(sett);
2833     while ( it != mm->second.upper_bound(sett) ){
2834       string s = it->second._annotator;
2835       if ( !s.empty() ){
2836 	// old style
2837 	KWargs args;
2838 	args["annotator"] = s;
2839 	AnnotatorType ant = it->second._ann_type;
2840 	if ( ant != UNDEFINED && ant != AUTO ){
2841 	  args["annotatortype"] = toString(ant);
2842 	}
2843 	if ( !strip() ){
2844 	  s = it->second._date;
2845 	  if ( !s.empty() ){
2846 	    args["datetime"] = s;
2847 	  }
2848 	}
2849 	s = it->second._format;
2850 	if ( !s.empty() ){
2851 	  args["format"] = s;
2852 	}
2853 	s = it->first;
2854 	if ( s == "None" ){ // "empty" set
2855 	  // skip
2856 	}
2857 	else if ( s != "undefined" ){ // the default
2858 	  args["set"] = s;
2859 	}
2860 	auto const& t_it = _groupannotations.find(type);
2861 	if ( t_it != _groupannotations.end() ){
2862 	  auto const& s_it = t_it->second.find(s);
2863 	  if ( s_it != t_it->second.end()
2864 	       && s_it->second ){
2865 	    args["groupannotations"] = "yes";
2866 	  }
2867 	}
2868 
2869 	const auto& ti = _set_alias.find(type);
2870 	if ( ti != _set_alias.end() ){
2871 	  const auto& alias = ti->second.find(s);
2872 	  if ( alias->second != s ){
2873 	    args["alias"] = alias->second;
2874 	  }
2875 	}
2876 	xmlNode *n = TiCC::XmlNewNode( foliaNs(), label );
2877 	addAttributes( n, args );
2878 	xmlAddChild( node, n );
2879       }
2880       else {
2881 	// we have new style processors
2882 	KWargs args;
2883 	if ( !strip() ){
2884 	  s = it->second._date;
2885 	  if ( !s.empty() ){
2886 	    args["datetime"] = s;
2887 	  }
2888 	}
2889 	s = it->second._format;
2890 	if ( !s.empty() ){
2891 	  args["format"] = s;
2892 	}
2893 	s = it->first;
2894 	if ( s == "None" ){ // "empty" set
2895 	  // skip
2896 	}
2897 	else if ( s != "undefined" ){ // the default
2898 	  args["set"] = s;
2899 	}
2900 	const auto& ti = _set_alias.find(type);
2901 	if ( ti != _set_alias.end() ){
2902 	  const auto& alias = ti->second.find(s);
2903 	  if ( alias->second != s ){
2904 	    args["alias"] = alias->second;
2905 	  }
2906 	}
2907 	auto const& t_it = _groupannotations.find(type);
2908 	if ( t_it != _groupannotations.end() ){
2909 	  auto const& s_it = t_it->second.find(s);
2910 	  if ( s_it != t_it->second.end()
2911 	       && s_it->second ){
2912 	    args["groupannotations"] = "yes";
2913 	  }
2914 	}
2915 	xmlNode *n = TiCC::XmlNewNode( foliaNs(), label );
2916 	addAttributes( n, args );
2917 	xmlAddChild( node, n );
2918 	args.clear();
2919 	for ( const auto& p : it->second._processors ){
2920 	  xmlNode *a = TiCC::XmlNewNode( foliaNs(), "annotator" );
2921 	  args["processor"] = p;
2922 	  addAttributes( a, args );
2923 	  xmlAddChild( n, a );
2924 	}
2925       }
2926       ++it;
2927     }
2928   }
2929 
add_annotations(xmlNode * metadata) const2930   void Document::add_annotations( xmlNode *metadata ) const {
2931     /// create an annotations block under the xmlNode metadata
2932     /*!
2933       \param metadata the parent to add to
2934       calls add_one_anno() for every annotation declaration.
2935     */
2936     if ( debug ){
2937       cerr << "start add_annotations: " << _annotationdefaults << endl;
2938       cerr << "sorting: " << _anno_sort << endl;
2939     }
2940     xmlNode *node = xmlAddChild( metadata,
2941 				 TiCC::XmlNewNode( foliaNs(),
2942 						   "annotations" ) );
2943     set<string> done;
2944     if ( canonical() ){
2945       multimap<AnnotationType,
2946 	       pair<AnnotationType,string>> ordered;
2947       for ( const auto& pair : _anno_sort ){
2948 	ordered.insert(make_pair(pair.first,pair));
2949       }
2950       for ( const auto& it : ordered ){
2951 	add_one_anno( it.second, node, done );
2952       }
2953     }
2954     else {
2955       for ( const auto& pair : _anno_sort ){
2956 	add_one_anno( pair, node, done );
2957       }
2958     }
2959   }
2960 
append_processor(xmlNode * node,const processor * p) const2961   void Document::append_processor( xmlNode *node, const processor *p ) const {
2962     /// add a processor xml structure to the parent 'node'
2963     /*!
2964       \param node the xml node to add to
2965       \param p the processor of which to add te info
2966     */
2967     xmlNode *pr = xmlAddChild( node, TiCC::XmlNewNode( foliaNs(), "processor" ) );
2968     KWargs atts;
2969     atts["xml:id"] = p->_id;
2970     atts["name"] = p->_name;
2971     if ( p->_type != AUTO || has_explicit() ){
2972       atts["type"] = toString(p->_type);
2973     }
2974     if ( !strip() ){
2975       if ( !p->_version.empty() ){
2976 	atts["version"] = p->_version;
2977       }
2978       if ( !p->_folia_version.empty() ){
2979 	atts["folia_version"] = p->_folia_version;
2980       }
2981       if ( !p->_command.empty() ){
2982 	atts["command"] = p->_command;
2983       }
2984       if ( !p->_host.empty() ){
2985 	atts["host"] = p->_host;
2986       }
2987       if ( !p->_user.empty() ){
2988 	atts["user"] = p->_user;
2989       }
2990       if ( !p->_begindatetime.empty() ){
2991 	atts["begindatetime"] = p->_begindatetime;
2992       }
2993       if ( !p->_enddatetime.empty() ){
2994 	atts["enddatetime"] = p->_enddatetime;
2995       }
2996     }
2997     else {
2998       if ( p->_name == "libfolia" ){
2999 	atts["name"] = "stripped";
3000       }
3001       else if ( p->_name == "foliapy" ){
3002 	atts["name"] = "stripped";
3003       }
3004       else if ( !p->_name.empty() ){
3005 	atts["name"] = p->_name;
3006       }
3007       if ( !p->_version.empty() ){
3008 	atts["version"] = "stripped";
3009       }
3010       if ( !p->_folia_version.empty() ){
3011 	atts["folia_version"] = "stripped";
3012       }
3013       if ( !p->_command.empty() ){
3014 	atts["command"] = "stripped";
3015       }
3016       if ( !p->_host.empty() ){
3017 	atts["host"] = "stripped";
3018       }
3019       if ( !p->_user.empty() ){
3020 	atts["user"] = "stripped";
3021       }
3022       if ( !p->_begindatetime.empty() ){
3023 	atts["begindatetime"] = "stripped";
3024       }
3025       if ( !p->_enddatetime.empty() ){
3026 	atts["enddatetime"] = "stripped";
3027       }
3028     }
3029     if ( !p->_document_version.empty() ){
3030       atts["document_version"] = p->_document_version;
3031     }
3032     if ( !p->_resourcelink.empty() ){
3033       atts["resourcelink"] = p->_resourcelink;
3034     }
3035     if ( !p->_src.empty() ){
3036       atts["src"] = p->_src;
3037     }
3038     if ( !p->_format.empty() ){
3039       atts["format"] = p->_format;
3040     }
3041     addAttributes( pr, atts );
3042     for ( const auto& it : p->_metadata ){
3043       xmlNode *m = xmlAddChild( pr, TiCC::XmlNewNode( foliaNs(), "meta" ) );
3044       KWargs args;
3045       args["id"] = it.first;
3046       addAttributes( m, args );
3047       xmlAddChild( m, xmlNewText( (const xmlChar*)it.second.c_str()) );
3048     }
3049     for ( const auto& s : p->_processors ){
3050       append_processor( pr, s );
3051     }
3052   }
3053 
add_provenance(xmlNode * metadata) const3054   void Document::add_provenance( xmlNode *metadata ) const {
3055     /// create a provenance block under the xmlNode metadata
3056     /*!
3057       \param metadata the parent to add to
3058       calls append_processor() for every processor available
3059     */
3060     if ( !_provenance ){
3061       return;
3062     }
3063     xmlNode *node = xmlAddChild( metadata,
3064 				 TiCC::XmlNewNode( foliaNs(),
3065 						   "provenance" ) );
3066     for ( const auto& p : _provenance->processors ){
3067       append_processor( node, p );
3068     }
3069   }
3070 
add_submetadata(xmlNode * node) const3071   void Document::add_submetadata( xmlNode *node ) const {
3072     /// add a submetadata block to node
3073     for ( const auto& it : submetadata ){
3074       xmlNode *sm = TiCC::XmlNewNode( foliaNs(), "submetadata" );
3075       KWargs atts;
3076       atts["xml:id"] = it.first;
3077       addAttributes( sm, atts );
3078       MetaData *md = submetadata.find(it.first)->second;
3079       string type = md->type();
3080       atts.clear();
3081       atts["type"] = type;
3082       addAttributes( sm, atts );
3083       xmlAddChild( node, sm );
3084       if ( type == "native" ){
3085 	atts = it.second->get_avs();
3086 	// cerr << "atts: " << atts << endl;
3087 	for ( const auto& av : atts ){
3088 	  xmlNode *m = TiCC::XmlNewNode( foliaNs(), "meta" );
3089 	  KWargs args;
3090 	  args["id"] = av.first;
3091 	  addAttributes( m, args );
3092 	  xmlAddChild( m, xmlNewText( (const xmlChar*)av.second.c_str()) );
3093 	  xmlAddChild( sm, m );
3094 	}
3095       }
3096       else if ( md->datatype() == "ExternalMetaData" ){
3097 	KWargs args;
3098 	args["src"] = md->src();
3099 	addAttributes( sm, args );
3100       }
3101       else if ( md->datatype() == "ForeignMetaData" ){
3102 	for ( const auto& foreign : md->get_foreigners() ) {
3103 	  xmlNode *f = foreign->xml( true, false );
3104 	  xmlAddChild( sm, f );
3105 	}
3106       }
3107     }
3108   }
3109 
add_metadata(xmlNode * node) const3110   void Document::add_metadata( xmlNode *node ) const{
3111     /// add a metadata block to node
3112     if ( _metadata ){
3113       if ( _metadata->datatype() == "ExternalMetaData" ){
3114 	KWargs atts;
3115 	atts["type"] = "external";
3116 	string src = _metadata->src();
3117 	if ( !src.empty() ){
3118 	  atts["src"] = src;
3119 	}
3120 	addAttributes( node, atts );
3121       }
3122       else {
3123 	KWargs atts;
3124 	atts["type"] = _metadata->type();
3125 	addAttributes( node, atts );
3126 	for ( const auto& it : _metadata->get_avs() ){
3127 	  xmlNode *m = TiCC::XmlNewNode( foliaNs(), "meta" );
3128 	  xmlAddChild( m, xmlNewText( (const xmlChar*)it.second.c_str()) );
3129 	  KWargs meta_atts;
3130 	  meta_atts["id"] = it.first;
3131 	  addAttributes( m, meta_atts );
3132 	  xmlAddChild( node, m );
3133 	}
3134       }
3135     }
3136     if ( _foreign_metadata ){
3137       if ( !_metadata ){
3138 	KWargs atts;
3139 	atts["type"] = "foreign";
3140 	addAttributes( node, atts );
3141       }
3142       for ( const auto& foreign : _foreign_metadata->get_foreigners() ) {
3143 	xmlNode *f = foreign->xml( true, false );
3144 	xmlAddChild( node, f );
3145       }
3146     }
3147     if ( !_metadata
3148 	 && !_foreign_metadata ){
3149       KWargs atts;
3150       atts["type"] = "native";
3151       addAttributes( node, atts );
3152     }
3153     add_submetadata( node );
3154   }
3155 
add_styles(xmlDoc * doc) const3156   void Document::add_styles( xmlDoc* doc ) const {
3157     /// add a styles block to the output document
3158     /*!
3159       \param doc the output document
3160     */
3161     for ( const auto& it : styles ){
3162       string content = "type=\"" + it.first + "\" href=\"" + it.second + "\"";
3163       xmlAddChild( (xmlNode*)doc,
3164 		   xmlNewDocPI( doc,
3165 				(const xmlChar*)"xml-stylesheet",
3166 				(const xmlChar*)content.c_str() ) );
3167     }
3168   }
3169 
to_xmlDoc(const string & ns_label) const3170   xmlDoc *Document::to_xmlDoc( const string& ns_label ) const {
3171     /// convert the Document to an xmlDoc
3172     /*!
3173       \param ns_label a namespace label to use. (default "")
3174     */
3175     xmlDoc *outDoc = xmlNewDoc( (const xmlChar*)"1.0" );
3176     add_styles( outDoc );
3177     xmlNode *root = xmlNewDocNode( outDoc, 0, (const xmlChar*)"FoLiA", 0 );
3178     xmlDocSetRootElement( outDoc, root );
3179     xmlNs *xl = xmlNewNs( root, (const xmlChar *)"http://www.w3.org/1999/xlink",
3180 			  (const xmlChar *)"xlink" );
3181     xmlSetNs( root, xl );
3182     if ( _foliaNsIn_href == 0 ){
3183       if ( ns_label.empty() ){
3184 	_foliaNsOut = xmlNewNs( root, (const xmlChar *)NSFOLIA.c_str(), 0 );
3185       }
3186       else {
3187 	_foliaNsOut = xmlNewNs( root,
3188 				(const xmlChar *)NSFOLIA.c_str(),
3189 				(const xmlChar*)ns_label.c_str() );
3190       }
3191     }
3192     else {
3193       _foliaNsOut = xmlNewNs( root,
3194 			      _foliaNsIn_href,
3195 			      _foliaNsIn_prefix );
3196     }
3197     xmlSetNs( root, _foliaNsOut );
3198     KWargs attribs;
3199     attribs["xml:id"] = foliadoc->id();
3200     if ( strip() ){
3201       attribs["generator"] = "";
3202       attribs["version"] = "";
3203     }
3204     else {
3205       attribs["generator"] = "libfolia-v" + library_version();
3206       attribs["version"] = _version_string;
3207       // attribs["version"] = folia_version();
3208     }
3209     if ( has_explicit() ){
3210       attribs["form"] = "explicit";
3211     }
3212     if ( _external_document ){
3213       attribs["external"] = "yes";
3214     }
3215     addAttributes( root, attribs );
3216 
3217     xmlNode *md = xmlAddChild( root, TiCC::XmlNewNode( foliaNs(), "metadata" ) );
3218     add_annotations( md );
3219     add_provenance( md );
3220     add_metadata( md );
3221     for ( size_t i=0; i < foliadoc->size(); ++i ){
3222       FoliaElement* el = foliadoc->index(i);
3223       xmlAddChild( root, el->xml( true, canonical() ) );
3224     }
3225     return outDoc;
3226   }
3227 
toXml(const string & ns_label) const3228   string Document::toXml( const string& ns_label ) const {
3229     /// dump the Document to a string
3230     /*!
3231       \param ns_label a namespace label to use. (default "")
3232     */
3233     string result;
3234     if ( foliadoc ){
3235       xmlDoc *outDoc = to_xmlDoc( ns_label );
3236       xmlChar *buf; int size;
3237       xmlDocDumpFormatMemoryEnc( outDoc, &buf, &size,
3238 				 output_encoding, 1 );
3239       result = string( (const char *)buf, size );
3240       xmlFree( buf );
3241       xmlFreeDoc( outDoc );
3242       _foliaNsOut = 0;
3243     }
3244     else {
3245       throw runtime_error( "can't save, no doc" );
3246     }
3247     return result;
3248   }
3249 
toXml(const string & file_name,const string & ns_label) const3250   bool Document::toXml( const string& file_name,
3251 			const string& ns_label ) const {
3252     /// write the Document to a file
3253     /*!
3254       \param file_name the name of the file to create
3255       \param ns_label a namespace label to use. (default "")
3256       \return false on error, true otherwise
3257       automaticly detects .gz and .bz2 filenames and will handle accordingly
3258     */
3259     if ( foliadoc ){
3260       long int res = 0;
3261       if ( TiCC::match_back( file_name, ".bz2" ) ){
3262 	string tmpname = file_name.substr( 0, file_name.length() - 3 ) + "tmp";
3263 	if ( toXml( tmpname, ns_label ) ){
3264 	  bool stat = TiCC::bz2Compress( tmpname, file_name );
3265 	  remove( tmpname.c_str() );
3266 	  if ( !stat ){
3267 	    res = -1;
3268 	  }
3269 	}
3270       }
3271       else {
3272 	xmlDoc *outDoc = to_xmlDoc( ns_label );
3273 	if ( TiCC::match_back( file_name, ".gz" ) ){
3274 	  xmlSetDocCompressMode(outDoc,9);
3275 	}
3276 	res = xmlSaveFormatFileEnc( file_name.c_str(),
3277 				    outDoc,
3278 				    output_encoding, 1 );
3279 	xmlFreeDoc( outDoc );
3280 	_foliaNsOut = 0;
3281       }
3282       if ( res == -1 ){
3283 	return false;
3284       }
3285     }
3286     else {
3287       return false;
3288     }
3289     return true;
3290   }
3291 
Pattern(const vector<string> & pat_vec,const ElementType et,const string & args)3292   Pattern::Pattern( const vector<string>& pat_vec,
3293 		    const ElementType et,
3294 		    const string& args ): matchannotation(et) {
3295     /// create a Pattern structure for searching
3296     /*!
3297       \param pat_vec a list of search terms (may be regular expressions)
3298       \param et The kind of elements to match on
3299       \param args additionale search options as attribute/value pairs
3300     */
3301     regexp = false;
3302     case_sensitive = false;
3303     KWargs kw = getArgs( args );
3304     matchannotationset = kw["matchannotationset"];
3305     if (kw["regexp"] != "" ){
3306       regexp = TiCC::stringTo<bool>( kw["regexp"] );
3307     }
3308     if (kw["maxgapsize"] != "" ){
3309       maxgapsize = TiCC::stringTo<int>( kw["maxgapsize"] );
3310     }
3311     else {
3312       maxgapsize = 10;
3313     }
3314     if ( kw["casesensitive"] != "" ){
3315       case_sensitive = TiCC::stringTo<bool>( kw["casesensitive"] );
3316     }
3317     for ( const auto& pat : pat_vec ){
3318       if ( pat.find( "regexp('" ) == 0 &&
3319 	   pat.rfind( "')" ) == pat.length()-2 ){
3320 	string tmp = pat.substr( 8, pat.length() - 10 );
3321 	UnicodeString us = TiCC::UnicodeFromUTF8( tmp );
3322 	UErrorCode u_stat = U_ZERO_ERROR;
3323 	RegexMatcher *matcher = new RegexMatcher(us, 0, u_stat);
3324 	if ( U_FAILURE(u_stat) ){
3325 	  throw runtime_error( "failed to create a regexp matcher with '" + tmp + "'" );
3326 	}
3327 	matchers.push_back( matcher );
3328 	sequence.push_back( "" );
3329       }
3330       else {
3331 	sequence.push_back( TiCC::UnicodeFromUTF8(pat) );
3332 	matchers.push_back( 0 );
3333 	if ( !case_sensitive ){
3334 	  sequence.back().toLower();
3335 	}
3336       }
3337     }
3338   }
3339 
Pattern(const std::vector<std::string> & pat_vec,const std::string & args)3340   Pattern::Pattern( const std::vector<std::string>& pat_vec,
3341 		    const std::string& args ) : matchannotation(BASE) {
3342     /// create a Pattern structure for searching
3343     /*!
3344       \param pat_vec a list if search terms (may be regular expressions)
3345       \param args additionale search options as attribute/value pairs
3346     */
3347     regexp = false;
3348     case_sensitive = false;
3349     KWargs kw = getArgs( args );
3350     matchannotationset = kw["matchannotationset"];
3351     if (kw["regexp"] != "" ){
3352       regexp = TiCC::stringTo<bool>( kw["regexp"] );
3353     }
3354     if (kw["maxgapsize"] != "" ){
3355       maxgapsize = TiCC::stringTo<int>( kw["maxgapsize"] );
3356     }
3357     else {
3358       maxgapsize = 10;
3359     }
3360     if ( kw["casesensitive"] != "" ){
3361       case_sensitive = TiCC::stringTo<bool>( kw["casesensitive"] );
3362     }
3363     for ( const auto& pat : pat_vec ){
3364       if ( pat.find( "regexp('" ) == 0 &&
3365 	   pat.rfind( "')" ) == pat.length()-2 ){
3366 	string tmp = pat.substr( 8, pat.length() - 10 );
3367 	UnicodeString us = TiCC::UnicodeFromUTF8( tmp );
3368 	UErrorCode u_stat = U_ZERO_ERROR;
3369 	RegexMatcher *matcher = new RegexMatcher(us, 0, u_stat);
3370 	if ( U_FAILURE(u_stat) ){
3371 	  throw runtime_error( "failed to create a regexp matcher with '" + tmp + "'" );
3372 	}
3373 	matchers.push_back( matcher );
3374 	sequence.push_back( "" );
3375       }
3376       else {
3377 	sequence.push_back( TiCC::UnicodeFromUTF8(pat) );
3378 	matchers.push_back( 0 );
3379 	if ( !case_sensitive ){
3380 	  sequence.back().toLower();
3381 	}
3382       }
3383     }
3384   }
3385 
~Pattern()3386   Pattern::~Pattern(){
3387     /// destroy a Pattern
3388     for ( const auto& m : matchers ){
3389       delete m;
3390     }
3391   }
3392 
operator <<(ostream & os,const Pattern & p)3393   inline ostream& operator<<( ostream& os, const Pattern& p ){
3394     /// debugging only: output the sequence part of a Pattern
3395     using TiCC::operator <<;
3396     os << "pattern: " << p.sequence;
3397     return os;
3398   }
3399 
  bool Pattern::match( const UnicodeString& us,
		       size_t& pos,
		       int& gap,
		       bool& done,
		       bool& flag ) const {
    /// try to match the input string to one term of this pattern
    /*!
      \param us A UnicodeString to match
      \param pos in/out: the index of the term (text or regex matcher) to
      try. It is updated to the index to use for the next input string.
      \param gap in/out: a counter that is only touched when the term is "*".
      It is incremented there and compared to maxgapsize.
      \param done out: set when a term matched: true when \e pos moved past
      the last term. The "*" term sets it to true in the cases marked below.
      Not modified on a mismatch.
      \param flag out: only modified when the term is "*", see below.
      \return true on a successful match

      \e pos is used as an index without a range check.
    */
    UnicodeString s = us;
    //  cerr << "gap = " << gap << "cursor=" << pos << " compare '" <<  sequence[pos] << "' with '" << us << "'" << endl;
    if ( matchers[pos] ){
      // a regular expression term: it is applied to the unmodified input,
      // case_sensitive is not consulted here
      matchers[pos]->reset( s );
      UErrorCode u_stat = U_ZERO_ERROR;
      // NOTE(review): u_stat is not inspected after the call
      if ( matchers[pos]->matches( u_stat ) ){
	done = ( ++pos >= sequence.size() );
	return true;
      }
      else {
	++pos;
	return false;
      }
    }
    else {
      // a text term: compare (lowercased when not case sensitive)
      if ( !case_sensitive ){
	s.toLower();
      }
      if ( sequence[pos] == s || sequence[pos] == "*:1" ){
	// an equal text, or "*:1" which accepts any single input
	done = ( ++pos >= sequence.size() );
	return true;
      }
      else if ( sequence[pos] == "*" ){
	// the variable wildcard: every input is accepted (always returns true)
	if ( (pos + 1 ) >= sequence.size() ){
	  // the wildcard is the last term
	  done = true;
	}
	else if ( sequence[pos+1] == s ){
	  // the input equals the TEXT of the next term. For a regex term the
	  // stored text is empty; its matcher is not consulted here.
	  //	cerr << "    but next matched!" << endl;
	  flag = ( ++gap < maxgapsize );
	  if ( !flag ){
	    // the maximum gap is reached
	    pos = pos + gap;
	    done = ( ++pos >= sequence.size() );
	  }
	  else {
	    // pos stays at the wildcard, which can accept further input
	    done = true;
	  }
	}
	else if ( ++gap == maxgapsize ){
	  // the maximum gap is reached: continue with the next term
	  ++pos;
	}
	else {
	  // pos stays at the wildcard
	  flag = true;
	}
	return true;
      }
      else {
	++pos;
	return false;
      }
    }
  }
3465 
variablesize() const3466   bool Pattern::variablesize() const {
3467     /// look if at least one sequence in the Pattern is "*"
3468     return any_of( sequence.begin(),
3469 		   sequence.end(),
3470 		   []( const UnicodeString& s ) { return s == "*"; } );
3471   }
3472 
unsetwild()3473   void Pattern::unsetwild() {
3474     /// replace all sequence in the Pattern with value "*" by "*:1"
3475     replace_if( sequence.begin(),
3476 		sequence.end(),
3477 		[]( const UnicodeString& s ) { return s == "*"; },
3478 		"*:1"
3479 		);
3480   }
3481 
variablewildcards() const3482   set<int> Pattern::variablewildcards() const {
3483     /// build an index of all "*" sequences
3484     set<int> result;
3485     for ( size_t i=0; i < sequence.size(); ++i ){
3486       if ( sequence[i] == "*" ){
3487 	result.insert( i );
3488       }
3489     }
3490     return result;
3491   }
3492 
findwords(const Pattern & pat,const string & args) const3493   vector<vector<Word*> > Document::findwords( const Pattern& pat,
3494 					      const string& args ) const {
3495     /// search the Document for vector of Word list matching the Pattern
3496     /*!
3497       \param pat The search Pattern
3498       \param args additional search options as attribute/value pairs
3499       \return a vector of Word list that matched. (if any)
3500       supported additional arguments can be 'leftcontext' and 'rightcontext'
3501     */
3502     size_t leftcontext = 0;
3503     size_t rightcontext = 0;
3504     KWargs kw = getArgs( args );
3505     string val = kw["leftcontext"];
3506     if ( !val.empty() ){
3507       leftcontext = TiCC::stringTo<size_t>(val);
3508     }
3509     val = kw["rightcontext"];
3510     if ( !val.empty() ){
3511       rightcontext = TiCC::stringTo<size_t>(val);
3512     }
3513     vector<vector<Word*> > result;
3514     vector<Word*> matched;
3515     if ( pat.regexp ){
3516       throw runtime_error( "regexp not supported yet in patterns" );
3517     }
3518     vector<Word*> mywords = words();
3519     for ( size_t startpos =0; startpos < mywords.size(); ++startpos ){
3520       // loop over all words
3521       //    cerr << "outer loop STARTPOS = " << startpos << endl;
3522       size_t cursor = 0;
3523       int gap = 0;
3524       bool goon = true;
3525       for ( size_t i = startpos; i < mywords.size() && goon ; ++i ){
3526 	//      cerr << "inner LOOP I = " << i << " myword=" << mywords[i] << endl;
3527 	UnicodeString value;
3528 	if ( pat.matchannotation == BASE ){
3529 	  value = mywords[i]->text();
3530 	}
3531 	else {
3532 	  vector<FoliaElement *> v = mywords[i]->select( pat.matchannotation );
3533 	  if ( v.size() != 1 ){
3534 	    continue;
3535 	  }
3536 	  value = TiCC::UnicodeFromUTF8(v[0]->cls());
3537 	}
3538 	bool done = false;
3539 	bool flag = false;
3540 	if ( pat.match( value, cursor, gap, done, flag ) ){
3541 	  // cerr << "matched, " << (done?"done":"not done")
3542 	  //      << (flag?" Flagged!":":{") << endl;
3543 	  matched.push_back(mywords[i]);
3544 	  if ( cursor == 0 ){
3545 	    startpos = i; // restart search here
3546 	  }
3547 	  if ( done ){
3548 	    vector<Word*> keep = matched;
3549 	    //	  cerr << "findnodes() tussenresultaat ==> " << matched << endl;
3550 	    vector<Word*> tmp1;
3551 	    if ( leftcontext > 0 ){
3552 	      tmp1 = matched[0]->leftcontext(leftcontext);
3553 	      //	    cerr << "findnodes() tmp1 ==> " << tmp1 << endl;
3554 	      copy( matched.begin(), matched.end(), back_inserter(tmp1) );
3555 	      //	    cerr << "findnodes() tmp1 na copy ==> " << tmp1 << endl;
3556 	    }
3557 	    else {
3558 	      tmp1 = matched;
3559 	    }
3560 	    vector<Word*> tmp2;
3561 	    if ( rightcontext > 0 ){
3562 	      tmp2 = matched.back()->rightcontext(rightcontext);
3563 	      //	    cerr << "findnodes() tmp2 ==> " << tmp2 << endl;
3564 	      copy( tmp2.begin(), tmp2.end(), back_inserter(tmp1) );
3565 	      //	    cerr << "findnodes() tmp2 na copy ==> " << tmp2 << endl;
3566 	    }
3567 	    result.push_back(tmp1);
3568 	    //	  cerr << "findnodes() tussenresultaat 2 ==> " << tmp1 << endl;
3569 	    if ( flag ){
3570 	      matched = keep;
3571 	    }
3572 	    else {
3573 	      cursor = 0;
3574 	      matched.clear();
3575 	      goon = false;
3576 	    }
3577 	  }
3578 	}
3579 	else {
3580 	  cursor = 0;
3581 	  matched.clear();
3582 	  goon = false;
3583 	}
3584       }
3585     }
3586     //  cerr << "findnodes() result ==> " << result << endl;
3587     return result;
3588   }
3589 
findwords(list<Pattern> & pats,const string & args) const3590   vector<vector<Word*> > Document::findwords( list<Pattern>& pats,
3591 					      const string& args ) const {
3592     /// search the Document for vector of Word list matching one of the Pattern
3593     /*!
3594       \param pats a list of search Patterns
3595       \param args additional search options as attribute/value pairs
3596       \return a vector of Word list that matched. (if any)
3597       supported additional arguments can be 'leftcontext' and 'rightcontext'
3598     */
3599     size_t prevsize = 0;
3600     bool start = true;
3601     bool unsetwildcards = false;
3602     set<int> variablewildcards;
3603     int index = 0;
3604     for ( const auto& it : pats ){
3605       //    cerr << "bekijk patroon : " << *it << endl;
3606       if ( start ){
3607 	prevsize = it.size();
3608 	start = false;
3609       }
3610       else if ( it.size() != prevsize ){
3611 	throw runtime_error( "findnodes(): If multiple patterns are provided, they must all have the same length!" );
3612       }
3613       if ( it.variablesize() ){
3614 	if ( index > 0 && variablewildcards.empty() ){
3615 	  unsetwildcards = true;
3616 	}
3617 	else {
3618 	  if ( !variablewildcards.empty() &&
3619 	       variablewildcards != it.variablewildcards() ){
3620 	    throw runtime_error("If multiple patterns are provided with variable wildcards, then these wildcards must all be in the same positions!");
3621 	  }
3622 	  variablewildcards = it.variablewildcards();
3623 	}
3624       }
3625       else if ( !variablewildcards.empty() ){
3626 	unsetwildcards = true;
3627       }
3628       ++index;
3629     }
3630     if ( unsetwildcards ){
3631       for ( auto& it : pats ){
3632 	it.unsetwild();
3633       }
3634     }
3635     vector<vector<Word*> > result;
3636     for ( const auto& it : pats ){
3637       vector<vector<Word*> > res = findwords( it, args );
3638       if ( result.empty() ){
3639 	result = res;
3640       }
3641       else if ( res != result ){
3642 	result.clear();
3643 	break;
3644       }
3645     }
3646     return result;
3647   }
3648 
3649 } // namespace folia
3650