1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of libfolia 7 8 libfolia is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 libfolia is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ticcutils/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 */ 26 #include <cassert> 27 #include <cstdlib> 28 #include <iostream> 29 #include <fstream> 30 #include <string> 31 #include <algorithm> 32 #include <vector> 33 #include <map> 34 #include <stdexcept> 35 #include "config.h" 36 #include "ticcutils/PrettyPrint.h" 37 #include "ticcutils/XMLtools.h" 38 #include "ticcutils/StringOps.h" 39 #include "ticcutils/Unicode.h" 40 #include "ticcutils/zipper.h" 41 #include "libfolia/folia.h" 42 #include "libfolia/folia_properties.h" 43 #include "libxml/xmlstring.h" 44 45 using namespace std; 46 using namespace icu; 47 48 /// the default output encoding, in fact the only one we allow 49 const char *output_encoding = "UTF-8"; 50 51 namespace folia { 52 using TiCC::operator<<; 53 operator <<(ostream & os,const Document::at_t & at)54 ostream& operator<<( ostream& os, const Document::at_t& at ){ 55 /// output an at_t structure (Debugging only) 56 /*! 57 \param os the output stream 58 \param at the at_t object 59 */ 60 os << "<" << at._annotator << "," << TiCC::toString(at._ann_type) 61 << "," << at._date << "," << at._processors << ">"; 62 return os; 63 } 64 Document()65 Document::Document(){ 66 /// create and initalize a FoLiA Document. 67 init(); 68 } 69 init_args(const KWargs & kwargs)70 void Document::init_args( const KWargs& kwargs ){ 71 /// init some Document properties from a key-value list 72 /*! 73 \param kwargs a list of key-value pairs 74 75 this function initializes a Document and can set the attributes 76 \e 'debug' and \e 'mode' 77 78 When the attributes \e 'file' or \e 'string' are found, the value is used 79 to extract a complete FoLiA document from that file or string. 80 */ 81 init(); 82 KWargs args = kwargs; 83 string value = args.extract( "debug" ); 84 if ( !value.empty() ){ 85 debug = TiCC::stringTo<int>( value ); 86 } 87 value = args.extract( "mode" ); 88 if ( !value.empty() ){ 89 setmode( value ); 90 } 91 value = args.extract( "file" ); 92 if ( !value.empty() ){ 93 // extract a Document from a file 94 read_from_file( value ); 95 } 96 else { 97 value = args.extract( "string" ); 98 if ( !value.empty() ){ 99 // extract a Document from a string 100 read_from_string( value ); 101 } 102 } 103 if ( !foliadoc ){ 104 // so NO 'file' or 'string' argument. 105 // (read_from_file/read_from_string create a foliadoc OR throw ) 106 if ( args.find( "version" ) == args.end() ){ 107 // no version attribute. set it to the current default 108 args["version"] = folia_version(); 109 } 110 // create an 'empty' document using the args, with a FoLiA root node. 111 foliadoc = new FoLiA( args, this ); 112 } 113 } 114 Document(const KWargs & kwargs)115 Document::Document( const KWargs& kwargs ) { 116 /// initialize a Document using an attribute-value list 117 /*! 118 \param kwargs an attribute-value list 119 */ 120 init_args( kwargs ); 121 } 122 Document(const string & s)123 Document::Document( const string& s ) { 124 /// initialize a Document using a string (filename or attribute-value list) 125 /*! 126 \param s a string representing a filename OR an attribute value list 127 128 the string \e s can be a string encoded attribute value list OR just a 129 filename. 130 131 Some examples: 132 133 Document doc("my_first.folia.xml") creates a Document doc from 134 the file \e my_first.folia.xml. 135 136 Document doc( "file='my_first.folia.xml', debug='3', mode='nochecktext'" ) This creates a document from the file \e my_first.folia.xml with a 137 debugging level of 3 and textchecking set to OFF 138 139 Document doc( "xml:id='test'" ) creates a yet empty document with a 140 document ID with value 'test' 141 142 */ 143 KWargs args = getArgs(s); 144 if ( args.empty() ){ 145 args["file"] = s; 146 } 147 init_args( args ); 148 } 149 folia_version()150 string folia_version(){ 151 /// return the FoLiA version of this build 152 stringstream ss; 153 ss << MAJOR_VERSION << "." << MINOR_VERSION << "." << SUB_VERSION; 154 return ss.str(); 155 } 156 doc_version() const157 string Document::doc_version() const { 158 /// return the FoLiA version of this Document 159 stringstream ss; 160 ss << _major_version << "." << _minor_version << "." << _sub_version; 161 return ss.str(); 162 } 163 library_version()164 string library_version(){ 165 /// return the version of the library 166 return VERSION; 167 } 168 update_version()169 string Document::update_version(){ 170 /// override the document version with the version of the build 171 /*! 172 \return the old value of the documents version 173 */ 174 string old = _version_string; 175 _version_string = folia_version(); 176 return old; 177 } 178 init()179 void Document::init(){ 180 /// initialize a Document structure with default values 181 _metadata = 0; 182 _foreign_metadata = 0; 183 _provenance = 0; 184 _xmldoc = 0; 185 foliadoc = 0; 186 _foliaNsIn_href = 0; 187 _foliaNsIn_prefix = 0; 188 _foliaNsOut = 0; 189 debug = 0; 190 mode = Mode( CHECKTEXT|AUTODECLARE ); 191 _external_document = false; 192 _incremental_parse = false; 193 _preserve_spaces = false; 194 _warn_count = 0; 195 _major_version = 0; 196 _minor_version = 0; 197 _sub_version = 0; 198 } 199 ~Document()200 Document::~Document(){ 201 /// Destroy a Document structure including al it's members 202 /*! 203 This also finally deletes FoLiA nodes that were marked for deletion 204 but not yet really destroyed. (because they might still be referenced) 205 */ 206 xmlFreeDoc( _xmldoc ); 207 xmlFree( (xmlChar*)_foliaNsIn_href ); 208 xmlFree( (xmlChar*)_foliaNsIn_prefix ); 209 sindex.clear(); 210 if ( foliadoc ){ 211 foliadoc->destroy(); 212 } 213 set<FoliaElement*> bulk; 214 for ( const auto& it : delSet ){ 215 it->unravel( bulk ); 216 } 217 for ( const auto& it : bulk ){ 218 it->destroy(); 219 } 220 delete _metadata; 221 delete _foreign_metadata; 222 for ( const auto& it : submetadata ){ 223 delete it.second; 224 } 225 delete _provenance; 226 } 227 setmode(const string & ms) const228 void Document::setmode( const string& ms ) const { 229 /// Sets the mode attributes of a document 230 /*! 231 \param ms an encoded string of attribute-values pairs giving modes 232 \note mode is mutable, so this even sets mode on CONST documents! 233 234 The following modes can be set: 235 '(no)permissive' (default is NO), 236 '(no)strip' (default is NO), 237 '(no)canonical' (default is NO), 238 '(no)checktext' (default is checktext), 239 '(no)fixtext' (default is NO), 240 '(no)autodeclare' (default is NO) 241 242 example: 243 244 doc.setmode( "strip,nochecktext,autodeclare" ); 245 */ 246 vector<string> modev = TiCC::split_at( ms, "," ); 247 for ( const auto& mod : modev ){ 248 if ( mod == "permissive" ){ 249 mode = Mode( (int)mode | PERMISSIVE ); 250 } 251 else if ( mod == "nopermissive" ){ 252 mode = Mode( (int)mode & ~PERMISSIVE ); 253 } 254 else if ( mod == "strip" ){ 255 mode = Mode( (int)mode | STRIP ); 256 } 257 else if ( mod == "nostrip" ){ 258 mode = Mode( (int)mode & ~STRIP ); 259 } 260 else if ( mod == "canonical" ){ 261 mode = Mode( (int)mode | CANONICAL ); 262 } 263 else if ( mod == "nocanonical" ){ 264 mode = Mode( (int)mode & ~CANONICAL ); 265 } 266 else if ( mod == "kanon" ){ // backward compatible 267 mode = Mode( (int)mode | CANONICAL ); 268 } 269 else if ( mod == "nokanon" ){ // backward compatible 270 mode = Mode( (int)mode & ~CANONICAL ); 271 } 272 else if ( mod == "checktext" ){ 273 mode = Mode( int(mode) | CHECKTEXT ); 274 } 275 else if ( mod == "nochecktext" ){ 276 mode = Mode( int(mode) & ~CHECKTEXT ); 277 } 278 else if ( mod == "fixtext" ){ 279 mode = Mode( int(mode) | FIXTEXT ); 280 } 281 else if ( mod == "nofixtext" ){ 282 mode = Mode( int(mode) & ~FIXTEXT ); 283 } 284 else if ( mod == "autodeclare" ){ 285 mode = Mode( int(mode) | AUTODECLARE ); 286 } 287 else if ( mod == "noautodeclare" ){ 288 mode = Mode( int(mode) & ~AUTODECLARE ); 289 } 290 else if ( mod == "explicit" ){ 291 mode = Mode( int(mode) | EXPLICIT ); 292 } 293 else if ( mod == "noexplicit" ){ 294 mode = Mode( int(mode) & ~EXPLICIT ); 295 } 296 else { 297 throw invalid_argument( "FoLiA::Document: unsupported mode value: "+ mod ); 298 } 299 } 300 } 301 getmode() const302 string Document::getmode() const { 303 /// returns the curent mode(s) as a string 304 /*! 305 \return a string explaining the modes set 306 307 example: 308 309 doc.getmode() might return: "mode=strip,nohecktext,autodeclare," 310 */ 311 string result = "mode="; 312 if ( mode & PERMISSIVE ){ 313 result += "permissive,"; 314 } 315 if ( mode & STRIP ){ 316 result += "strip,"; 317 } 318 if ( mode & CHECKTEXT ){ 319 result += "checktext,"; 320 } 321 else { 322 result += "nochecktext,"; 323 } 324 if ( mode & FIXTEXT ){ 325 result += "fixtext,"; 326 } 327 if ( mode & CANONICAL ){ 328 result += "canonical,"; 329 } 330 if ( mode & AUTODECLARE ){ 331 result += "autodeclare,"; 332 } 333 else { 334 result += "noautodeclare,"; 335 } 336 if ( mode & EXPLICIT ){ 337 result += "explicit,"; 338 } 339 return result; 340 } 341 set_strip(bool new_val) const342 bool Document::set_strip( bool new_val ) const{ 343 /// sets the 'strip' mode to on/off 344 /*! 345 \param new_val the boolean to use for on/off 346 \return the previous value 347 */ 348 bool old_val = (mode & STRIP); 349 if ( new_val ){ 350 mode = Mode( (int)mode | STRIP ); 351 } 352 else { 353 mode = Mode( (int)mode & ~STRIP ); 354 } 355 return old_val; 356 } 357 set_permissive(bool new_val) const358 bool Document::set_permissive( bool new_val ) const{ 359 /// sets the 'permissive' mode to on/off 360 /*! 361 \param new_val the boolean to use for on/off 362 \return the previous value 363 */ 364 bool old_val = (mode & PERMISSIVE); 365 if ( new_val ){ 366 mode = Mode( (int)mode | PERMISSIVE ); 367 } 368 else { 369 mode = Mode( (int)mode & ~PERMISSIVE ); 370 } 371 return old_val; 372 } 373 set_checktext(bool new_val) const374 bool Document::set_checktext( bool new_val ) const{ 375 /// sets the 'checktext' mode to on/off 376 /*! 377 \param new_val the boolean to use for on/off 378 \return the previous value 379 */ 380 bool old_val = (mode & CHECKTEXT); 381 if ( new_val ){ 382 mode = Mode( (int)mode | CHECKTEXT ); 383 } 384 else { 385 mode = Mode( (int)mode & ~CHECKTEXT ); 386 } 387 return old_val; 388 } 389 390 set_fixtext(bool new_val) const391 bool Document::set_fixtext( bool new_val ) const{ 392 /// sets the 'fixtext' mode to on/off 393 /*! 394 \param new_val the boolean to use for on/off 395 \return the previous value 396 */ 397 bool old_val = (mode & FIXTEXT); 398 if ( new_val ){ 399 mode = Mode( (int)mode | FIXTEXT ); 400 } 401 else { 402 mode = Mode( (int)mode & ~FIXTEXT ); 403 } 404 return old_val; 405 } 406 set_canonical(bool new_val) const407 bool Document::set_canonical( bool new_val ) const{ 408 /// sets the 'canonical' mode to on/off 409 /*! 410 \param new_val the boolean to use for on/off 411 \return the previous value 412 */ 413 bool old_val = (mode & CANONICAL); 414 if ( new_val ){ 415 mode = Mode( (int)mode | CANONICAL ); 416 } 417 else { 418 mode = Mode( (int)mode & ~CANONICAL ); 419 } 420 return old_val; 421 } 422 set_autodeclare(bool new_val) const423 bool Document::set_autodeclare( bool new_val ) const{ 424 /// sets the 'autodeclare' mode to on/off 425 /*! 426 \param new_val the boolean to use for on/off 427 \return the previous value 428 */ 429 bool old_val = (mode & AUTODECLARE); 430 if ( new_val ){ 431 mode = Mode( (int)mode | AUTODECLARE ); 432 } 433 else { 434 mode = Mode( (int)mode & ~AUTODECLARE ); 435 } 436 return old_val; 437 } 438 set_explicit(bool new_val) const439 bool Document::set_explicit( bool new_val ) const{ 440 /// sets the 'explicit' mode to on/off 441 /*! 442 \param new_val the boolean to use for on/off 443 \return the previous value 444 */ 445 bool old_val = (mode & EXPLICIT); 446 if ( new_val ){ 447 mode = Mode( (int)mode | EXPLICIT ); 448 } 449 else { 450 mode = Mode( (int)mode & ~EXPLICIT ); 451 } 452 return old_val; 453 } 454 add_doc_index(FoliaElement * el)455 void Document::add_doc_index( FoliaElement* el ){ 456 /// add a FoliaElement to the index 457 /*! 458 \param el the FoliaElement to add 459 will throw when \em el->id() is already in the index 460 */ 461 const string id = el->id(); 462 if ( id.empty() ) { 463 return; 464 } 465 auto it = sindex.find( id ); 466 if ( it == sindex.end() ){ 467 sindex[id] = el; 468 } 469 else { 470 throw DuplicateIDError( id ); 471 } 472 } 473 del_doc_index(const string & id)474 void Document::del_doc_index( const string& id ){ 475 /// remove an id from the index 476 /*! 477 \param id The id to remove 478 */ 479 if ( sindex.empty() ){ 480 // only when ~Document is in progress 481 return; 482 } 483 if ( id.empty() ) { 484 return; 485 } 486 sindex.erase(id); 487 } 488 annotation_type_to_string(AnnotationType ann) const489 string Document::annotation_type_to_string( AnnotationType ann ) const { 490 /// return the ANNOTATIONTYPE translated to a string in a Document context. 491 /// takes the version into account, for older labels 492 /*! 493 \param ann the annotationtype 494 \return a string representation of \e ann. 495 496 Taking into account the version of the Dcocument, translating to 497 old labels for pre 1.6 versions 498 */ 499 const string& result = toString( ann ); 500 if ( version_below(1,6) ){ 501 const auto& it = reverse_old.find(result); 502 if ( it != reverse_old.end() ){ 503 return it->second; 504 } 505 } 506 return result; 507 } 508 error_sink(void * mydata,xmlError * error)509 static void error_sink(void *mydata, xmlError *error ){ 510 /// helper function for libxml2 to catch and display problems in an 511 /// orderly fashion 512 /*! 513 \param a pointer to a struct to hold persisten data. In our case just an 514 int. 515 \param error an xmlEror structure created by a libxml2 function 516 517 For the first error encountered, a message is sent to stderr. Further 518 errors are just counted. It is up to calling functions to react on a 519 a count > 0 520 */ 521 int *cnt = (int*)mydata; 522 if ( *cnt == 0 ){ 523 string line = "\n"; 524 if ( error->file ){ 525 line += string(error->file) + ":"; 526 if ( error->line > 0 ){ 527 line += TiCC::toString(error->line) + ":"; 528 } 529 } 530 line += " XML-error: " + string(error->message); 531 cerr << line << endl; 532 } 533 (*cnt)++; 534 return; 535 } 536 read_from_file(const string & file_name)537 bool Document::read_from_file( const string& file_name ){ 538 /// read a FoLiA document from a file 539 /*! 540 \param file_name the name of the file 541 \return true on succes. Will throw otherwise. 542 543 This function also takes care of files in .bz2 or .gz format when the 544 right extension is given. 545 */ 546 ifstream is( file_name ); 547 if ( !is.good() ){ 548 throw invalid_argument( "file not found: " + file_name ); 549 } 550 if ( foliadoc ){ 551 throw logic_error( "Document is already initialized" ); 552 } 553 _source_filename = file_name; 554 if ( TiCC::match_back( file_name, ".bz2" ) ){ 555 string buffer = TiCC::bz2ReadFile( file_name ); 556 return read_from_string( buffer ); 557 } 558 int cnt = 0; 559 xmlSetStructuredErrorFunc( &cnt, (xmlStructuredErrorFunc)error_sink ); 560 _xmldoc = xmlReadFile( file_name.c_str(), 561 0, 562 XML_PARSER_OPTIONS ); 563 if ( _xmldoc ){ 564 if ( cnt > 0 ){ 565 throw XmlError( "document is invalid" ); 566 } 567 if ( debug ){ 568 cout << "read a doc from " << file_name << endl; 569 } 570 foliadoc = parseXml(); 571 if ( !validate_offsets() ){ 572 // cannot happen. validate_offsets() throws on error 573 throw InconsistentText("MEH"); 574 } 575 if ( debug ){ 576 if ( foliadoc ){ 577 cout << "successful parsed the doc from: " << file_name << endl; 578 } 579 else { 580 cout << "failed to parse the doc from: " << file_name << endl; 581 } 582 } 583 xmlFreeDoc( _xmldoc ); 584 _xmldoc = 0; 585 return foliadoc != 0; 586 } 587 if ( debug ){ 588 cout << "Failed to read a doc from " << file_name << endl; 589 } 590 throw XmlError( "No valid FoLiA read" ); 591 } 592 read_from_string(const string & buffer)593 bool Document::read_from_string( const string& buffer ){ 594 /// read a FoLiA Document from a string buffer 595 /*! 596 \param buffer A complete FoLiA document in a string buffer 597 \return true on succes. Will throw otherwise. 598 */ 599 if ( foliadoc ){ 600 throw logic_error( "Document is already initialized" ); 601 return false; 602 } 603 int cnt = 0; 604 xmlSetStructuredErrorFunc( &cnt, (xmlStructuredErrorFunc)error_sink ); 605 _xmldoc = xmlReadMemory( buffer.c_str(), buffer.length(), 0, 0, 606 XML_PARSER_OPTIONS ); 607 if ( _xmldoc ){ 608 if ( cnt > 0 ){ 609 throw XmlError( "document is invalid" ); 610 } 611 if ( debug ){ 612 cout << "read a doc from string" << endl; 613 } 614 foliadoc = parseXml(); 615 if ( !validate_offsets() ){ 616 // cannot happen. validate_offsets() throws on error 617 throw InconsistentText("MEH"); 618 } 619 if ( debug ){ 620 if ( foliadoc ){ 621 cout << "successful parsed the doc" << endl; 622 } 623 else{ 624 cout << "failed to parse the doc" << endl; 625 } 626 } 627 xmlFreeDoc( _xmldoc ); 628 _xmldoc = 0; 629 return foliadoc != 0; 630 } 631 if ( debug ){ 632 throw runtime_error( "Failed to read a doc from a string" ); 633 } 634 return false; 635 } 636 operator <<(ostream & os,const Document * d)637 ostream& operator<<( ostream& os, const Document *d ){ 638 /// output a Document to a stream 639 /*! 640 \param os the output stream 641 \param d the document to output 642 */ 643 if ( d ){ 644 os << d->toXml( "" ); 645 // the toXml() string already ends with a newline (i hope....) 646 // but flush the stream 647 os.flush(); 648 } 649 else { 650 os << "MISSING DOCUMENT" << endl; 651 } 652 return os; 653 } 654 save(ostream & os,const string & ns_label,bool canonical) const655 bool Document::save( ostream& os, 656 const string& ns_label, 657 bool canonical ) const { 658 /// save the Document to a stream 659 /*! 660 \param os the output stream 661 \param ns_label the namespace name to use, the default is "" placing all 662 FoLiA nodes in the default namespace. 663 \param canonical determines to output in canonical order. Default is no. 664 */ 665 bool old_k = set_canonical(canonical); 666 os << toXml( ns_label ); 667 // the toXml() string already ends with a newline (i hope....) 668 // but flush the stream 669 os.flush(); 670 set_canonical(old_k); 671 return os.good(); 672 } 673 save(const string & file_name,const string & ns_label,bool canonical) const674 bool Document::save( const string& file_name, 675 const string& ns_label, 676 bool canonical ) const { 677 /// save the Document to a file 678 /*! 679 \param file_name the name of the file to create 680 \param ns_label the namespace name to use, the default is "" placing all 681 FoLiA nodes in the default namespace. 682 \param canonical determines to output in canonical order. Default is no. 683 684 This function also takes care of output to files in .bz2 or .gz format 685 when the right extension is given. 686 */ 687 bool old_k = set_canonical(canonical); 688 bool result = false; 689 try { 690 result = toXml( file_name, ns_label ); 691 } 692 catch ( const exception& e ){ 693 throw runtime_error( "saving to file " + file_name + " failed: " + e.what() ); 694 } 695 set_canonical( old_k ); 696 return result; 697 } 698 xmlstring(bool canonical) const699 string Document::xmlstring( bool canonical ) const { 700 /// dump the Document in a string buffer 701 /*! 702 \param canonical determines to output in canonical order. Default is no. 703 \return the complete document in an unformatted string 704 */ 705 bool old_k = set_canonical(canonical); 706 xmlDoc *outDoc = to_xmlDoc( "" ); 707 set_canonical(old_k); 708 xmlChar *buf; int size; 709 xmlDocDumpFormatMemoryEnc( outDoc, &buf, &size, 710 output_encoding, 0 ); // no formatting 711 string result = string( (const char *)buf, size ); 712 xmlFree( buf ); 713 xmlFreeDoc( outDoc ); 714 _foliaNsOut = 0; 715 return result; 716 } 717 index(const string & id) const718 FoliaElement* Document::index( const string& id ) const { 719 /// search for the element with xml:id id 720 /*! 721 \param id the id we search 722 \return the FoliaElement with this \e id or 0, when not present 723 */ 724 const auto& it = sindex.find( id ); 725 if ( it == sindex.end() ){ 726 return 0; 727 } 728 else { 729 return it->second; 730 } 731 } 732 operator [](const string & id) const733 FoliaElement* Document::operator []( const string& id ) const { 734 /// search for the element with xml:id id 735 /*! 736 \param id the id we search 737 \return the FoliaElement with this \e id or 0, when not present 738 739 example: 740 741 FoliaElement *e = doc["doc.sent.1"]; 742 when Document doc has a node with id="doc.sent.1", \e e refer that node 743 otherwise \e e will be set to 0; 744 */ 745 return index(id); 746 } 747 text(const TextPolicy & tp) const748 UnicodeString Document::text( const TextPolicy& tp ) const { 749 /// return the text content of the whole document, restricted by the 750 /// parameters. 751 /*! 752 \param tp The TextPolicy to use 753 \return the complete text matching the criteria as an UnicodeString 754 */ 755 return foliadoc->text( tp ); 756 } 757 text(const std::string & cls,bool retaintok,bool strict) const758 UnicodeString Document::text( const std::string& cls, 759 bool retaintok, 760 bool strict ) const { 761 /// return the text content of the whole document, restricted by the 762 /// parameters. 763 /*! 764 \param cls The textclass to use fro searching. 765 \param retaintok Should we retain the tokenization. Default NO. 766 \param strict Should we perform a strict search? Default NO. 767 \return the complete text matching the criteria as an UnicodeString 768 */ 769 TEXT_FLAGS flags = TEXT_FLAGS::NONE; 770 if ( retaintok ){ 771 flags = flags | TEXT_FLAGS::RETAIN; 772 } 773 if ( strict ){ 774 flags = flags | TEXT_FLAGS::STRICT; 775 } 776 return foliadoc->text( cls, flags ); 777 } 778 779 static const set<ElementType> quoteSet = { Quote_t }; 780 static const set<ElementType> emptySet; 781 sentences() const782 vector<Sentence*> Document::sentences() const { 783 /// return all Sentences in the Document, except those in Quotes 784 return foliadoc->select<Sentence>( quoteSet ); 785 } 786 sentenceParts() const787 vector<Sentence*> Document::sentenceParts() const { 788 /// return all Sentences in the Document, including those in Quotes 789 vector<Sentence*> sents = foliadoc->select<Sentence>( emptySet ); 790 return sents; 791 } 792 sentences(size_t index) const793 Sentence *Document::sentences( size_t index ) const { 794 /// return the Sentence at position \e index 795 /*! 796 \param index the index to search for 797 \return The Sentence found. 798 will throw when the index is out of range 799 */ 800 vector<Sentence*> v = sentences(); 801 if ( index < v.size() ){ 802 return v[index]; 803 } 804 throw range_error( "sentences() index out of range" ); 805 } 806 rsentences(size_t index) const807 Sentence *Document::rsentences( size_t index ) const { 808 /// return the Sentence at position \e index from the back of the Document 809 /*! 810 \param index the index to search for 811 \return The Sentence found. 812 will throw when the index is out of range 813 */ 814 vector<Sentence*> v = sentences(); 815 if ( index < v.size() ){ 816 return v[v.size()-1-index]; 817 } 818 throw range_error( "rsentences() index out of range" ); 819 } 820 words() const821 vector<Word*> Document::words() const { 822 /// return all the Words in the Document, ignoring those within structure 823 /// annotations 824 /*! 825 \return The Words found. 826 */ 827 return foliadoc->select<Word>( default_ignore_structure ); 828 } 829 words(size_t index) const830 Word *Document::words( size_t index ) const { 831 /// return the Word at position \e index, ignoring those within structure 832 /// annotations 833 /*! 834 \param index the index to search for 835 \return The Word found. 836 will throw when the index is out of range 837 */ 838 vector<Word*> v = words(); 839 if ( index < v.size() ){ 840 return v[index]; 841 } 842 throw range_error( "words() index out of range" ); 843 } 844 rwords(size_t index) const845 Word *Document::rwords( size_t index ) const { 846 /// return the Word at position \e index from the back of the Document, 847 /// ignoring those within structure annotations 848 /*! 849 \param index the index to search for 850 \return The Word found. 851 will throw when the index is out of range 852 */ 853 vector<Word*> v = words(); 854 if ( index < v.size() ){ 855 return v[v.size()-1-index]; 856 } 857 throw range_error( "rwords() index out of range" ); 858 } 859 paragraphs() const860 vector<Paragraph*> Document::paragraphs() const { 861 /// return all Paragraphs in the Document 862 return foliadoc->select<Paragraph>(); 863 } 864 paragraphs(size_t index) const865 Paragraph *Document::paragraphs( size_t index ) const { 866 /// return the Paragraph at position \e index 867 /*! 868 \param index the index to search for 869 \return The Paragraph found. 870 will throw when the index is out of range 871 */ 872 vector<Paragraph*> v = paragraphs(); 873 if ( index < v.size() ){ 874 return v[index]; 875 } 876 throw range_error( "paragraphs() index out of range" ); 877 } 878 rparagraphs(size_t index) const879 Paragraph *Document::rparagraphs( size_t index ) const { 880 /// return the Word at position \e index from the back of the Document 881 /*! 882 \param index the index to search for 883 \return The Paragraph found. 884 will throw when the index is out of range 885 */ 886 vector<Paragraph*> v = paragraphs(); 887 if ( index < v.size() ){ 888 return v[v.size()-1-index]; 889 } 890 throw range_error( "rparagraphs() index out of range" ); 891 } 892 language() const893 string Document::language() const { 894 /// extract the language from the metadata 895 /*! 896 \return the metadata language value or "" when not set 897 */ 898 string result; 899 if ( _metadata ){ 900 result = _metadata->get_val("language"); 901 } 902 return result; 903 } 904 metadata_type() const905 string Document::metadata_type() const { 906 /// returns the metadata type 907 /*! 908 \return the metadata type or "native" when not set 909 */ 910 if ( _metadata ){ 911 return _metadata->type(); 912 } 913 else if ( _foreign_metadata ){ 914 return _foreign_metadata->type(); 915 } 916 return "native"; 917 } 918 metadata_file() const919 string Document::metadata_file() const { 920 /// returns the metadata filename. if any 921 /*! 922 \return the metadata file name. 923 */ 924 if ( _metadata ){ 925 if ( _metadata->datatype() != "ExternalMetaData" ){ 926 return ""; 927 } 928 return _metadata->src(); 929 } 930 return ""; 931 } 932 parse_imdi(const xmlNode * node)933 void Document::parse_imdi( const xmlNode *node ){ 934 /// set IMDI values. DEPRECATED 935 xmlNode *n = TiCC::xPath( node, "//imdi:Session/imdi:Title" ); 936 if ( n ){ 937 _metadata->add_av( "title", TiCC::XmlContent( n ) ); 938 } 939 n = TiCC::xPath( node, "//imdi:Session/imdi:Date" ); 940 if ( n ){ 941 _metadata->add_av( "date", TiCC::XmlContent( n ) ); 942 } 943 n = TiCC::xPath( node, "//imdi:Source/imdi:Access/imdi:Publisher" ); 944 if ( n ){ 945 _metadata->add_av( "publisher", TiCC::XmlContent( n ) ); 946 } 947 n = TiCC::xPath( node, "//imdi:Source/imdi:Access/imdi:Availability" ); 948 if ( n ){ 949 _metadata->add_av( "licence", TiCC::XmlContent( n ) ); 950 } 951 n = TiCC::xPath( node, "//imdi:Languages/imdi:Language/imdi:ID" ); 952 if ( n ){ 953 _metadata->add_av( "language", TiCC::XmlContent( n ) ); 954 } 955 } 956 set_metadata(const string & attribute,const string & value)957 void Document::set_metadata( const string& attribute, const string& value ){ 958 /// add a metadata attribute/value pair to the Document 959 /*! 960 \param attribute the attribute to set 961 \param value the value of the attribute 962 963 Will throw if the current metadata is NOT 'native' 964 965 May create a new NativeMetaData structure. 966 */ 967 if ( !_metadata ){ 968 _metadata = new NativeMetaData( "native" ); 969 } 970 else if ( _metadata->datatype() == "ExternalMetaData" ){ 971 throw MetaDataError( "cannot set meta values on ExternalMetaData" ); 972 } 973 if ( _metadata->type() == "imdi" ){ 974 throw MetaDataError( "cannot set meta values on IMDI MetaData" ); 975 } 976 _metadata->add_av( attribute, value ); 977 } 978 get_metadata(const string & attribute) const979 const string Document::get_metadata( const string& attribute ) const { 980 /// return the metadata value for a metadata attribute 981 /*! 982 \param attribute the attribite to lookup 983 \return the requested metadata value. May return "" if no metadata is 984 available or the attribute is not found. 985 */ 986 if ( _metadata ){ 987 return _metadata->get_val( attribute ); 988 } 989 else { 990 return ""; 991 } 992 } 993 get_default_processor() const994 processor *Document::get_default_processor() const { 995 /// return the default processor for this document 996 /*! 997 \return the main processor in the provenance data. can be 0; 998 */ 999 if ( _provenance ){ 1000 return _provenance->get_top_processor(); 1001 } 1002 else { 1003 return 0; 1004 } 1005 } 1006 get_processor(const string & pid) const1007 processor *Document::get_processor( const string& pid ) const { 1008 /// return the processor with ID=pid 1009 /*! 1010 \param pid the processorID we look for 1011 \return the processor found, or 0 1012 */ 1013 if ( _provenance ){ 1014 return _provenance->get_processor_by_id( pid ); 1015 } 1016 else { 1017 return 0; 1018 } 1019 } 1020 get_processors_by_name(const string & name) const1021 vector<processor*> Document::get_processors_by_name( const string& name ) const { 1022 /// return all the processor with name=name 1023 /*! 1024 \param name the name of the processors we look for 1025 \return al list of matching processors 1026 */ 1027 vector<processor*> result; 1028 if ( _provenance ){ 1029 result = _provenance->get_processors_by_name( name ); 1030 } 1031 return result; 1032 } 1033 add_processor(const KWargs & args,processor * parent)1034 processor *Document::add_processor( const KWargs& args, 1035 processor *parent ){ 1036 /// create new processor and add it to the provenance data 1037 /*! 1038 \param args the argument list for creating the new provessor 1039 \param parent add the new processor as a child to this parent. 1040 When the parent = 0, add to the Documents provenance structure. 1041 1042 May create a new Provenance structure if not yet available. 1043 */ 1044 if ( debug ){ 1045 cerr << "ADD_PROCESSOR: " << args << endl; 1046 } 1047 if ( !parent 1048 && !_provenance ){ 1049 _provenance = new Provenance(this); 1050 } 1051 processor *p = new processor( _provenance, parent, args ); 1052 if ( parent ){ 1053 parent->_processors.push_back( p ); 1054 } 1055 else { 1056 _provenance->processors.push_back( p ); 1057 } 1058 return p; 1059 } 1060 set_foreign_metadata(xmlNode * node)1061 void Document::set_foreign_metadata( xmlNode *node ){ 1062 /// create a ForeigMetaData element from 'node' 1063 /*! 1064 \param node the xml node we are parsing 1065 1066 FoLiA treats foreign metadata by adding a copy of the xml tree under node 1067 to the folia, without further notice. 1068 */ 1069 if ( !_foreign_metadata ){ 1070 _foreign_metadata = new ForeignMetaData( "foreign" ); 1071 } 1072 ForeignData *add = new ForeignData(); 1073 if ( TiCC::Name( node ) != "foreign-data" ){ 1074 // we need an extra layer then 1075 xmlNode *n = TiCC::XmlNewNode( "foreign-data" ); 1076 xmlAddChild( n, xmlCopyNode( node, 1 ) ); 1077 add->set_data( n ); 1078 _foreign_metadata->add_foreign( n ); 1079 xmlFreeNode (n ); 1080 } 1081 else { 1082 add->set_data( node ); 1083 _foreign_metadata->add_foreign( node ); 1084 } 1085 } 1086 save_orig_ann_defaults()1087 void Document::save_orig_ann_defaults(){ 1088 /// make a copy of the _annotationdefaults 1089 /*! 1090 For incremental document creation (using folia::Engine) we need to 1091 'remember' which annotationdefaults there were initially, so before 1092 any new annotations are added with declare(). 1093 1094 But we only need those that would return a default annotation or 1095 default processor. 1096 */ 1097 for ( const auto& it : _annotationdefaults ){ 1098 if ( it.second.size() == 1 ){ 1099 // so 1 set 1100 _orig_ann_default_sets.insert( make_pair(it.first,it.second.begin()->first) ); 1101 auto procs = it.second.begin()->second._processors; 1102 if ( procs.size() == 1 ){ 1103 _orig_ann_default_procs.insert( make_pair(it.first,*procs.begin()) ); 1104 } 1105 } 1106 } 1107 } 1108 parse_annotations(const xmlNode * node)1109 void Document::parse_annotations( const xmlNode *node ){ 1110 /// parse all annotation declarations from the Xml tree given by node 1111 if ( debug ){ 1112 cerr << "parse annotations " << TiCC::Name(node) << endl; 1113 } 1114 xmlNode *n = node->children; 1115 _anno_sort.clear(); 1116 while ( n ){ 1117 string tag = TiCC::Name( n ); 1118 if ( tag.length() > 11 && tag.substr( tag.length() - 11 ) == "-annotation" ){ 1119 string prefix = tag.substr( 0, tag.length() - 11 ); 1120 AnnotationType at_type 1121 = TiCC::stringTo<AnnotationType>( prefix ); 1122 if ( debug ){ 1123 cerr << "parse " << prefix << "-annotation" << endl; 1124 } 1125 KWargs atts = getAttributes( n ); 1126 ElementType et = BASE; 1127 string set_name = atts.extract("set" ); 1128 if ( set_name.empty() ){ 1129 if ( version_below( 1, 6 ) ){ 1130 set_name = "undefined"; // default value 1131 } 1132 else if ( at_type == AnnotationType::TEXT ){ 1133 if ( debug ){ 1134 cerr << "assign default for TEXT: " << DEFAULT_TEXT_SET << endl; 1135 } 1136 set_name = DEFAULT_TEXT_SET; 1137 } 1138 else if ( at_type == AnnotationType::PHON ){ 1139 if ( debug ){ 1140 cerr << "assign default for PHON: " << DEFAULT_PHON_SET << endl; 1141 } 1142 set_name = DEFAULT_PHON_SET; 1143 } 1144 else { 1145 auto et_it = annotationtype_elementtype_map.find( at_type ); 1146 if ( et_it == annotationtype_elementtype_map.end() ){ 1147 throw logic_error( "no matching element_type for annotation_type: " 1148 + prefix ); 1149 } 1150 et = et_it->second; 1151 properties *prop = element_props[et]; 1152 if ( prop->REQUIRED_ATTRIBS & Attrib::CLASS ) { 1153 throw XmlError( "setname may not be empty for " + prefix 1154 + "-annotation" ); 1155 } 1156 } 1157 } 1158 if ( set_name.empty() ){ 1159 set_name = "None"; 1160 } 1161 string format = atts.extract( "format" ); 1162 string annotator = atts.extract( "annotator" ); 1163 string ann_type = atts.extract( "annotatortype" ); 1164 string datetime = parseDate( atts.extract( "datetime" ) ); 1165 string alias = atts.extract( "alias" ); 1166 string gran_val = atts.extract( "groupannotations" ); 1167 if ( !gran_val.empty() ){ 1168 if ( !isSubClass( et, AbstractSpanAnnotation_t ) ){ 1169 throw XmlError( "attribute 'groupannotations' not allowed for '" 1170 + prefix + "-annotation" ); 1171 } 1172 if ( gran_val == "yes" 1173 || gran_val == "true" ){ 1174 _groupannotations[at_type][set_name] = true; 1175 } 1176 else { 1177 throw XmlError( "invalid value '" + gran_val 1178 + "' for attribute groupannotations" ); 1179 } 1180 } 1181 else { 1182 _groupannotations[at_type][set_name] = false; 1183 } 1184 set<string> processors; 1185 xmlNode *sub = n->children; 1186 while ( sub ){ 1187 string subtag = TiCC::Name( sub ); 1188 if ( debug ){ 1189 cerr << "parse subtag:" << subtag << endl; 1190 } 1191 if ( subtag == "annotator" ){ 1192 KWargs args = getAttributes( sub ); 1193 if ( args["processor"].empty() ){ 1194 throw XmlError( tag + "-annotation: <annotator> misses attribute 'processor'" ); 1195 } 1196 processors.insert( args["processor"] ); 1197 } 1198 sub = sub->next; 1199 } 1200 if ( !annotator.empty() && !processors.empty() ){ 1201 throw XmlError( tag + "-annotation: has both <annotator> node(s) and annotator attribute." ); 1202 } 1203 declare( at_type, set_name, format, annotator, ann_type, datetime, 1204 processors, alias ); 1205 if ( !atts.empty() ){ 1206 1207 throw XmlError( "found invalid attribute(s) in <" + prefix 1208 + "-declaration> " + atts.toString() ); 1209 } 1210 } 1211 n = n->next; 1212 } 1213 if ( debug ){ 1214 cerr << "all group annotations: " << _groupannotations << endl; 1215 cerr << "done with parse_annotation: " << _annotationdefaults << endl; 1216 cerr << "sorting: " << _anno_sort << endl; 1217 } 1218 } 1219 parse_provenance(const xmlNode * node)1220 void Document::parse_provenance( const xmlNode *node ){ 1221 /// parse provenance data from the XmlTree under node 1222 Provenance *result = new Provenance(this); 1223 xmlNode *n = node->children; 1224 while ( n ){ 1225 string tag = TiCC::Name( n ); 1226 if ( tag == "processor" ){ 1227 result->parse_processor(n); 1228 } 1229 n = n->next; 1230 } 1231 _provenance = result; 1232 // cerr << "provenance=" << _provenance << endl; 1233 } 1234 parse_submeta(const xmlNode * node)1235 void Document::parse_submeta( const xmlNode *node ){ 1236 /// parse sub metadata from the XmlTree under node 1237 if ( node ){ 1238 KWargs node_att = getAttributes( node ); 1239 string id = node_att["xml:id"]; 1240 if ( id.empty() ){ 1241 throw MetaDataError( "submetadata without xml:id" ); 1242 } 1243 // cerr << "parse submetadata, id=" << id << endl; 1244 string type = node_att["type"]; 1245 // cerr << "parse submetadata, type=" << type << endl; 1246 if ( type.empty() ){ 1247 type = "native"; 1248 } 1249 string src = node_att["src"]; 1250 if ( !src.empty() ){ 1251 submetadata[id] = new ExternalMetaData( type, src ); 1252 // cerr << "created External metadata, id=" << id << endl; 1253 } 1254 else if ( type == "native" ){ 1255 submetadata[id] = new NativeMetaData( type ); 1256 // cerr << "created Native metadata, id=" << id << endl; 1257 } 1258 else { 1259 submetadata[id] = 0; 1260 // cerr << "set metadata to 0, id=" << id << endl; 1261 } 1262 xmlNode *p = node->children; 1263 while ( p ){ 1264 if ( p->type == XML_ELEMENT_NODE ){ 1265 if ( TiCC::Name(p) == "meta" && 1266 checkNS( p, NSFOLIA ) ){ 1267 if ( type == "native" ){ 1268 string txt = TiCC::XmlContent( p ); 1269 KWargs att = getAttributes( p ); 1270 string sid = att["id"]; 1271 if ( !txt.empty() ){ 1272 submetadata[id]->add_av( sid, txt ); 1273 // cerr << "added node to id=" << id 1274 // << "(" << sid << "," << txt << ")" << endl; 1275 } 1276 } 1277 else { 1278 throw MetaDataError("Encountered a meta element but metadata type is not native!"); 1279 } 1280 } 1281 else if ( TiCC::Name(p) == "foreign-data" && 1282 checkNS( p, NSFOLIA ) ){ 1283 if ( type == "native" ){ 1284 throw MetaDataError("Encountered a foreign-data element but metadata type is native!"); 1285 } 1286 else if ( submetadata[id] == 0 ){ 1287 submetadata[id] = new ForeignMetaData( type ); 1288 // cerr << "add new Foreign " << id << endl; 1289 } 1290 // cerr << "in Foreign " << submetadata[id]->type() << endl; 1291 submetadata[id]->add_foreign( p ); 1292 // cerr << "added a foreign id=" << id << endl; 1293 } 1294 } 1295 p = p->next; 1296 } 1297 } 1298 } 1299 is_number(const string & s)1300 bool is_number( const string& s ){ 1301 /// check that every character in s is a digit 1302 for ( const auto& c : s ){ 1303 if ( !isdigit(c) ){ 1304 return false; 1305 } 1306 } 1307 return true; 1308 } 1309 expand_version_string(const string & vs,int & major,int & minor,int & sub,string & patch)1310 void expand_version_string( const string& vs, 1311 int& major, 1312 int& minor, 1313 int& sub, 1314 string& patch ){ 1315 /// expand a version string vs into ints major, minor and sub 1316 /*! 1317 \param[in] vs A string holding version information 1318 \param[out] major the major version found 1319 \param[out] minor the minor version found 1320 \param[out] sub the sub version found 1321 \param[out] patch the NON-numeric remainder of vs after parsing 1322 1323 examples: 1324 1325 "2.1" ==> major=2, minor=1 1326 1327 "2.0.3-a" ==> major=2, minor=0, sub=3 patch=-a 1328 */ 1329 major = 0; 1330 minor = 0; 1331 sub = 0; 1332 patch.clear(); 1333 vector<string> vec = TiCC::split_at( vs, ".", 3 ); 1334 for ( size_t i=0; i < vec.size(); ++i ){ 1335 if ( i == 0 ){ 1336 int val = 0; 1337 if ( !TiCC::stringTo( vec[i], val ) ){ 1338 throw XmlError( "unable to extract major-version from: " + vs ); 1339 } 1340 major= val; 1341 } 1342 else if ( i == 1 ){ 1343 int val = 0; 1344 if ( !TiCC::stringTo( vec[i], val ) ){ 1345 throw XmlError( "unable to extract minor-version from: " + vs ); 1346 } 1347 minor = val; 1348 } 1349 else if ( i == 2 ){ 1350 if ( is_number( vec[i] ) ){ 1351 TiCC::stringTo( vec[i], sub ); 1352 } 1353 else { 1354 vector<string> v2 = TiCC::split_at( vec[i], "-", 2 ); 1355 if ( v2.size() != 2 ){ 1356 throw XmlError( "invalid sub-version or patch-version in: " + vs ); 1357 } 1358 else { 1359 int val = 0; 1360 if ( !TiCC::stringTo( v2[0], val ) ){ 1361 throw XmlError( "unable to extract sub-version from: " + vs ); 1362 } 1363 sub = val; 1364 patch = "-" + v2[1]; // include the hyphen 1365 } 1366 } 1367 } 1368 } 1369 } 1370 check_version(const string & vers)1371 int check_version( const string& vers ){ 1372 /// check a version given by 'vers' against the current build 1373 /*! 1374 \param vers a version string (like "2.1.5") 1375 \return 0 when major, minor AND sub version are equal, -1 when the version 1376 is lower and 1 when the version is greater then the current build 1377 1378 */ 1379 int maj = 0; 1380 int min = 0; 1381 int sub = 0; 1382 string patch; 1383 expand_version_string( vers, maj, min, sub, patch ); 1384 if ( maj < MAJOR_VERSION ){ 1385 return -1; 1386 } 1387 else if ( maj > MAJOR_VERSION ){ 1388 return 1; 1389 } 1390 else if ( min < MINOR_VERSION ){ 1391 return -1; 1392 } 1393 else if ( min > MINOR_VERSION ){ 1394 return 1; 1395 } 1396 else if ( sub < SUB_VERSION ){ 1397 return -1; 1398 } 1399 else if ( sub > SUB_VERSION ){ 1400 return 1; 1401 } 1402 return 0; 1403 } 1404 compare_to_build_version() const1405 int Document::compare_to_build_version() const { 1406 /// check the version of the document against the build version 1407 /*! 1408 \return 0 when the versions match, -1 when the document version 1409 is lower and 1 when the version is greater then the current build 1410 */ 1411 return check_version( version() ); 1412 } 1413 version_below(int major,int minor) const1414 bool Document::version_below( int major, int minor ) const { 1415 /// check if current document version is strict lower then asked 1416 /*! 1417 \param major the major version we want 1418 \param minor the minor version we want 1419 \return true when the Document's major version is lower than mjor OR 1420 it is equal, but the Document's minor version is lower than minor. 1421 */ 1422 if ( _major_version < major ){ 1423 return true; 1424 } 1425 else if ( _major_version == major ){ 1426 return _minor_version < minor; 1427 } 1428 return false; 1429 } 1430 adjustTextMode()1431 void Document::adjustTextMode(){ 1432 /// set the text checking mode of the Document based on an environment 1433 /// variable and the document version 1434 /*! 1435 When the FOLIA_TEXT_CHECK environment variable is set to YES or NO then 1436 set the CHECKTEXT mode accordingly. 1437 1438 When the document version is below 1.5 we disable CHECKTEXT except when 1439 FIXTEXT is also set. 1440 */ 1441 const char *env = getenv( "FOLIA_TEXT_CHECK" ); 1442 if ( env ){ 1443 string e = env; 1444 delete env; 1445 cerr << "DETECTED FOLIA_TEXT_CHECK environment variable, value ='" 1446 << e << "'"<< endl; 1447 if ( e == "NO" ){ 1448 mode = Mode( int(mode) & ~CHECKTEXT ); 1449 cerr << "FOLIA_TEXT_CHECK disabled" << endl; 1450 } 1451 else if ( e == "YES" ){ 1452 mode = Mode( int(mode) | CHECKTEXT ); 1453 cerr << "FOLIA_TEXT_CHECK enabled" << endl; 1454 } 1455 else { 1456 cerr << "FOLIA_TEXT_CHECK unchanged:" << (checktext()?"YES":"NO") 1457 << endl; 1458 } 1459 } 1460 if ( !( mode & FIXTEXT) && version_below( 1, 5 ) ){ 1461 // don't check text consistency for older documents 1462 mode = Mode( int(mode) & ~CHECKTEXT ); 1463 } 1464 } 1465 setDocumentProps(KWargs & kwargs)1466 void Document::setDocumentProps( KWargs& kwargs ){ 1467 /// set general properties based on an attribute-value list 1468 /*! 1469 \param kwargs the arguments. Normally these are parsed attributes from 1470 \<FoLiA\> node. 1471 Even with an empty kwarg list, at least the version of the document is 1472 set. We use a special value (1.4.987) to signal that is was not 1473 specified. 1474 */ 1475 string value = kwargs.extract( "version" ); 1476 if ( !value.empty() ){ 1477 _version_string = value; 1478 // cerr << "So we found version " << _version_string << endl; 1479 } 1480 else { 1481 // assign a 'random' version, but PRE 1.5 1482 _version_string = "1.4.987"; 1483 // cerr << "NO VERSION version " << _version_string << endl; 1484 } 1485 expand_version_string( _version_string, 1486 _major_version, 1487 _minor_version, 1488 _sub_version, 1489 _patch_version ); 1490 if ( check_version( _version_string ) > 0 ){ 1491 cerr << "WARNING!!! the Document " 1492 << (_source_filename.empty()?"":"'") 1493 << _source_filename 1494 << (_source_filename.empty()?"":"' ") 1495 << "is created for newer FoLiA version than this library (" 1496 << _version_string << " vs " << folia_version() 1497 << ")\n\t Any possible subsequent failures in parsing or processing may probably be attributed to this." << endl 1498 << "\t Please upgrade libfolia!" << endl; 1499 increment_warn_count(); 1500 } 1501 1502 adjustTextMode(); 1503 value = kwargs.extract( "external" ); 1504 if ( !value.empty() ){ 1505 _external_document = TiCC::stringTo<bool>( value ); 1506 } 1507 else { 1508 _external_document = false; 1509 } 1510 bool happy = false; 1511 value = kwargs.extract( "_id" ); // for backward compatibility 1512 if ( value.empty() ){ 1513 value = kwargs.extract( "xml:id" ); 1514 } 1515 if ( !value.empty() ){ 1516 if ( isNCName( value ) ){ 1517 _id = value; 1518 } 1519 else { 1520 throw XmlError( "'" + value + "' is not a valid NCName." ); 1521 } 1522 happy = true; 1523 kwargs["xml:id"] = value; 1524 } 1525 if ( !foliadoc && !happy ){ 1526 throw runtime_error( "No Document ID specified" ); 1527 } 1528 kwargs.erase( "generator" ); // also delete this unused att-val 1529 kwargs.erase( "form" ); //silently discard form attribute (for normal vs explicit form), we should be able to read either fine 1530 } 1531 resolveExternals()1532 void Document::resolveExternals(){ 1533 /// resolve all external references 1534 /*! 1535 external references are stored during parsing in the _externals array 1536 */ 1537 if ( !_externals.empty() ){ 1538 for ( const auto& ext : _externals ){ 1539 ext->resolve_external(); 1540 } 1541 } 1542 } 1543 parse_metadata(const xmlNode * node)1544 void Document::parse_metadata( const xmlNode *node ){ 1545 /// parse metadata information from the XmlTree under node 1546 KWargs atts = getAttributes( node ); 1547 string type = TiCC::lowercase(atts["type"]); 1548 if ( type.empty() ){ 1549 type = "native"; 1550 } 1551 string src = atts["src"]; 1552 if ( !src.empty() ){ 1553 _metadata = new ExternalMetaData( type, src ); 1554 } 1555 else if ( type == "native" || type == "imdi" ){ 1556 _metadata = new NativeMetaData( type ); 1557 } 1558 xmlNode *m = node->children; 1559 xmlNode *a_node = 0; 1560 while ( m ){ 1561 if ( TiCC::Name(m) == "METATRANSCRIPT" ){ 1562 if ( !checkNS( m, NSIMDI ) || type != "imdi" ){ 1563 throw runtime_error( "imdi != imdi " ); 1564 } 1565 if ( debug > 1 ){ 1566 cerr << "found IMDI" << endl; 1567 } 1568 if ( !_foreign_metadata ){ 1569 _foreign_metadata = new ForeignMetaData( "imdi" ); 1570 } 1571 _foreign_metadata->add_foreign( xmlCopyNode(m,1) ); 1572 } 1573 else if ( TiCC::Name( m ) == "annotations" && 1574 checkNS( m, NSFOLIA ) ){ 1575 if ( debug > 1 ){ 1576 cerr << "found annotations" << endl; 1577 } 1578 // defer parsing until AFTER provenance data 1579 a_node = m; 1580 } 1581 else if ( TiCC::Name( m ) == "provenance" && 1582 checkNS( m, NSFOLIA ) ){ 1583 if ( debug > 1 ){ 1584 cerr << "found provenance data" << endl; 1585 } 1586 parse_provenance( m ); 1587 // cerr << _provenance << endl; 1588 } 1589 else if ( TiCC::Name( m ) == "meta" && 1590 checkNS( m, NSFOLIA ) ){ 1591 if ( debug > 1 ){ 1592 cerr << "found meta node:" << getAttributes(m) << endl; 1593 } 1594 if ( !_metadata ){ 1595 if ( type == "external" ){ 1596 throw runtime_error( "cannot add 'meta' nodes to external metadata" ); 1597 1598 } 1599 _metadata = new NativeMetaData( "native" ); 1600 } 1601 KWargs att = getAttributes( m ); 1602 string meta_id = att["id"]; 1603 string val = TiCC::XmlContent( m ); 1604 string get = _metadata->get_val( meta_id ); 1605 if ( !get.empty() ){ 1606 throw runtime_error( "meta tag with id=" + meta_id 1607 + " is defined more then once " ); 1608 } 1609 _metadata->add_av( meta_id, val ); 1610 } 1611 else if ( TiCC::Name(m) == "foreign-data" && 1612 checkNS( m, NSFOLIA ) ){ 1613 FoliaElement *t = AbstractElement::createElement( "foreign-data", this ); 1614 if ( t ){ 1615 t = t->parseXml( m ); 1616 if ( t ){ 1617 if ( !_foreign_metadata ){ 1618 _foreign_metadata = new ForeignMetaData( type ); 1619 } 1620 _foreign_metadata->add_foreign( m ); 1621 } 1622 } 1623 } 1624 else if ( TiCC::Name(m) == "submetadata" && 1625 checkNS( m, NSFOLIA ) ){ 1626 parse_submeta( m ); 1627 } 1628 m = m->next; 1629 } 1630 if ( a_node ){ 1631 // cerr << "parse deferred annotations" << endl; 1632 parse_annotations( a_node ); 1633 } 1634 if ( !_metadata && type == "imdi" ){ 1635 // imdi missing all further info 1636 _metadata = new NativeMetaData( type ); 1637 } 1638 } 1639 addStyle(const string & type,const string & href)1640 void Document::addStyle( const string& type, const string& href ){ 1641 /// add style-sheet information 1642 /*! 1643 \param type Which type of sheet 1644 \param href the external link for this sheet 1645 We assure that only one "text/xsl" style-sheet is present. All 1646 other style-sheets are silently added as is. 1647 */ 1648 if ( type == "text/xsl" ){ 1649 const auto& it = styles.find( type ); 1650 if ( it != styles.end() ){ 1651 throw XmlError( "multiple 'text/xsl' style-sheets defined." ); 1652 } 1653 } 1654 styles.insert( make_pair( type, href ) ); 1655 } 1656 replaceStyle(const string & type,const string & href)1657 void Document::replaceStyle( const string& type, 1658 const string& href ){ 1659 /// replace a style-sheet 1660 /*! 1661 \param type Which type of sheet 1662 \param href the external link for this sheet 1663 1664 \note this is sloppy, as multiple sheets with the same type may exist 1665 (except for 'text/xslt') and we replace the first one only. 1666 */ 1667 const auto& it = styles.find( type ); 1668 if ( it != styles.end() ){ 1669 it->second = href; 1670 } 1671 else { 1672 styles.insert( make_pair( type, href ) ); 1673 } 1674 } 1675 parse_styles()1676 void Document::parse_styles(){ 1677 /// retrieve all style-sheets from the current XmlTree 1678 xmlNode *pnt = _xmldoc->children; 1679 while ( pnt ){ 1680 if ( pnt->type == XML_PI_NODE && TiCC::Name(pnt) == "xml-stylesheet" ){ 1681 string content = TextValue(pnt); 1682 string type; 1683 string href; 1684 vector<string> v = TiCC::split( content ); 1685 if ( v.size() == 2 ){ 1686 vector<string> w = TiCC::split_at( v[0], "=" ); 1687 if ( w.size() == 2 && w[0] == "type" ){ 1688 type = w[1].substr(1,w[1].length()-2); 1689 } 1690 w = TiCC::split_at( v[1], "=" ); 1691 if ( w.size() == 2 && w[0] == "href" ){ 1692 href = w[1].substr(1,w[1].length()-2); 1693 } 1694 } 1695 if ( !type.empty() && !href.empty() ){ 1696 addStyle( type, href ); 1697 } 1698 else { 1699 throw XmlError( "problem parsing line: " + content ); 1700 } 1701 } 1702 pnt = pnt->next; 1703 } 1704 } 1705 fixupNs(xmlNode * p,xmlNs * ns)1706 void fixupNs( xmlNode *p, xmlNs *ns ){ 1707 /// make sure that all XmlNodes in the tree p get namespace ns 1708 /*! 1709 \param p an XmlTree (fragment) 1710 \param ns the Namespace value to set 1711 This function is used when a Document uses PERMISSIVE mode 1712 */ 1713 while ( p ){ 1714 xmlSetNs( p, ns ); 1715 fixupNs( p->children, ns ); 1716 p = p->next; 1717 } 1718 } 1719 validate_offsets() const1720 bool Document::validate_offsets() const { 1721 /// Validate all the offset values as found in all \<t\> and \<ph\> nodes 1722 /*! 1723 During Document parsing, \<t\> and \<ph\> nodes are stored in a buffer 1724 until the whole parsing is done. 1725 1726 Then we are able to examine those nodes in their context and check the 1727 offsets used. 1728 */ 1729 set<TextContent*> t_done; 1730 for ( const auto& txt : t_offset_validation_buffer ){ 1731 if ( t_done.find( txt ) != t_done.end() ){ 1732 continue; 1733 } 1734 t_done.insert(txt); 1735 int offset = txt->offset(); 1736 if ( offset != -1 ){ 1737 try { 1738 txt->get_reference(); 1739 } 1740 catch( const UnresolvableTextContent& e ){ 1741 string msg = "Text for " + txt->parent()->xmltag() + "(ID=" 1742 + txt->parent()->id() + ", textclass='" + txt->cls() 1743 + "'), has incorrect offset " + TiCC::toString(offset); 1744 1745 1746 string ref = txt->ref(); 1747 if ( !ref.empty() ){ 1748 msg += " or invalid reference:" + ref; 1749 } 1750 msg += "\n\toriginal msg="; 1751 msg += e.what(); 1752 1753 bool warn = false; 1754 try { 1755 txt->get_reference(false); //trim_spaces = false 1756 msg += "\nHowever, according to the older rules (<v2.4.1) the offsets are accepted. So we are treating this as a warning rather than an error. We do recommend fixing this if this is a document you intend to publish."; 1757 warn = true; 1758 } catch ( const UnresolvableTextContent& ) { 1759 msg += "\n(also checked against older rules prior to FoLiA v2.4.1)"; 1760 } 1761 1762 if ( warn ){ 1763 increment_warn_count(); 1764 cerr << "WARNING: " << msg << endl; 1765 } 1766 else { 1767 throw UnresolvableTextContent( msg ); 1768 } 1769 } 1770 } 1771 } 1772 set<PhonContent*> p_done; 1773 for ( const auto& phon : p_offset_validation_buffer ){ 1774 if ( p_done.find( phon ) != p_done.end() ){ 1775 continue; 1776 } 1777 p_done.insert(phon); 1778 int offset = phon->offset(); 1779 if ( offset != -1 ){ 1780 try { 1781 phon->get_reference(); 1782 } 1783 catch( const UnresolvableTextContent& e ){ 1784 string msg = "Phoneme for " + phon->parent()->xmltag() + ", ID=" 1785 + phon->parent()->id() + ", textclass='" + phon->cls() 1786 + "', has incorrect offset " + TiCC::toString(offset); 1787 1788 1789 string ref = phon->ref(); 1790 if ( !ref.empty() ){ 1791 msg += " or invalid reference:" + ref; 1792 } 1793 msg += "\n\toriginal msg="; 1794 msg += e.what(); 1795 1796 bool warn = false; 1797 try { 1798 phon->get_reference(false); //trim_spaces = false 1799 msg += "\nHowever, according to the older rules (<v2.4.1) the offsets are accepted. So we are treating this as a warning rather than an error. We do recommend fixing this if this is a document you intend to publish."; 1800 warn = true; 1801 } catch ( const UnresolvableTextContent& ) { 1802 msg += "\n(also checked against older rules prior to FoLiA v2.4.1)"; 1803 } 1804 1805 if (warn){ 1806 increment_warn_count(); 1807 cerr << "WARNING: " << msg << endl; 1808 } 1809 else { 1810 throw UnresolvableTextContent( msg ); 1811 } 1812 } 1813 } 1814 } 1815 return true; 1816 } 1817 parseXml()1818 FoliaElement* Document::parseXml( ){ 1819 /// parse a complete FoLiA tree from the XmlTree we have got in _xmldoc 1820 parse_styles(); 1821 xmlNode *root = xmlDocGetRootElement( _xmldoc ); 1822 if ( root->ns ){ 1823 if ( root->ns->prefix ){ 1824 _foliaNsIn_prefix = xmlStrdup( root->ns->prefix ); 1825 } 1826 _foliaNsIn_href = xmlStrdup( root->ns->href ); 1827 } 1828 if ( debug > 2 ){ 1829 string dum; 1830 cerr << "root = " << TiCC::Name( root ) << endl; 1831 cerr << "in namespace " << TiCC::getNS( root, dum ) << endl; 1832 cerr << "namespace list" << getNS_definitions( root ) << endl; 1833 } 1834 FoliaElement *result = 0; 1835 if ( root ){ 1836 if ( TiCC::Name( root ) == "FoLiA" ){ 1837 string ns = TiCC::getNS( root ); 1838 if ( ns.empty() ){ 1839 if ( permissive() ){ 1840 _foliaNsIn_href = xmlCharStrdup( NSFOLIA.c_str() ); 1841 _foliaNsIn_prefix = 0; 1842 xmlNs *defNs = xmlNewNs( root, 1843 _foliaNsIn_href, _foliaNsIn_prefix ); 1844 fixupNs( root, defNs ); 1845 } 1846 else { 1847 throw XmlError( "Folia Document should have namespace declaration " 1848 + NSFOLIA + " but none found " ); 1849 } 1850 } 1851 else if ( ns != NSFOLIA ){ 1852 throw XmlError( "Folia Document should have namespace declaration " 1853 + NSFOLIA + " but found: " + ns ); 1854 } 1855 try { 1856 FoLiA *folia = new FoLiA( this ); 1857 result = folia->parseXml( root ); 1858 resolveExternals(); 1859 } 1860 catch ( const InconsistentText& e ){ 1861 throw; 1862 } 1863 catch ( const XmlError& e ){ 1864 throw; 1865 } 1866 catch ( const exception& e ){ 1867 throw XmlError( e.what() ); 1868 } 1869 } 1870 else if ( TiCC::Name( root ) == "DCOI" && 1871 checkNS( root, NSDCOI ) ){ 1872 throw XmlError( "DCOI format not supported" ); 1873 } 1874 else { 1875 throw XmlError( "root node must be FoLiA" ); 1876 } 1877 } 1878 return result; 1879 } 1880 auto_declare(AnnotationType type,const string & _setname)1881 void Document::auto_declare( AnnotationType type, 1882 const string& _setname ) { 1883 /// create a default declaration for the given AnnotationType 1884 /*! 1885 \param type which default do we want to add 1886 \param _setname which setname to add 1887 If _setname is empty, that is used, except for TEXT and PHON, which 1888 have a default setname which is assigned 1889 */ 1890 string setname = _setname; 1891 if ( setname.empty() ) { 1892 if ( type == AnnotationType::TEXT ){ 1893 setname = DEFAULT_TEXT_SET; 1894 } 1895 else if ( type == AnnotationType::PHON ){ 1896 setname = DEFAULT_PHON_SET; 1897 } 1898 } 1899 if ( setname.empty() ){ 1900 declare( type, "" ); 1901 } 1902 else { 1903 declare( type, setname ); 1904 } 1905 } 1906 declare(AnnotationType type,const string & setname,const string & args)1907 void Document::declare( AnnotationType type, 1908 const string& setname, 1909 const string& args ){ 1910 /// Add an annotation declaration 1911 /*! 1912 \param type The AnnotationType for which to add a setname 1913 \param setname The Set name to add 1914 \param args a string representation of an attribute-value list with 1915 additional parameters 1916 */ 1917 KWargs kwargs = getArgs( args ); 1918 return declare( type, setname, kwargs ); 1919 } 1920 declare(AnnotationType type,const string & setname,const KWargs & _args)1921 void Document::declare( AnnotationType type, 1922 const string& setname, 1923 const KWargs& _args ){ 1924 /// Add an annotation declaration 1925 /*! 1926 \param type The AnnotationType for which to add a setname 1927 \param setname The Set name to add 1928 \param _args an attribute-value list with additional parameters 1929 */ 1930 KWargs args = _args; 1931 if ( debug ){ 1932 cerr << "declare( " << folia::toString(type) << "," << setname << ", [" 1933 << args << "] )" << endl; 1934 } 1935 string st = setname; 1936 if ( st.empty() ){ 1937 if ( version_below( 1, 6 ) ){ 1938 st = "undefined"; 1939 } 1940 else { 1941 string prefix = folia::toString(type); 1942 auto et_it = annotationtype_elementtype_map.find( type ); 1943 if ( et_it == annotationtype_elementtype_map.end() ){ 1944 throw logic_error( "no matching element_type for annotation_type: " 1945 + prefix ); 1946 } 1947 auto et = et_it->second; 1948 properties *prop = element_props[et]; 1949 if ( prop->REQUIRED_ATTRIBS & Attrib::CLASS ) { 1950 throw XmlError( "setname may not be empty for " + prefix 1951 + "-annotation" ); 1952 } 1953 } 1954 if ( st.empty() ){ 1955 st = "None"; 1956 } 1957 } 1958 set<string> processors; 1959 string a = args["annotator"]; 1960 string t = args["annotatortype"]; 1961 string f = args["format"]; 1962 string d = args["datetime"]; 1963 string alias = args["alias"]; 1964 string processor = args["processor"]; 1965 if ( !processor.empty() ){ 1966 processors.insert( processor ); 1967 } 1968 args.erase("annotator"); 1969 args.erase("annotatortype"); 1970 args.erase("format"); 1971 args.erase("datetime"); 1972 args.erase("alias"); 1973 args.erase("processor"); 1974 if ( args.size() != 0 ){ 1975 throw XmlError( "declaration: expected 'annotator', 'annotatortype', 'processor', 'alias' or 'datetime', got '" + args.begin()->first + "'" ); 1976 } 1977 declare( type, st, f, a, t, d, processors, alias ); 1978 } 1979 unalias(AnnotationType type,const string & alias) const1980 string Document::unalias( AnnotationType type, 1981 const string& alias ) const { 1982 /// resolve an alias for a setname to the full setname 1983 /*! 1984 \param type the AnnotationType 1985 \param alias the alias to resolve 1986 \return the setname belonging to alias for this type, or alias if not 1987 found 1988 */ 1989 const auto& ti = _alias_set.find(type); 1990 if ( ti != _alias_set.end() ){ 1991 const auto& sti = ti->second.find( alias ); 1992 if ( sti != ti->second.end() ){ 1993 return sti->second; 1994 } 1995 } 1996 return alias; 1997 } 1998 alias(AnnotationType type,const string & setname) const1999 string Document::alias( AnnotationType type, 2000 const string& setname ) const { 2001 /// give the alias for a setname 2002 /*! 2003 \param type the AnnotationType 2004 \param setname the alias to resolve 2005 \return the alias belonging setname for this type, or setname if 2006 not found 2007 */ 2008 const auto& ti = _set_alias.find(type); 2009 if ( ti != _set_alias.end() ){ 2010 const auto& ali = ti->second.find( setname ); 2011 if ( ali != ti->second.end() ){ 2012 return ali->second; 2013 } 2014 } 2015 return setname; 2016 } 2017 declare(AnnotationType type,const string & setname,const string & format,const string & annotator,const string & annotator_type,const string & date_time,const set<string> & _processors,const string & _alias)2018 void Document::declare( AnnotationType type, 2019 const string& setname, 2020 const string& format, 2021 const string& annotator, 2022 const string& annotator_type, 2023 const string& date_time, 2024 const set<string>& _processors, 2025 const string& _alias ){ 2026 /// Add an annotation declaration 2027 /*! 2028 \param type The AnnotationType for which to add a setname 2029 \param setname The Set name to add 2030 \param format the format to add 2031 \param annotator the name of the annotator 2032 \param annotator_type the type of annotator 2033 \param date_time the date and time to set. The value "now()" will set it 2034 to the current time. 2035 \param _processors a set of processor id's to relate to this declaration 2036 \param _alias an alias value for the setname 2037 */ 2038 if ( debug ){ 2039 cerr << "declare( " << folia::toString(type) << "," << setname 2040 << ", format=" << format << "," << annotator << "," 2041 << annotator_type << "," << date_time << "," << _alias << "," 2042 << _processors << ") " << endl; 2043 } 2044 AnnotatorType ant = UNDEFINED; 2045 try { 2046 ant = TiCC::stringTo<AnnotatorType>( annotator_type ); 2047 } 2048 catch (...) { 2049 throw XmlError( "declare(): illegal value '" 2050 + annotator_type + "' for annotator type" ); 2051 } 2052 if ( !_alias.empty() ){ 2053 string set_ali = alias(type,setname); 2054 if ( !set_ali.empty() ){ 2055 if ( set_ali != setname 2056 && set_ali != _alias ){ 2057 throw XmlError( "setname: '" + setname + "' already has an alias: '" 2058 + set_ali ); 2059 } 2060 } 2061 string ali_ali = alias(type,_alias); 2062 string ali_set = unalias(type,_alias); 2063 if ( ali_ali != _alias ){ 2064 throw XmlError( "alias: '" + _alias + 2065 "' is also in use as a setname for set:'" 2066 + ali_set + "'" ); 2067 } 2068 if ( ali_set != _alias 2069 && ali_set != setname ){ 2070 throw XmlError( "alias: '" + _alias + "' already used for setname: '" 2071 + ali_set + "'" ); 2072 } 2073 } 2074 if ( !declared( type, setname, annotator, ant, _processors ) ){ 2075 set<string> procs = _processors; 2076 if ( !unalias(type,setname).empty() 2077 && unalias(type,setname) != setname ){ 2078 throw XmlError( "setname: '" + setname 2079 + "' is also in use as an alias" ); 2080 } 2081 string d = date_time; 2082 if ( d == "now()" ){ 2083 d = get_ISO_date(); 2084 } 2085 if ( procs.empty() ){ 2086 // old style 2087 _annotationdefaults[type].insert( make_pair( setname, 2088 at_t(annotator,ant,d,format,procs) ) ); 2089 } 2090 else { 2091 // new style 2092 auto set_pos = _annotationdefaults[type].find(setname); 2093 if ( set_pos == _annotationdefaults[type].end() ){ 2094 // no processor annotations yet 2095 _annotationdefaults[type].insert( make_pair( setname, 2096 at_t(annotator,ant,d,format,procs) ) ); 2097 2098 } 2099 else { 2100 // add to the existing 2101 for ( const auto& p : procs ){ 2102 set_pos->second._processors.insert( p ); 2103 } 2104 } 2105 } 2106 if ( debug ){ 2107 cerr << "ADD to sort: " << folia::toString(type) << " (" 2108 << setname << ")" << endl; 2109 } 2110 _anno_sort.push_back(make_pair(type,setname)); 2111 _annotationrefs[type][setname] = 0; 2112 if ( !_alias.empty() ){ 2113 _alias_set[type][_alias] = setname; 2114 _set_alias[type][setname] = _alias; 2115 } 2116 else { 2117 _alias_set[type][setname] = setname; 2118 _set_alias[type][setname] = setname; 2119 } 2120 } 2121 } 2122 un_declare(AnnotationType type,const string & set_name)2123 void Document::un_declare( AnnotationType type, 2124 const string& set_name ){ 2125 /// remove a declaration for an AnnotationType/setname pair 2126 /*! 2127 \param type the AnnotationType 2128 \param set_name the setname. May be empty ("") 2129 2130 When \em set_name is "", ALL declarations of \em type are deleted 2131 */ 2132 string setname = unalias(type,set_name); 2133 if ( debug ){ 2134 cerr << "undeclare: " << folia::toString(type) << "(" << set_name << "." 2135 << setname << ")" << endl; 2136 } 2137 if ( _annotationrefs[type][setname] != 0 ){ 2138 throw XmlError( "unable to undeclare " + toString(type) + "-type(" 2139 + setname + ") (references remain)" ); 2140 } 2141 auto const adt = _annotationdefaults.find(type); 2142 if ( adt != _annotationdefaults.end() ){ 2143 if ( debug ){ 2144 cerr << "matched type=" << folia::toString(type) << endl; 2145 } 2146 auto it = adt->second.begin(); 2147 while ( it != adt->second.end() ){ 2148 if ( debug ){ 2149 cerr << "zoek set:" << setname << endl; 2150 } 2151 if ( setname.empty() || it->first == setname ){ 2152 if ( debug ){ 2153 cerr << "erase:" << setname << "==" << it->first << endl; 2154 } 2155 it = adt->second.erase(it); 2156 } 2157 else { 2158 ++it; 2159 } 2160 } 2161 if ( debug ){ 2162 cerr << "ANNO-SORT: IN " << _anno_sort << endl; 2163 } 2164 auto it2 = _anno_sort.begin(); 2165 while ( it2 != _anno_sort.end() ){ 2166 if ( debug ){ 2167 cerr << "zoek set:" << setname << endl; 2168 } 2169 if ( it2->first == type 2170 && ( setname.empty() || it2->second == setname ) ){ 2171 if ( debug ){ 2172 cerr << "_annosort:erase:" << setname << "==" << it->first << endl; 2173 } 2174 it2 = _anno_sort.erase( it2 ); 2175 } 2176 else { 2177 ++it2; 2178 } 2179 } 2180 if ( debug ){ 2181 cerr << "ANNO-SORT: UIT " << _anno_sort << endl; 2182 } 2183 auto it3 = _alias_set[type].begin(); 2184 while ( it3 != _alias_set[type].end() ){ 2185 if ( it3->first == setname || it3->second == setname ){ 2186 it3 = _alias_set[type].erase( it3 ); 2187 } 2188 else { 2189 ++it3; 2190 } 2191 } 2192 auto it4 = _set_alias[type].begin(); 2193 while ( it4 != _set_alias[type].end() ){ 2194 if ( it4->first == setname || it4->second == setname ){ 2195 it4 = _set_alias[type].erase( it4 ); 2196 } 2197 else { 2198 ++it4; 2199 } 2200 } 2201 if ( adt->second.empty() ){ 2202 _annotationdefaults.erase(adt); 2203 } 2204 } 2205 } 2206 unused_declarations() const2207 multimap<AnnotationType, string> Document::unused_declarations( ) const { 2208 /// search for declarations not referencec in the Document 2209 /*! 2210 \return a list of all AnntotationType/setname pairs that are not used 2211 */ 2212 multimap<AnnotationType,string> result; 2213 for ( const auto& tit : _annotationrefs ){ 2214 for ( const auto& mit : tit.second ){ 2215 if ( mit.second == 0 ){ 2216 result.insert( make_pair(tit.first, mit.first ) ); 2217 } 2218 } 2219 } 2220 return result; 2221 } 2222 setTextRoot(const KWargs & args)2223 Text* Document::setTextRoot( const KWargs& args ) { 2224 /// create a Text element as root for the document 2225 /*! 2226 \param args extra attribute-value pairs as attributes to use 2227 \return the created Text node 2228 */ 2229 Text *t = new Text( args ); 2230 foliadoc->append( t ); 2231 return t; 2232 } 2233 setTextRoot()2234 Text* Document::setTextRoot() { 2235 /// create a Text element as root for the document 2236 KWargs empty; 2237 return setTextRoot( empty ); 2238 } 2239 setSpeechRoot(const KWargs & args)2240 Speech* Document::setSpeechRoot( const KWargs& args ) { 2241 /// create a Speech element as root for the document 2242 /*! 2243 \param args extra attribute-value pairs as attributes to use 2244 \return the created Speech node 2245 */ 2246 Speech *s = new Speech( args ); 2247 foliadoc->append( s ); 2248 return s; 2249 } 2250 setSpeechRoot()2251 Speech* Document::setSpeechRoot() { 2252 /// create a Speech element as root for the document 2253 KWargs empty; 2254 return setSpeechRoot( empty ); 2255 } 2256 getRoot()2257 FoliaElement *Document::getRoot(){ 2258 /// return the root element, if any 2259 if ( foliadoc && foliadoc->size() > 0 ){ 2260 return foliadoc->index(0); 2261 } 2262 else { 2263 return 0; 2264 } 2265 } 2266 append(FoliaElement * t)2267 FoliaElement* Document::append( FoliaElement *t ){ 2268 /// append a root element tot the Document 2269 /*! 2270 \param t a root element to add 2271 \return the added root (also t). Throws on error. 2272 2273 This function will check if a root is already there. 2274 Is only accepts Speech or Text nodes as root. 2275 */ 2276 2277 FoliaElement *root = getRoot(); 2278 if ( root ){ 2279 throw XmlError( "cannot append a root element to a Document. Already there." ); 2280 } 2281 if ( t->element_id() == Text_t 2282 || t->element_id() == Speech_t ) { 2283 foliadoc->append( t ); 2284 return t; 2285 } 2286 throw XmlError( "Only can append 'text' or 'speech' as root of a Document." ); 2287 } 2288 declared(const AnnotationType & type,const string & set_name,const string & annotator,const AnnotatorType & annotator_type,const string & processor) const2289 bool Document::declared( const AnnotationType& type, 2290 const string& set_name, 2291 const string& annotator, 2292 const AnnotatorType& annotator_type, 2293 const string& processor ) const { 2294 /// check if a given combination of AnnotationType, setname, annotators etc. 2295 /// is declared 2296 /*! 2297 \param type the AnnotationType 2298 \param set_name a setname OR an alias (may be empty) 2299 \param annotator the annotator to check (may be empty) 2300 \param annotator_type the annotator_type to check (may be UNDEFINED) 2301 \param processor the processor to match (may be empty) 2302 \return true when all values match. 2303 2304 For the type NO_ANN, the result is always true. 2305 2306 If set_name is empty ("") a match is found when a declarion for \e type 2307 exists 2308 2309 Otherwise, all values are checked for a match 2310 */ 2311 if ( debug ){ 2312 cerr << "isdeclared? ( " << folia::toString(type) << "," << set_name << "," 2313 << annotator << "," << toString(annotator_type) << "," << processor 2314 << ") " << endl; 2315 } 2316 // 2317 // We DO NOT check the date. if all parameters match, it is OK 2318 // 2319 if ( type == AnnotationType::NO_ANN ){ 2320 if ( debug ){ 2321 cerr << "\t\t TRUE want NO_ANN" << endl; 2322 } 2323 return true; 2324 } 2325 if ( !processor.empty() 2326 && !get_processor( processor ) ){ 2327 throw XmlError( folia::toString(type) 2328 + "-annotation is referring an undefined processor '" 2329 + processor + "'" ); 2330 } 2331 string setname = unalias(type,set_name); 2332 const auto& it1 = _annotationdefaults.find(type); 2333 if ( it1 != _annotationdefaults.end() ){ 2334 if ( debug ){ 2335 cerr << "OK, found an entry for type: " << folia::toString(type) << endl; 2336 } 2337 if ( setname.empty() ){ 2338 // 'wildcard' for setname 2339 return true; 2340 } 2341 auto mit2 = it1->second.lower_bound(setname); 2342 while ( mit2 != it1->second.upper_bound(setname) ){ 2343 if ( debug ){ 2344 cerr << "OK, found an entry for set='" << setname << "'" << endl; 2345 cerr << "content: " << mit2->second << endl; 2346 } 2347 if ( mit2->second._annotator == annotator 2348 && mit2->second._ann_type == annotator_type 2349 && ( (mit2->second._processors.empty() && processor.empty() ) 2350 || ( mit2->second._processors.find(processor) 2351 != mit2->second._processors.end() ) ) ){ 2352 if ( debug ){ 2353 cerr << "\t\t declared ==> TRUE" << endl; 2354 } 2355 return true; 2356 } 2357 ++mit2; 2358 } 2359 } 2360 if ( debug ){ 2361 cerr << "\t\t declared() ==> FALSE" << endl; 2362 } 2363 return false; 2364 } 2365 declared(const AnnotationType & type,const string & set_name,const string & annotator,const AnnotatorType & annotator_type,const set<string> & processors) const2366 bool Document::declared( const AnnotationType& type, 2367 const string& set_name, 2368 const string& annotator, 2369 const AnnotatorType& annotator_type, 2370 const set<string>& processors ) const { 2371 /// check if a given combination of AnnotationType, setname, annotators etc. 2372 /// is declared 2373 /*! 2374 \param type the AnnotationType 2375 \param set_name a setname OR an alias (may be empty) 2376 \param annotator the annotator to check (may be empty) 2377 \param annotator_type the annotator_type to check (may be UNDEFINED) 2378 \param processors a list of processors to match (may be empty) 2379 \return true when all values match. 2380 2381 For the type NO_ANN, the result is always true. 2382 2383 If set_name is empty ("") a match is found when a declarion for \e type 2384 exists 2385 2386 Otherwise, all values are checked for a match for at least 1 of the 2387 processors. 2388 */ 2389 if ( processors.empty() ){ 2390 return declared( type, set_name, annotator, annotator_type, "" ); 2391 } 2392 else { 2393 for ( const auto& s : processors ){ 2394 if ( declared( type, set_name, annotator, annotator_type, s ) ){ 2395 return true; 2396 } 2397 } 2398 return false; 2399 } 2400 } 2401 incrRef(AnnotationType type,const string & s)2402 void Document::incrRef( AnnotationType type, 2403 const string& s ){ 2404 /// increment the reference count for the AnnotationType/set combination 2405 /*! 2406 \param type the AnnotationType 2407 \param s the setname 2408 */ 2409 if ( type != AnnotationType::NO_ANN ){ 2410 string st = s; 2411 if ( st.empty() ){ 2412 st = default_set(type); 2413 } 2414 ++_annotationrefs[type][st]; 2415 // cerr << "increment " << toString(type) << "(" << st << ") to: " 2416 // << _annotationrefs[type][s] << endl; 2417 } 2418 } 2419 decrRef(AnnotationType type,const string & s)2420 void Document::decrRef( AnnotationType type, 2421 const string& s ){ 2422 /// decrement the reference count for the AnnotationType/set combination 2423 /*! 2424 \param type the AnnotationType 2425 \param s the setname 2426 */ 2427 if ( type != AnnotationType::NO_ANN 2428 && _annotationrefs[type][s] > 0 ){ 2429 --_annotationrefs[type][s]; 2430 // cerr << "decrement " << toString(type) << "(" << s << ") to: " 2431 // << _annotationrefs[type][s] << endl; 2432 } 2433 } 2434 declared(const AnnotationType & type,const string & set_name) const2435 bool Document::declared( const AnnotationType& type, 2436 const string& set_name ) const { 2437 /// check if a given combination of AnnotationType and setname 2438 /// is declared 2439 /*! 2440 \param type the AnnotationType 2441 \param set_name a setname OR an alias (may be empty) 2442 \return true when there is a match 2443 2444 For the type NO_ANN, the result is always true. 2445 2446 If set_name is empty ("") a match is found when a declarion for \e type 2447 exists 2448 2449 */ 2450 if ( debug ){ 2451 cerr << "declared(" << folia::toString(type) << ",'" 2452 << set_name << "')" << endl; 2453 } 2454 if ( type == AnnotationType::NO_ANN ){ 2455 if ( debug ){ 2456 cerr << "always true for NO_ANN" << endl; 2457 } 2458 return true; 2459 } 2460 if ( debug ){ 2461 cerr << "Doorzoek: " << _annotationdefaults << endl; 2462 } 2463 const auto& mit1 = _annotationdefaults.find(type); 2464 if ( mit1 != _annotationdefaults.end() ){ 2465 if ( debug ){ 2466 cerr << "found some: " << mit1->second << endl; 2467 } 2468 if ( set_name.empty() ){ 2469 // 'wildcard' for setname 2470 if ( debug ){ 2471 cerr << "return TRUE" << endl; 2472 } 2473 return true; 2474 } 2475 string s_name = unalias(type,set_name); 2476 if ( debug ){ 2477 cerr << "lookup: " << set_name << " (" << s_name << ")" << endl; 2478 } 2479 const auto& mit2 = mit1->second.find(s_name); 2480 if ( debug ){ 2481 if ( mit2 != mit1->second.end() ){ 2482 cerr << "return TRUE" << endl; 2483 } 2484 else { 2485 cerr << "return FALSE" << endl; 2486 } 2487 } 2488 return mit2 != mit1->second.end(); 2489 } 2490 if ( debug ){ 2491 cerr << "return DIRECTLY FALSE" << endl; 2492 } 2493 return false; 2494 } 2495 declared(ElementType et,const string & set_name) const2496 bool Document::declared( ElementType et, 2497 const string& set_name ) const { 2498 /// check if the AnnotationType belonging to the ElementType and setname 2499 /// is declared 2500 /*! 2501 \param et the ElementType 2502 \param set_name a setname OR an alias (may be empty) 2503 \return true when there is a match 2504 2505 For the type NO_ANN, the result is always true. 2506 2507 If set_name is empty ("") a match is found when a declarion for \em type 2508 exists 2509 */ 2510 AnnotationType at = element_annotation_map[et]; 2511 return declared( at, set_name ); 2512 } 2513 default_set(AnnotationType type) const2514 string Document::default_set( AnnotationType type ) const { 2515 /// return the default setname for the type. If any. 2516 /*! 2517 \param type the AnnotationType 2518 \return the setname. May be empty ("") when there is none defined OR it 2519 is ambiguous. 2520 */ 2521 if ( type == AnnotationType::NO_ANN ){ 2522 return ""; 2523 } 2524 // search a set. it must be unique. Otherwise return "" 2525 if ( debug ){ 2526 cerr << "\nzoek voor '" << toString(type) << "' de default set in:\n" 2527 << _annotationdefaults << endl; 2528 } 2529 string result; 2530 const auto& mit1 = _annotationdefaults.find(type); 2531 if ( mit1 != _annotationdefaults.end() ){ 2532 if ( debug ){ 2533 cerr << "vind tussen " << mit1->second << endl; 2534 } 2535 if ( mit1->second.size() == 1 ){ 2536 // so it is unique 2537 result = mit1->second.begin()->first; 2538 } 2539 } 2540 if ( debug ){ 2541 cerr << "default_set ==> " << result << endl; 2542 } 2543 return result; 2544 } 2545 default_annotator(AnnotationType type,const string & setname) const2546 string Document::default_annotator( AnnotationType type, 2547 const string& setname ) const { 2548 /// return the default annotator for the type/setname combination. 2549 /*! 2550 \param type the AnnotationType 2551 \param setname the annotation set. An empty string ("") means ANY set. 2552 \return the annotator. May be empty ("") when there is none defined OR it 2553 is ambiguous. 2554 */ 2555 if ( type == AnnotationType::NO_ANN ){ 2556 return ""; 2557 } 2558 const auto& mit1 = _annotationdefaults.find(type); 2559 string result; 2560 if ( mit1 != _annotationdefaults.end() ){ 2561 // cerr << "vind tussen " << mit1->second << endl; 2562 if ( setname.empty() ){ 2563 // 'wildcard' search 2564 if ( mit1->second.size() == 1 ){ 2565 // so it is unique 2566 result = mit1->second.begin()->second._annotator; 2567 return result; 2568 } 2569 } 2570 else { 2571 if ( mit1->second.count( setname ) == 1 ){ 2572 // so it is unique 2573 const auto& mit2 = mit1->second.find( setname ); 2574 result = mit2->second._annotator; 2575 } 2576 } 2577 } 2578 // cerr << "get default ==> " << result << endl; 2579 return result; 2580 } 2581 default_annotatortype(AnnotationType type,const string & setname) const2582 AnnotatorType Document::default_annotatortype( AnnotationType type, 2583 const string& setname ) const { 2584 /// return the default annotator type for the type/setname combination. 2585 /*! 2586 \param type the AnnotationType 2587 \param setname the AnnotationType. An empty string ("") means ANY set. 2588 \return the annotator. May be empty ("") when there is none defined OR it 2589 is ambiguous. 2590 */ 2591 if ( debug ){ 2592 cerr << "annotationdefaults= " << _annotationdefaults << endl; 2593 cerr << "lookup: " << folia::toString(type) << endl; 2594 } 2595 AnnotatorType result = UNDEFINED; 2596 if ( type == AnnotationType::NO_ANN ){ 2597 return result; 2598 } 2599 const auto& mit1 = _annotationdefaults.find(type); 2600 if ( mit1 != _annotationdefaults.end() ){ 2601 if ( debug ){ 2602 cerr << "found a hit for type=" << folia::toString( type ) << endl; 2603 } 2604 if ( setname.empty() ){ 2605 // 'wildcard' search 2606 if ( mit1->second.size() == 1 ){ 2607 // so it is unique 2608 result = mit1->second.begin()->second._ann_type; 2609 } 2610 return result; 2611 } 2612 else { 2613 if ( mit1->second.count( setname ) == 1 ){ 2614 // so it is unique 2615 const auto& mit2 = mit1->second.find( setname ); 2616 result = mit2->second._ann_type; 2617 } 2618 } 2619 } 2620 // cerr << "get default ==> " << result << endl; 2621 return result; 2622 } 2623 default_datetime(AnnotationType type,const string & setname) const2624 string Document::default_datetime( AnnotationType type, 2625 const string& setname ) const { 2626 /// return the default datetime value for the type/setname combination. 2627 /*! 2628 \param type the AnnotationType 2629 \param setname the annotation set. An empty string ("") means ANY set. 2630 \return the datetime value. May be empty ("") when there is none defined 2631 OR it is ambiguous. 2632 */ 2633 const auto& mit1 = _annotationdefaults.find(type); 2634 string result; 2635 if ( mit1 != _annotationdefaults.end() ){ 2636 if ( setname.empty() ){ 2637 // 'wildcard' search 2638 if ( mit1->second.size() == 1 ){ 2639 // so it is unique 2640 result = mit1->second.begin()->second._date; 2641 } 2642 } 2643 else { 2644 if ( mit1->second.count( setname ) == 1 ){ 2645 // so it is unique 2646 const auto& mit2 = mit1->second.find( setname ); 2647 result = mit2->second._date; 2648 } 2649 } 2650 } 2651 // cerr << "get default ==> " << result << endl; 2652 return result; 2653 } 2654 default_processor(AnnotationType type,const string & setname) const2655 string Document::default_processor( AnnotationType type, 2656 const string& setname ) const{ 2657 /// return the default processor type for the type/setname combination. 2658 /*! 2659 \param type the AnnotationType 2660 \param setname the annotation set. An empty string ("") means ANY set. 2661 \return the processor. May be empty ("") when there is none defined OR it 2662 is ambiguous. 2663 */ 2664 if ( debug ){ 2665 cerr << "defaultprocessor(" << toString( type ) << "," 2666 << setname << ")" << endl; 2667 } 2668 auto const& it = _annotationdefaults.find(type); 2669 if ( it != _annotationdefaults.end() ){ 2670 if ( debug ){ 2671 cerr << "found some defs: " << it->second << endl; 2672 cerr << "NOW search for set: " << setname << endl; 2673 } 2674 if ( setname.empty() ){ 2675 // 'wildcard' search 2676 if ( it->second.size() == 1 2677 && it->second.begin()->second._processors.size() == 1 ){ 2678 // so it is unique for setname AND for the number of processors 2679 return *it->second.begin()->second._processors.begin(); 2680 } 2681 else { 2682 return ""; 2683 } 2684 } 2685 set<string> results; 2686 auto s_it = it->second.lower_bound(setname); 2687 while ( s_it != it->second.upper_bound(setname) ){ 2688 if ( debug ){ 2689 cerr << "found sub strings: " << s_it->second << endl; 2690 } 2691 results.insert( s_it->second._processors.begin(), 2692 s_it->second._processors.end() ); 2693 ++s_it; 2694 } 2695 if ( results.size() == 1 ){ 2696 // so we found exactly 1 processor 2697 return *results.begin(); 2698 } 2699 else if ( results.size() > 1 ){ 2700 auto const& as = annotationtype_xml_map.find(type); 2701 if ( as != annotationtype_xml_map.end() ){ 2702 throw NoDefaultError("No processor specified for <" 2703 + as->second + ">, but the presence of multiple declarations prevent assigning a default"); 2704 } 2705 } 2706 } 2707 return ""; 2708 } 2709 original_default_set(AnnotationType type) const2710 string Document::original_default_set( AnnotationType type ) const { 2711 /// return the default setname for the type in the ORIGINAL definitions. 2712 /*! 2713 \param type the AnnotationType 2714 \return the setname. May be empty ("") when there is none defined OR it 2715 is ambiguous. 2716 2717 In case of \e incremental Document building, we are allowed to add 2718 annotation declarations at any moment. That might render the default_set 2719 of an AnnotationType undefined. With this function, we still are able to 2720 find the original value and use that e.g. on output. 2721 */ 2722 auto const& it = _orig_ann_default_sets.find(type); 2723 if ( it == _orig_ann_default_sets.end() ){ 2724 return ""; 2725 } 2726 else { 2727 return it->second; 2728 } 2729 } 2730 original_default_processor(AnnotationType type) const2731 string Document::original_default_processor( AnnotationType type ) const { 2732 /// return the default processor name for the type in the ORIGINAL definitions. 2733 /*! 2734 \param type the AnnotationType 2735 \return the processor name. May be empty ("") when there is none defined 2736 OR it is ambiguous. 2737 2738 In case of \e incremental Document building, we are allowed to add 2739 annotation declarations at any moment. That might render the default 2740 processor of an AnnotationType undefined. With this function, we still 2741 are able to find the original value and use that e.g. on output. 2742 */ 2743 auto const& it = _orig_ann_default_procs.find(type); 2744 if ( it == _orig_ann_default_procs.end() ){ 2745 return ""; 2746 } 2747 else { 2748 return it->second; 2749 } 2750 } 2751 get_annotators(AnnotationType type,const string & setname) const2752 vector<string> Document::get_annotators( AnnotationType type, 2753 const string& setname ) const { 2754 /// return all the annotators for the type/setname combination. 2755 /*! 2756 \param type the AnnotationType 2757 \param setname the annotation set. An empty string ("") means ANY set. 2758 \return a list of annotators. 2759 */ 2760 vector<string> result; 2761 if ( type == AnnotationType::NO_ANN ){ 2762 return result; 2763 } 2764 const auto& mit1 = _annotationdefaults.find(type); 2765 if ( mit1 != _annotationdefaults.end() ){ 2766 // cerr << "vond iets voor " << toString(type) << endl; 2767 for ( auto pos = mit1->second.lower_bound(setname); 2768 pos != mit1->second.upper_bound(setname); 2769 ++pos ){ 2770 copy( pos->second._processors.begin(), 2771 pos->second._processors.end(), 2772 back_inserter(result) ); 2773 } 2774 } 2775 // cerr << "get default ==> " << result << endl; 2776 return result; 2777 2778 } 2779 get_processors(AnnotationType type,const string & setname) const2780 vector<const processor*> Document::get_processors( AnnotationType type, 2781 const string& setname ) const { 2782 /// return all the processors for the type/setname combination. 2783 /*! 2784 \param type the AnnotationType 2785 \param setname the annotation set. An empty string ("") means ANY set. 2786 \return a list of processors. 2787 */ 2788 vector<const processor*> result; 2789 if ( debug ){ 2790 cerr << "getprocessors(" << toString( type ) << "," 2791 << setname << ")" << endl; 2792 } 2793 if ( type == AnnotationType::NO_ANN ){ 2794 return result; 2795 } 2796 auto const& it = _annotationdefaults.find(type); 2797 if ( it != _annotationdefaults.end() ){ 2798 if ( debug ){ 2799 cerr << "found some defs: " << it->second << endl; 2800 } 2801 for ( auto pos = it->second.lower_bound(setname); 2802 pos != it->second.upper_bound(setname); 2803 ++pos ){ 2804 transform( pos->second._processors.begin(), 2805 pos->second._processors.end(), 2806 back_inserter(result), 2807 [&]( const string& p ){ return get_processor(p); } ); 2808 } 2809 } 2810 return result; 2811 } 2812 add_one_anno(const pair<AnnotationType,string> & pair,xmlNode * node,set<string> & done) const2813 void Document::add_one_anno( const pair<AnnotationType,string>& pair, 2814 xmlNode *node, 2815 set<string>& done ) const { 2816 /// create an annotation declaration entry under the xmlNode node 2817 /*! 2818 \param pair an AnnotationType/setname pair 2819 \param node the node we want to add to 2820 \param done a set of "labels" to keep track of already handled cases 2821 2822 */ 2823 AnnotationType type = pair.first; 2824 string sett = pair.second; 2825 string label = annotation_type_to_string( type ); 2826 if ( done.find(label+sett) != done.end() ){ 2827 return; 2828 } 2829 done.insert(label+sett); 2830 label += "-annotation"; 2831 const auto& mm = _annotationdefaults.find(type); 2832 auto it = mm->second.lower_bound(sett); 2833 while ( it != mm->second.upper_bound(sett) ){ 2834 string s = it->second._annotator; 2835 if ( !s.empty() ){ 2836 // old style 2837 KWargs args; 2838 args["annotator"] = s; 2839 AnnotatorType ant = it->second._ann_type; 2840 if ( ant != UNDEFINED && ant != AUTO ){ 2841 args["annotatortype"] = toString(ant); 2842 } 2843 if ( !strip() ){ 2844 s = it->second._date; 2845 if ( !s.empty() ){ 2846 args["datetime"] = s; 2847 } 2848 } 2849 s = it->second._format; 2850 if ( !s.empty() ){ 2851 args["format"] = s; 2852 } 2853 s = it->first; 2854 if ( s == "None" ){ // "empty" set 2855 // skip 2856 } 2857 else if ( s != "undefined" ){ // the default 2858 args["set"] = s; 2859 } 2860 auto const& t_it = _groupannotations.find(type); 2861 if ( t_it != _groupannotations.end() ){ 2862 auto const& s_it = t_it->second.find(s); 2863 if ( s_it != t_it->second.end() 2864 && s_it->second ){ 2865 args["groupannotations"] = "yes"; 2866 } 2867 } 2868 2869 const auto& ti = _set_alias.find(type); 2870 if ( ti != _set_alias.end() ){ 2871 const auto& alias = ti->second.find(s); 2872 if ( alias->second != s ){ 2873 args["alias"] = alias->second; 2874 } 2875 } 2876 xmlNode *n = TiCC::XmlNewNode( foliaNs(), label ); 2877 addAttributes( n, args ); 2878 xmlAddChild( node, n ); 2879 } 2880 else { 2881 // we have new style processors 2882 KWargs args; 2883 if ( !strip() ){ 2884 s = it->second._date; 2885 if ( !s.empty() ){ 2886 args["datetime"] = s; 2887 } 2888 } 2889 s = it->second._format; 2890 if ( !s.empty() ){ 2891 args["format"] = s; 2892 } 2893 s = it->first; 2894 if ( s == "None" ){ // "empty" set 2895 // skip 2896 } 2897 else if ( s != "undefined" ){ // the default 2898 args["set"] = s; 2899 } 2900 const auto& ti = _set_alias.find(type); 2901 if ( ti != _set_alias.end() ){ 2902 const auto& alias = ti->second.find(s); 2903 if ( alias->second != s ){ 2904 args["alias"] = alias->second; 2905 } 2906 } 2907 auto const& t_it = _groupannotations.find(type); 2908 if ( t_it != _groupannotations.end() ){ 2909 auto const& s_it = t_it->second.find(s); 2910 if ( s_it != t_it->second.end() 2911 && s_it->second ){ 2912 args["groupannotations"] = "yes"; 2913 } 2914 } 2915 xmlNode *n = TiCC::XmlNewNode( foliaNs(), label ); 2916 addAttributes( n, args ); 2917 xmlAddChild( node, n ); 2918 args.clear(); 2919 for ( const auto& p : it->second._processors ){ 2920 xmlNode *a = TiCC::XmlNewNode( foliaNs(), "annotator" ); 2921 args["processor"] = p; 2922 addAttributes( a, args ); 2923 xmlAddChild( n, a ); 2924 } 2925 } 2926 ++it; 2927 } 2928 } 2929 add_annotations(xmlNode * metadata) const2930 void Document::add_annotations( xmlNode *metadata ) const { 2931 /// create an annotations block under the xmlNode metadata 2932 /*! 2933 \param metadata the parent to add to 2934 calls add_one_anno() for every annotation declaration. 2935 */ 2936 if ( debug ){ 2937 cerr << "start add_annotations: " << _annotationdefaults << endl; 2938 cerr << "sorting: " << _anno_sort << endl; 2939 } 2940 xmlNode *node = xmlAddChild( metadata, 2941 TiCC::XmlNewNode( foliaNs(), 2942 "annotations" ) ); 2943 set<string> done; 2944 if ( canonical() ){ 2945 multimap<AnnotationType, 2946 pair<AnnotationType,string>> ordered; 2947 for ( const auto& pair : _anno_sort ){ 2948 ordered.insert(make_pair(pair.first,pair)); 2949 } 2950 for ( const auto& it : ordered ){ 2951 add_one_anno( it.second, node, done ); 2952 } 2953 } 2954 else { 2955 for ( const auto& pair : _anno_sort ){ 2956 add_one_anno( pair, node, done ); 2957 } 2958 } 2959 } 2960 append_processor(xmlNode * node,const processor * p) const2961 void Document::append_processor( xmlNode *node, const processor *p ) const { 2962 /// add a processor xml structure to the parent 'node' 2963 /*! 2964 \param node the xml node to add to 2965 \param p the processor of which to add te info 2966 */ 2967 xmlNode *pr = xmlAddChild( node, TiCC::XmlNewNode( foliaNs(), "processor" ) ); 2968 KWargs atts; 2969 atts["xml:id"] = p->_id; 2970 atts["name"] = p->_name; 2971 if ( p->_type != AUTO || has_explicit() ){ 2972 atts["type"] = toString(p->_type); 2973 } 2974 if ( !strip() ){ 2975 if ( !p->_version.empty() ){ 2976 atts["version"] = p->_version; 2977 } 2978 if ( !p->_folia_version.empty() ){ 2979 atts["folia_version"] = p->_folia_version; 2980 } 2981 if ( !p->_command.empty() ){ 2982 atts["command"] = p->_command; 2983 } 2984 if ( !p->_host.empty() ){ 2985 atts["host"] = p->_host; 2986 } 2987 if ( !p->_user.empty() ){ 2988 atts["user"] = p->_user; 2989 } 2990 if ( !p->_begindatetime.empty() ){ 2991 atts["begindatetime"] = p->_begindatetime; 2992 } 2993 if ( !p->_enddatetime.empty() ){ 2994 atts["enddatetime"] = p->_enddatetime; 2995 } 2996 } 2997 else { 2998 if ( p->_name == "libfolia" ){ 2999 atts["name"] = "stripped"; 3000 } 3001 else if ( p->_name == "foliapy" ){ 3002 atts["name"] = "stripped"; 3003 } 3004 else if ( !p->_name.empty() ){ 3005 atts["name"] = p->_name; 3006 } 3007 if ( !p->_version.empty() ){ 3008 atts["version"] = "stripped"; 3009 } 3010 if ( !p->_folia_version.empty() ){ 3011 atts["folia_version"] = "stripped"; 3012 } 3013 if ( !p->_command.empty() ){ 3014 atts["command"] = "stripped"; 3015 } 3016 if ( !p->_host.empty() ){ 3017 atts["host"] = "stripped"; 3018 } 3019 if ( !p->_user.empty() ){ 3020 atts["user"] = "stripped"; 3021 } 3022 if ( !p->_begindatetime.empty() ){ 3023 atts["begindatetime"] = "stripped"; 3024 } 3025 if ( !p->_enddatetime.empty() ){ 3026 atts["enddatetime"] = "stripped"; 3027 } 3028 } 3029 if ( !p->_document_version.empty() ){ 3030 atts["document_version"] = p->_document_version; 3031 } 3032 if ( !p->_resourcelink.empty() ){ 3033 atts["resourcelink"] = p->_resourcelink; 3034 } 3035 if ( !p->_src.empty() ){ 3036 atts["src"] = p->_src; 3037 } 3038 if ( !p->_format.empty() ){ 3039 atts["format"] = p->_format; 3040 } 3041 addAttributes( pr, atts ); 3042 for ( const auto& it : p->_metadata ){ 3043 xmlNode *m = xmlAddChild( pr, TiCC::XmlNewNode( foliaNs(), "meta" ) ); 3044 KWargs args; 3045 args["id"] = it.first; 3046 addAttributes( m, args ); 3047 xmlAddChild( m, xmlNewText( (const xmlChar*)it.second.c_str()) ); 3048 } 3049 for ( const auto& s : p->_processors ){ 3050 append_processor( pr, s ); 3051 } 3052 } 3053 add_provenance(xmlNode * metadata) const3054 void Document::add_provenance( xmlNode *metadata ) const { 3055 /// create a provenance block under the xmlNode metadata 3056 /*! 3057 \param metadata the parent to add to 3058 calls append_processor() for every processor available 3059 */ 3060 if ( !_provenance ){ 3061 return; 3062 } 3063 xmlNode *node = xmlAddChild( metadata, 3064 TiCC::XmlNewNode( foliaNs(), 3065 "provenance" ) ); 3066 for ( const auto& p : _provenance->processors ){ 3067 append_processor( node, p ); 3068 } 3069 } 3070 add_submetadata(xmlNode * node) const3071 void Document::add_submetadata( xmlNode *node ) const { 3072 /// add a submetadata block to node 3073 for ( const auto& it : submetadata ){ 3074 xmlNode *sm = TiCC::XmlNewNode( foliaNs(), "submetadata" ); 3075 KWargs atts; 3076 atts["xml:id"] = it.first; 3077 addAttributes( sm, atts ); 3078 MetaData *md = submetadata.find(it.first)->second; 3079 string type = md->type(); 3080 atts.clear(); 3081 atts["type"] = type; 3082 addAttributes( sm, atts ); 3083 xmlAddChild( node, sm ); 3084 if ( type == "native" ){ 3085 atts = it.second->get_avs(); 3086 // cerr << "atts: " << atts << endl; 3087 for ( const auto& av : atts ){ 3088 xmlNode *m = TiCC::XmlNewNode( foliaNs(), "meta" ); 3089 KWargs args; 3090 args["id"] = av.first; 3091 addAttributes( m, args ); 3092 xmlAddChild( m, xmlNewText( (const xmlChar*)av.second.c_str()) ); 3093 xmlAddChild( sm, m ); 3094 } 3095 } 3096 else if ( md->datatype() == "ExternalMetaData" ){ 3097 KWargs args; 3098 args["src"] = md->src(); 3099 addAttributes( sm, args ); 3100 } 3101 else if ( md->datatype() == "ForeignMetaData" ){ 3102 for ( const auto& foreign : md->get_foreigners() ) { 3103 xmlNode *f = foreign->xml( true, false ); 3104 xmlAddChild( sm, f ); 3105 } 3106 } 3107 } 3108 } 3109 add_metadata(xmlNode * node) const3110 void Document::add_metadata( xmlNode *node ) const{ 3111 /// add a metadata block to node 3112 if ( _metadata ){ 3113 if ( _metadata->datatype() == "ExternalMetaData" ){ 3114 KWargs atts; 3115 atts["type"] = "external"; 3116 string src = _metadata->src(); 3117 if ( !src.empty() ){ 3118 atts["src"] = src; 3119 } 3120 addAttributes( node, atts ); 3121 } 3122 else { 3123 KWargs atts; 3124 atts["type"] = _metadata->type(); 3125 addAttributes( node, atts ); 3126 for ( const auto& it : _metadata->get_avs() ){ 3127 xmlNode *m = TiCC::XmlNewNode( foliaNs(), "meta" ); 3128 xmlAddChild( m, xmlNewText( (const xmlChar*)it.second.c_str()) ); 3129 KWargs meta_atts; 3130 meta_atts["id"] = it.first; 3131 addAttributes( m, meta_atts ); 3132 xmlAddChild( node, m ); 3133 } 3134 } 3135 } 3136 if ( _foreign_metadata ){ 3137 if ( !_metadata ){ 3138 KWargs atts; 3139 atts["type"] = "foreign"; 3140 addAttributes( node, atts ); 3141 } 3142 for ( const auto& foreign : _foreign_metadata->get_foreigners() ) { 3143 xmlNode *f = foreign->xml( true, false ); 3144 xmlAddChild( node, f ); 3145 } 3146 } 3147 if ( !_metadata 3148 && !_foreign_metadata ){ 3149 KWargs atts; 3150 atts["type"] = "native"; 3151 addAttributes( node, atts ); 3152 } 3153 add_submetadata( node ); 3154 } 3155 add_styles(xmlDoc * doc) const3156 void Document::add_styles( xmlDoc* doc ) const { 3157 /// add a styles block to the output document 3158 /*! 3159 \param doc the output document 3160 */ 3161 for ( const auto& it : styles ){ 3162 string content = "type=\"" + it.first + "\" href=\"" + it.second + "\""; 3163 xmlAddChild( (xmlNode*)doc, 3164 xmlNewDocPI( doc, 3165 (const xmlChar*)"xml-stylesheet", 3166 (const xmlChar*)content.c_str() ) ); 3167 } 3168 } 3169 to_xmlDoc(const string & ns_label) const3170 xmlDoc *Document::to_xmlDoc( const string& ns_label ) const { 3171 /// convert the Document to an xmlDoc 3172 /*! 3173 \param ns_label a namespace label to use. (default "") 3174 */ 3175 xmlDoc *outDoc = xmlNewDoc( (const xmlChar*)"1.0" ); 3176 add_styles( outDoc ); 3177 xmlNode *root = xmlNewDocNode( outDoc, 0, (const xmlChar*)"FoLiA", 0 ); 3178 xmlDocSetRootElement( outDoc, root ); 3179 xmlNs *xl = xmlNewNs( root, (const xmlChar *)"http://www.w3.org/1999/xlink", 3180 (const xmlChar *)"xlink" ); 3181 xmlSetNs( root, xl ); 3182 if ( _foliaNsIn_href == 0 ){ 3183 if ( ns_label.empty() ){ 3184 _foliaNsOut = xmlNewNs( root, (const xmlChar *)NSFOLIA.c_str(), 0 ); 3185 } 3186 else { 3187 _foliaNsOut = xmlNewNs( root, 3188 (const xmlChar *)NSFOLIA.c_str(), 3189 (const xmlChar*)ns_label.c_str() ); 3190 } 3191 } 3192 else { 3193 _foliaNsOut = xmlNewNs( root, 3194 _foliaNsIn_href, 3195 _foliaNsIn_prefix ); 3196 } 3197 xmlSetNs( root, _foliaNsOut ); 3198 KWargs attribs; 3199 attribs["xml:id"] = foliadoc->id(); 3200 if ( strip() ){ 3201 attribs["generator"] = ""; 3202 attribs["version"] = ""; 3203 } 3204 else { 3205 attribs["generator"] = "libfolia-v" + library_version(); 3206 attribs["version"] = _version_string; 3207 // attribs["version"] = folia_version(); 3208 } 3209 if ( has_explicit() ){ 3210 attribs["form"] = "explicit"; 3211 } 3212 if ( _external_document ){ 3213 attribs["external"] = "yes"; 3214 } 3215 addAttributes( root, attribs ); 3216 3217 xmlNode *md = xmlAddChild( root, TiCC::XmlNewNode( foliaNs(), "metadata" ) ); 3218 add_annotations( md ); 3219 add_provenance( md ); 3220 add_metadata( md ); 3221 for ( size_t i=0; i < foliadoc->size(); ++i ){ 3222 FoliaElement* el = foliadoc->index(i); 3223 xmlAddChild( root, el->xml( true, canonical() ) ); 3224 } 3225 return outDoc; 3226 } 3227 toXml(const string & ns_label) const3228 string Document::toXml( const string& ns_label ) const { 3229 /// dump the Document to a string 3230 /*! 3231 \param ns_label a namespace label to use. (default "") 3232 */ 3233 string result; 3234 if ( foliadoc ){ 3235 xmlDoc *outDoc = to_xmlDoc( ns_label ); 3236 xmlChar *buf; int size; 3237 xmlDocDumpFormatMemoryEnc( outDoc, &buf, &size, 3238 output_encoding, 1 ); 3239 result = string( (const char *)buf, size ); 3240 xmlFree( buf ); 3241 xmlFreeDoc( outDoc ); 3242 _foliaNsOut = 0; 3243 } 3244 else { 3245 throw runtime_error( "can't save, no doc" ); 3246 } 3247 return result; 3248 } 3249 toXml(const string & file_name,const string & ns_label) const3250 bool Document::toXml( const string& file_name, 3251 const string& ns_label ) const { 3252 /// write the Document to a file 3253 /*! 3254 \param file_name the name of the file to create 3255 \param ns_label a namespace label to use. (default "") 3256 \return false on error, true otherwise 3257 automaticly detects .gz and .bz2 filenames and will handle accordingly 3258 */ 3259 if ( foliadoc ){ 3260 long int res = 0; 3261 if ( TiCC::match_back( file_name, ".bz2" ) ){ 3262 string tmpname = file_name.substr( 0, file_name.length() - 3 ) + "tmp"; 3263 if ( toXml( tmpname, ns_label ) ){ 3264 bool stat = TiCC::bz2Compress( tmpname, file_name ); 3265 remove( tmpname.c_str() ); 3266 if ( !stat ){ 3267 res = -1; 3268 } 3269 } 3270 } 3271 else { 3272 xmlDoc *outDoc = to_xmlDoc( ns_label ); 3273 if ( TiCC::match_back( file_name, ".gz" ) ){ 3274 xmlSetDocCompressMode(outDoc,9); 3275 } 3276 res = xmlSaveFormatFileEnc( file_name.c_str(), 3277 outDoc, 3278 output_encoding, 1 ); 3279 xmlFreeDoc( outDoc ); 3280 _foliaNsOut = 0; 3281 } 3282 if ( res == -1 ){ 3283 return false; 3284 } 3285 } 3286 else { 3287 return false; 3288 } 3289 return true; 3290 } 3291 Pattern(const vector<string> & pat_vec,const ElementType et,const string & args)3292 Pattern::Pattern( const vector<string>& pat_vec, 3293 const ElementType et, 3294 const string& args ): matchannotation(et) { 3295 /// create a Pattern structure for searching 3296 /*! 3297 \param pat_vec a list of search terms (may be regular expressions) 3298 \param et The kind of elements to match on 3299 \param args additionale search options as attribute/value pairs 3300 */ 3301 regexp = false; 3302 case_sensitive = false; 3303 KWargs kw = getArgs( args ); 3304 matchannotationset = kw["matchannotationset"]; 3305 if (kw["regexp"] != "" ){ 3306 regexp = TiCC::stringTo<bool>( kw["regexp"] ); 3307 } 3308 if (kw["maxgapsize"] != "" ){ 3309 maxgapsize = TiCC::stringTo<int>( kw["maxgapsize"] ); 3310 } 3311 else { 3312 maxgapsize = 10; 3313 } 3314 if ( kw["casesensitive"] != "" ){ 3315 case_sensitive = TiCC::stringTo<bool>( kw["casesensitive"] ); 3316 } 3317 for ( const auto& pat : pat_vec ){ 3318 if ( pat.find( "regexp('" ) == 0 && 3319 pat.rfind( "')" ) == pat.length()-2 ){ 3320 string tmp = pat.substr( 8, pat.length() - 10 ); 3321 UnicodeString us = TiCC::UnicodeFromUTF8( tmp ); 3322 UErrorCode u_stat = U_ZERO_ERROR; 3323 RegexMatcher *matcher = new RegexMatcher(us, 0, u_stat); 3324 if ( U_FAILURE(u_stat) ){ 3325 throw runtime_error( "failed to create a regexp matcher with '" + tmp + "'" ); 3326 } 3327 matchers.push_back( matcher ); 3328 sequence.push_back( "" ); 3329 } 3330 else { 3331 sequence.push_back( TiCC::UnicodeFromUTF8(pat) ); 3332 matchers.push_back( 0 ); 3333 if ( !case_sensitive ){ 3334 sequence.back().toLower(); 3335 } 3336 } 3337 } 3338 } 3339 Pattern(const std::vector<std::string> & pat_vec,const std::string & args)3340 Pattern::Pattern( const std::vector<std::string>& pat_vec, 3341 const std::string& args ) : matchannotation(BASE) { 3342 /// create a Pattern structure for searching 3343 /*! 3344 \param pat_vec a list if search terms (may be regular expressions) 3345 \param args additionale search options as attribute/value pairs 3346 */ 3347 regexp = false; 3348 case_sensitive = false; 3349 KWargs kw = getArgs( args ); 3350 matchannotationset = kw["matchannotationset"]; 3351 if (kw["regexp"] != "" ){ 3352 regexp = TiCC::stringTo<bool>( kw["regexp"] ); 3353 } 3354 if (kw["maxgapsize"] != "" ){ 3355 maxgapsize = TiCC::stringTo<int>( kw["maxgapsize"] ); 3356 } 3357 else { 3358 maxgapsize = 10; 3359 } 3360 if ( kw["casesensitive"] != "" ){ 3361 case_sensitive = TiCC::stringTo<bool>( kw["casesensitive"] ); 3362 } 3363 for ( const auto& pat : pat_vec ){ 3364 if ( pat.find( "regexp('" ) == 0 && 3365 pat.rfind( "')" ) == pat.length()-2 ){ 3366 string tmp = pat.substr( 8, pat.length() - 10 ); 3367 UnicodeString us = TiCC::UnicodeFromUTF8( tmp ); 3368 UErrorCode u_stat = U_ZERO_ERROR; 3369 RegexMatcher *matcher = new RegexMatcher(us, 0, u_stat); 3370 if ( U_FAILURE(u_stat) ){ 3371 throw runtime_error( "failed to create a regexp matcher with '" + tmp + "'" ); 3372 } 3373 matchers.push_back( matcher ); 3374 sequence.push_back( "" ); 3375 } 3376 else { 3377 sequence.push_back( TiCC::UnicodeFromUTF8(pat) ); 3378 matchers.push_back( 0 ); 3379 if ( !case_sensitive ){ 3380 sequence.back().toLower(); 3381 } 3382 } 3383 } 3384 } 3385 ~Pattern()3386 Pattern::~Pattern(){ 3387 /// destroy a Pattern 3388 for ( const auto& m : matchers ){ 3389 delete m; 3390 } 3391 } 3392 operator <<(ostream & os,const Pattern & p)3393 inline ostream& operator<<( ostream& os, const Pattern& p ){ 3394 /// debugging only: output the sequence part of a Pattern 3395 using TiCC::operator <<; 3396 os << "pattern: " << p.sequence; 3397 return os; 3398 } 3399 match(const UnicodeString & us,size_t & pos,int & gap,bool & done,bool & flag) const3400 bool Pattern::match( const UnicodeString& us, 3401 size_t& pos, 3402 int& gap, 3403 bool& done, 3404 bool& flag ) const { 3405 /// try to match the input string to this pattern 3406 /*! 3407 \param us A UnicodeString to match 3408 \param pos the position of the (regex) matcher to try 3409 \param gap 3410 \param done 3411 \param flag 3412 \return true on a succesful match 3413 */ 3414 UnicodeString s = us; 3415 // cerr << "gap = " << gap << "cursor=" << pos << " vergelijk '" << sequence[pos] << "' met '" << us << "'" << endl; 3416 if ( matchers[pos] ){ 3417 matchers[pos]->reset( s ); 3418 UErrorCode u_stat = U_ZERO_ERROR; 3419 if ( matchers[pos]->matches( u_stat ) ){ 3420 done = ( ++pos >= sequence.size() ); 3421 return true; 3422 } 3423 else { 3424 ++pos; 3425 return false; 3426 } 3427 } 3428 else { 3429 if ( !case_sensitive ){ 3430 s.toLower(); 3431 } 3432 if ( sequence[pos] == s || sequence[pos] == "*:1" ){ 3433 done = ( ++pos >= sequence.size() ); 3434 return true; 3435 } 3436 else if ( sequence[pos] == "*" ){ 3437 if ( (pos + 1 ) >= sequence.size() ){ 3438 done = true; 3439 } 3440 else if ( sequence[pos+1] == s ){ 3441 // cerr << " but next matched!" << endl; 3442 flag = ( ++gap < maxgapsize ); 3443 if ( !flag ){ 3444 pos = pos + gap; 3445 done = ( ++pos >= sequence.size() ); 3446 } 3447 else { 3448 done = true; 3449 } 3450 } 3451 else if ( ++gap == maxgapsize ){ 3452 ++pos; 3453 } 3454 else { 3455 flag = true; 3456 } 3457 return true; 3458 } 3459 else { 3460 ++pos; 3461 return false; 3462 } 3463 } 3464 } 3465 variablesize() const3466 bool Pattern::variablesize() const { 3467 /// look if at least one sequence in the Pattern is "*" 3468 return any_of( sequence.begin(), 3469 sequence.end(), 3470 []( const UnicodeString& s ) { return s == "*"; } ); 3471 } 3472 unsetwild()3473 void Pattern::unsetwild() { 3474 /// replace all sequence in the Pattern with value "*" by "*:1" 3475 replace_if( sequence.begin(), 3476 sequence.end(), 3477 []( const UnicodeString& s ) { return s == "*"; }, 3478 "*:1" 3479 ); 3480 } 3481 variablewildcards() const3482 set<int> Pattern::variablewildcards() const { 3483 /// build an index of all "*" sequences 3484 set<int> result; 3485 for ( size_t i=0; i < sequence.size(); ++i ){ 3486 if ( sequence[i] == "*" ){ 3487 result.insert( i ); 3488 } 3489 } 3490 return result; 3491 } 3492 findwords(const Pattern & pat,const string & args) const3493 vector<vector<Word*> > Document::findwords( const Pattern& pat, 3494 const string& args ) const { 3495 /// search the Document for vector of Word list matching the Pattern 3496 /*! 3497 \param pat The search Pattern 3498 \param args additional search options as attribute/value pairs 3499 \return a vector of Word list that matched. (if any) 3500 supported additional arguments can be 'leftcontext' and 'rightcontext' 3501 */ 3502 size_t leftcontext = 0; 3503 size_t rightcontext = 0; 3504 KWargs kw = getArgs( args ); 3505 string val = kw["leftcontext"]; 3506 if ( !val.empty() ){ 3507 leftcontext = TiCC::stringTo<size_t>(val); 3508 } 3509 val = kw["rightcontext"]; 3510 if ( !val.empty() ){ 3511 rightcontext = TiCC::stringTo<size_t>(val); 3512 } 3513 vector<vector<Word*> > result; 3514 vector<Word*> matched; 3515 if ( pat.regexp ){ 3516 throw runtime_error( "regexp not supported yet in patterns" ); 3517 } 3518 vector<Word*> mywords = words(); 3519 for ( size_t startpos =0; startpos < mywords.size(); ++startpos ){ 3520 // loop over all words 3521 // cerr << "outer loop STARTPOS = " << startpos << endl; 3522 size_t cursor = 0; 3523 int gap = 0; 3524 bool goon = true; 3525 for ( size_t i = startpos; i < mywords.size() && goon ; ++i ){ 3526 // cerr << "inner LOOP I = " << i << " myword=" << mywords[i] << endl; 3527 UnicodeString value; 3528 if ( pat.matchannotation == BASE ){ 3529 value = mywords[i]->text(); 3530 } 3531 else { 3532 vector<FoliaElement *> v = mywords[i]->select( pat.matchannotation ); 3533 if ( v.size() != 1 ){ 3534 continue; 3535 } 3536 value = TiCC::UnicodeFromUTF8(v[0]->cls()); 3537 } 3538 bool done = false; 3539 bool flag = false; 3540 if ( pat.match( value, cursor, gap, done, flag ) ){ 3541 // cerr << "matched, " << (done?"done":"not done") 3542 // << (flag?" Flagged!":":{") << endl; 3543 matched.push_back(mywords[i]); 3544 if ( cursor == 0 ){ 3545 startpos = i; // restart search here 3546 } 3547 if ( done ){ 3548 vector<Word*> keep = matched; 3549 // cerr << "findnodes() tussenresultaat ==> " << matched << endl; 3550 vector<Word*> tmp1; 3551 if ( leftcontext > 0 ){ 3552 tmp1 = matched[0]->leftcontext(leftcontext); 3553 // cerr << "findnodes() tmp1 ==> " << tmp1 << endl; 3554 copy( matched.begin(), matched.end(), back_inserter(tmp1) ); 3555 // cerr << "findnodes() tmp1 na copy ==> " << tmp1 << endl; 3556 } 3557 else { 3558 tmp1 = matched; 3559 } 3560 vector<Word*> tmp2; 3561 if ( rightcontext > 0 ){ 3562 tmp2 = matched.back()->rightcontext(rightcontext); 3563 // cerr << "findnodes() tmp2 ==> " << tmp2 << endl; 3564 copy( tmp2.begin(), tmp2.end(), back_inserter(tmp1) ); 3565 // cerr << "findnodes() tmp2 na copy ==> " << tmp2 << endl; 3566 } 3567 result.push_back(tmp1); 3568 // cerr << "findnodes() tussenresultaat 2 ==> " << tmp1 << endl; 3569 if ( flag ){ 3570 matched = keep; 3571 } 3572 else { 3573 cursor = 0; 3574 matched.clear(); 3575 goon = false; 3576 } 3577 } 3578 } 3579 else { 3580 cursor = 0; 3581 matched.clear(); 3582 goon = false; 3583 } 3584 } 3585 } 3586 // cerr << "findnodes() result ==> " << result << endl; 3587 return result; 3588 } 3589 findwords(list<Pattern> & pats,const string & args) const3590 vector<vector<Word*> > Document::findwords( list<Pattern>& pats, 3591 const string& args ) const { 3592 /// search the Document for vector of Word list matching one of the Pattern 3593 /*! 3594 \param pats a list of search Patterns 3595 \param args additional search options as attribute/value pairs 3596 \return a vector of Word list that matched. (if any) 3597 supported additional arguments can be 'leftcontext' and 'rightcontext' 3598 */ 3599 size_t prevsize = 0; 3600 bool start = true; 3601 bool unsetwildcards = false; 3602 set<int> variablewildcards; 3603 int index = 0; 3604 for ( const auto& it : pats ){ 3605 // cerr << "bekijk patroon : " << *it << endl; 3606 if ( start ){ 3607 prevsize = it.size(); 3608 start = false; 3609 } 3610 else if ( it.size() != prevsize ){ 3611 throw runtime_error( "findnodes(): If multiple patterns are provided, they must all have the same length!" ); 3612 } 3613 if ( it.variablesize() ){ 3614 if ( index > 0 && variablewildcards.empty() ){ 3615 unsetwildcards = true; 3616 } 3617 else { 3618 if ( !variablewildcards.empty() && 3619 variablewildcards != it.variablewildcards() ){ 3620 throw runtime_error("If multiple patterns are provided with variable wildcards, then these wildcards must all be in the same positions!"); 3621 } 3622 variablewildcards = it.variablewildcards(); 3623 } 3624 } 3625 else if ( !variablewildcards.empty() ){ 3626 unsetwildcards = true; 3627 } 3628 ++index; 3629 } 3630 if ( unsetwildcards ){ 3631 for ( auto& it : pats ){ 3632 it.unsetwild(); 3633 } 3634 } 3635 vector<vector<Word*> > result; 3636 for ( const auto& it : pats ){ 3637 vector<vector<Word*> > res = findwords( it, args ); 3638 if ( result.empty() ){ 3639 result = res; 3640 } 3641 else if ( res != result ){ 3642 result.clear(); 3643 break; 3644 } 3645 } 3646 return result; 3647 } 3648 3649 } // namespace folia 3650