1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of libfolia 7 8 libfolia is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 libfolia is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ticcutils/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 */ 26 27 #include <cassert> 28 #include <cstdlib> 29 #include <iostream> 30 #include <fstream> 31 #include <sstream> 32 #include <string> 33 #include <set> 34 #include <list> 35 #include <vector> 36 #include <map> 37 #include <algorithm> 38 #include <type_traits> 39 #include <stdexcept> 40 #include "ticcutils/PrettyPrint.h" 41 #include "ticcutils/StringOps.h" 42 #include "ticcutils/XMLtools.h" 43 #include "ticcutils/Unicode.h" 44 #include "libfolia/folia.h" 45 #include "libfolia/folia_properties.h" 46 #include "config.h" 47 48 using namespace std; 49 using namespace icu; 50 using namespace TiCC; 51 52 namespace folia { 53 using TiCC::operator <<; 54 VersionName()55 string VersionName() { return PACKAGE_STRING; } ///< Returns the PACKAGE_STRING info of the package Version()56 string Version() { return VERSION; } ///< Returns version of the library 57 element_id() const58 ElementType AbstractElement::element_id() const { 59 /// return the ELEMENT_ID property 60 return _props.ELEMENT_ID; 61 } 62 occurrences() const63 size_t AbstractElement::occurrences() const { 64 /// return the OCCURENCES property 65 return _props.OCCURRENCES; 66 } 67 occurrences_per_set() const68 size_t AbstractElement::occurrences_per_set() const { 69 /// return the OCCURRENCES_PER_SET property 70 return _props.OCCURRENCES_PER_SET; 71 } 72 required_attributes() const73 Attrib AbstractElement::required_attributes() const { 74 /// return the REQUIRED_ATTRIBUTES property 75 return _props.REQUIRED_ATTRIBS; 76 } 77 optional_attributes() const78 Attrib AbstractElement::optional_attributes() const { 79 /// return the OPTONAL_ATTRIBUTES property 80 return _props.OPTIONAL_ATTRIBS; 81 } 82 hidden() const83 bool AbstractElement::hidden() const { 84 /// return the HIDDEN property 85 return _props.HIDDEN; 86 } 87 xmltag() const88 const string& AbstractElement::xmltag() const { 89 /// return the XMLTAG property 90 /* 91 For pre 1.5 documents, it will return the OLD name of that property. 92 e.g. "spanrelation" is translated to the old "complexalignment" 93 */ 94 const string& result = _props.XMLTAG; 95 if ( doc() && doc()->version_below(1,6) ){ 96 const auto& it = reverse_old.find(result); 97 if ( it != reverse_old.end() ){ 98 return it->second; 99 } 100 } 101 return result; 102 } 103 default_subset() const104 const string& AbstractElement::default_subset() const { 105 /// return the SUBSET property 106 return _props.SUBSET; 107 } 108 annotation_type() const109 AnnotationType AbstractElement::annotation_type() const { 110 /// return the ANNOTATIONTYPE property 111 return _props.ANNOTATIONTYPE; 112 } 113 accepted_data() const114 const set<ElementType>& AbstractElement::accepted_data() const { 115 /// return the ACCEPTED_DATA property 116 return _props.ACCEPTED_DATA; 117 } 118 required_data() const119 const set<ElementType>& AbstractElement::required_data() const { 120 /// return the REQUIRED_DATA property 121 return _props.REQUIRED_DATA; 122 } 123 printable() const124 bool AbstractElement::printable() const { 125 /// return the PRINTABLE property 126 return _props.PRINTABLE; 127 } 128 speakable() const129 bool AbstractElement::speakable() const { 130 /// return the SPEAKABLE property 131 return _props.SPEAKABLE; 132 } 133 referable() const134 bool AbstractElement::referable() const { 135 /// return the WREFABLE property 136 return _props.WREFABLE; 137 } 138 is_textcontainer() const139 bool AbstractElement::is_textcontainer() const { 140 /// return the TEXTCONTAINER property 141 return _props.TEXTCONTAINER; 142 } 143 implicitspace() const144 bool AbstractElement::implicitspace() const { 145 /// return the IMPLICITSPACE property 146 return _props.IMPLICITSPACE; 147 } 148 is_phoncontainer() const149 bool AbstractElement::is_phoncontainer() const { 150 /// return the PHONCONTAINER property 151 return _props.PHONCONTAINER; 152 } 153 text_delimiter() const154 const string& AbstractElement::text_delimiter() const { 155 /// return the TEXTDELIMITER property 156 return _props.TEXTDELIMITER; 157 } 158 xlink() const159 bool AbstractElement::xlink() const { 160 /// return the XLINK property 161 return _props.XLINK; 162 } 163 auth() const164 bool AbstractElement::auth() const { 165 /// return the AUTH property 166 return _props.AUTH; 167 } 168 setonly() const169 bool AbstractElement::setonly() const { 170 /// return the SETONLY property 171 return _props.SETONLY; 172 } 173 auto_generate_id() const174 bool AbstractElement::auto_generate_id() const { 175 /// return the AUTO_GENERATE_ID property 176 return _props.AUTO_GENERATE_ID; 177 } 178 is_structure(const FoliaElement * el)179 bool is_structure( const FoliaElement *el ){ 180 /// test if the object is a Structure Element. 181 /*! 182 \param el the FoliaElement to test 183 \return true when the parameter is an AbstractStructureElement 184 or a derivative of an AbstractStructureElement 185 */ 186 return dynamic_cast<const AbstractStructureElement*>( el ) != 0; 187 } 188 href() const189 const string AllowXlink::href() const { 190 /// return the 'href' value of the object 191 /*! 192 * if the object has an xlink value for 'href' it is returned as a string 193 * otherwise the result is "" 194 */ 195 auto it = _xlink.find("href"); 196 if ( it != _xlink.end() ){ 197 return it->second; 198 } 199 return ""; 200 } 201 set_tag(const string & t)202 const string AbstractElement::set_tag( const string& t ) { 203 /// set a value for the _tags attribute 204 /*! 205 * \param t the new value (may be empty) 206 * \return the old value (can be empty) 207 * thows when the FoliaElement doesn't support the tag attribute 208 */ 209 Attrib supported = required_attributes() | optional_attributes(); 210 if ( !(TAG & supported) ) { 211 throw ValueError( "settag is not supported for " + classname() ); 212 } 213 string r = _tags; 214 _tags = t; 215 return r; 216 } 217 operator <<(ostream & os,const FoliaElement & ae)218 ostream& operator<<( ostream& os, const FoliaElement& ae ) { 219 /// Output operator for FoliaElements. (for DEBUGGING only) 220 /*! 221 * \param os the output stream 222 * \param ae the FoliaElement 223 */ 224 os << " <" << ae.classname(); 225 KWargs ats = ae.collectAttributes(); 226 if ( !ae.id().empty() ) { 227 os << " xml:id=\"" << ae.id() << '"'; 228 ats.erase("xml:id"); 229 } 230 231 for ( const auto& it: ats ) { 232 os << " " << it.first << "=\"" << it.second << '"'; 233 } 234 os << " > {"; 235 for ( size_t i=0; i < ae.size(); ++i ) { 236 os << "<" << ae.index(i)->classname() << ">,"; 237 } 238 os << "}"; 239 if ( ae.printable() && ae.classname()[0] == 't' ){ 240 os << " \"" << ae.str(ae.textclass()) << "\" (" << ae.textclass() << ")"; 241 } 242 return os; 243 } 244 operator <<(ostream & os,const FoliaElement * ae)245 ostream& operator<<( ostream&os, const FoliaElement *ae ) { 246 /// Output operator for FoliaElements. (for DEBUGGING only) 247 /*! 248 * \param os the output stream 249 * \param ae the FoliaElement 250 */ 251 if ( !ae ) { 252 os << "nil"; 253 } 254 else 255 os << *ae; 256 return os; 257 } 258 259 //#define DE_AND_CONSTRUCT_DEBUG 260 AbstractElement(const properties & p,Document * d)261 AbstractElement::AbstractElement( const properties& p, Document *d ) : 262 /// Constructor for AbstractElements. 263 /*! 264 * \param p a properties block (required) 265 * \param d a parent document 266 */ 267 _mydoc(d), 268 _parent(0), 269 _auth( p.AUTH ), 270 _space(true), 271 _annotator_type(UNDEFINED), 272 _refcount(0), 273 _confidence(-1), 274 _preserve_spaces(SPACE_FLAGS::UNSET), 275 _props(p) 276 { 277 #ifdef DE_AND_CONSTRUCT_DEBUG 278 cerr << "created an : " << xmltag() << " adres=" << (void*)this << endl; 279 #endif 280 } 281 AbstractElement(const properties & p,FoliaElement * el)282 AbstractElement::AbstractElement( const properties& p, FoliaElement *el ) : 283 /// Constructor for AbstractElements. 284 /*! 285 * \param p a properties block (required) 286 * \param el a parent node, to append to 287 */ 288 AbstractElement( p, el->doc() ) 289 { 290 if ( !el ){ 291 throw ValueError( "AbstractElement( p, e ) called with 0 e" ); 292 } 293 el->append( this ); 294 } 295 ~AbstractElement()296 AbstractElement::~AbstractElement( ) { 297 #ifdef DE_AND_CONSTRUCT_DEBUG 298 cerr << "really delete " << xmltag() << " adres=" << (void*)this << endl; 299 #endif 300 } 301 destroy()302 void AbstractElement::destroy( ) { 303 /// Pseudo destructor for AbstractElements. 304 /// recursively destroys this nodes and it's children 305 /// Will also remove it from it's parent when no references are left 306 #ifdef DE_AND_CONSTRUCT_DEBUG 307 cerr << "\ndestroy " << xmltag() << " adres=" << (void*)this 308 << " id=" << _id << " class= " 309 << cls() << " datasize= " << _data.size() << endl; 310 cerr << "REFCOUNT = " << refcount() << endl; 311 cerr << "AT= " << annotation_type() << " (" << _set << ")" << endl; 312 #endif 313 if ( doc() ) { 314 doc()->decrRef( annotation_type(), _set ); 315 if ( refcount() > 0 ){ 316 decrefcount(); 317 doc()->keepForDeletion( this ); 318 #ifdef DE_AND_CONSTRUCT_DEBUG 319 cerr << "\t\tstill keeping element id=" << _id << " tag = " 320 << xmltag() << " adres=" << (void*)this << " class= " << cls() 321 << " datasize= " << _data.size() << endl; 322 #endif 323 return; 324 } 325 doc()->del_doc_index( _id ); 326 } 327 if ( _parent ){ 328 #ifdef DE_AND_CONSTRUCT_DEBUG 329 cerr << "STILL A PARENT: " << _parent << endl; 330 #endif 331 _parent->remove( this ); 332 } 333 for ( const auto& el : _data ) { 334 el->set_parent(0); 335 el->destroy(); 336 } 337 _data.clear(); 338 #ifdef DE_AND_CONSTRUCT_DEBUG 339 cerr << "\t\tfinished destroying element id=" << _id << " tag = " 340 << xmltag() << " adres=" << (void*)this << " class= " << cls() 341 << " datasize= " << _data.size() << endl; 342 #endif 343 delete this; 344 } 345 destroy(FoliaElement * el)346 void destroy( FoliaElement *el ){ 347 if ( el ){ 348 el->destroy(); 349 } 350 } 351 foliaNs() const352 xmlNs *AbstractElement::foliaNs() const { 353 /// return the associated xmlNs object. 354 /*! 355 * \return the XML namespace element of the associated FoLiA document 356 * or 0 when no xml document is available 357 */ 358 if ( doc() ) { 359 return doc()->foliaNs(); 360 } 361 return 0; 362 } 363 check_set_declaration()364 void AbstractElement::check_set_declaration(){ 365 /// check the declation consistency of an object. 366 /// throws an exception on error 367 /*! 368 * When the object has an associated document, the declaration of the 369 * 'set' attribute is checked. Or the default set when no 'set' is provided 370 * Also the presence of an appropiate annotation declaration is checked 371 * for the annotation-type of the object. This might auto-declare 372 * the anntotation-type, when de document allows this. 373 */ 374 375 if ( isSubClass( AbstractCorrectionChild_t ) ){ 376 return; 377 } 378 379 if ( _mydoc ){ 380 string def; 381 if ( !_set.empty() ){ 382 if ( !doc()->declared( annotation_type(), _set ) ) { 383 throw DeclarationError( "Set '" + _set 384 + "' is used but has no declaration " + 385 "for " + toString( annotation_type() ) 386 + "-annotation" ); 387 } 388 } 389 else { 390 if ( _mydoc->debug > 2 ) { 391 cerr << "get def for " << annotation_type() << endl; 392 } 393 def = doc()->default_set( annotation_type() ); 394 if ( doc()->debug > 2 ) { 395 cerr << "got def='" << def << "'" << endl; 396 } 397 if ( doc()->is_incremental() && def.empty() ){ 398 // when there is NO default set, AND we are parsing using 399 // folia::Engine, we must check if there WAS an empty set originally 400 // which is 'obscured' by newly added declarations 401 def = doc()->original_default_set( annotation_type() ); 402 if ( doc()->debug > 2 ) { 403 cerr << "from original got def='" << def << "'" << endl; 404 } 405 } 406 if ( !def.empty() ){ 407 _set = def; 408 } 409 else if ( CLASS & required_attributes() ){ 410 throw XmlError( "unable to assign a default set for tag: " + xmltag() ); 411 } 412 } 413 if ( annotation_type() != AnnotationType::NO_ANN 414 && !_mydoc->version_below( 2, 0 ) ){ 415 if ( !_mydoc->declared( annotation_type() ) ){ 416 if ( _mydoc->autodeclare() ){ 417 _mydoc->auto_declare( annotation_type(), _set ); 418 } 419 else { 420 throw DeclarationError( "Encountered an instance of <" 421 + xmltag() 422 + "> without a proper " 423 + toString(annotation_type()) 424 + "-annotation" ); 425 } 426 } 427 else if ( _set.empty() 428 && !isSubClass( AbstractAnnotationLayer_t ) 429 && !doc()->declared( annotation_type(), "None" ) ){ 430 if ( _mydoc->autodeclare() ){ 431 _mydoc->auto_declare( annotation_type(), _set ); 432 } 433 else { 434 throw DeclarationError( "Encountered an instance of <" 435 + xmltag() 436 + "> without a proper " 437 + toString(annotation_type()) 438 + "-annotation" ); 439 } 440 } 441 } 442 } 443 } 444 445 setAttributes(KWargs & kwargs)446 void AllowXlink::setAttributes( KWargs& kwargs ) { 447 /// set the objects attributes given a set of Key-Value pairs. 448 /*! 449 * \param kwargs a KWargs set of Key-Value pairs 450 * the given keys are checked agains a range of criteria: 451 * - if the object supports the attribue 452 * - if the object provided value is valid 453 * - if the attribute is declared for the annotation-type 454 */ 455 string type = "simple"; 456 string val = kwargs.extract( "xlink:type" ); 457 if ( !val.empty() ) { 458 type = val; 459 } 460 if ( type != "simple" && type != "locator" ) { 461 throw XmlError( "only xlink:types: 'simple' and 'locator' are supported!" ); 462 } 463 _xlink["type"] = type; 464 val = kwargs.extract( "xlink:href" ); 465 if ( !val.empty() ) { 466 _xlink["href"] = val; 467 } 468 else if ( type == "locator" ){ 469 throw XmlError( "xlink:type='locator' requires an 'xlink:href' attribute" ); 470 } 471 val = kwargs.extract( "xlink:role" ); 472 if ( !val.empty() ) { 473 _xlink["role"] = val; 474 } 475 val = kwargs.extract( "xlink:title" ); 476 if ( !val.empty() ) { 477 _xlink["title"] = val; 478 } 479 val = kwargs.extract( "xlink:label" ); 480 if ( !val.empty() ) { 481 if ( type == "simple" ){ 482 throw XmlError( "xlink:type='simple' may not have an 'xlink:label' attribute" ); 483 } 484 _xlink["label"] = val; 485 } 486 val = kwargs.extract( "xlink:arcrole" ); 487 if ( !val.empty() ) { 488 if ( type == "locator" ){ 489 throw XmlError( "xlink:type='locator' may not have an 'xlink:arcrole' attribute" ); 490 } 491 _xlink["arcrole"] = val; 492 } 493 val = kwargs.extract( "xlink:show" ); 494 if ( !val.empty() ) { 495 if ( type == "locator" ){ 496 throw XmlError( "xlink:type='locator' may not have an 'xlink:show' attribute" ); 497 } 498 _xlink["show"] = val; 499 } 500 val = kwargs.extract( "xlink:actuate" ); 501 if ( !val.empty() ) { 502 if ( type == "locator" ){ 503 throw XmlError( "xlink:type='locator' may not have an 'xlink:actuate' attribute" ); 504 } 505 _xlink["actuate"] = val; 506 } 507 } 508 setAttributes(KWargs & kwargs)509 void AbstractElement::setAttributes( KWargs& kwargs ) { 510 /// set the objects attributes given a set of Key-Value pairs. 511 /*! 512 * \param kwargs a KWargs set of Key-Value pairs 513 * the given keys are checked agains a range of criteria: 514 * - if the object supports the attribue 515 * - if the object provided value is valid 516 * - if the attribute is declared for the annotation-type 517 */ 518 // for the moment, always look for the 'xml:space' attribute 519 string sval = kwargs.extract( "xml:space" ); 520 if ( !sval.empty() ){ 521 if ( sval == "preserve" ){ 522 _preserve_spaces = SPACE_FLAGS::PRESERVE; 523 } 524 else if ( sval == "default" ){ 525 _preserve_spaces = SPACE_FLAGS::DEFAULT; 526 } 527 else { 528 throw runtime_error( "invalid value for attribute xml:space, must be " 529 "'default' or 'preserve', found: '" + sval + "'"); 530 } 531 } 532 Attrib supported = required_attributes() | optional_attributes(); 533 //#define LOG_SET_ATT 534 #ifdef LOG_SET_ATT 535 int db_level = 0; 536 if ( doc() ){ 537 db_level = doc()->debug; 538 } 539 if ( element_id() == New_t 540 || element_id() == Original_t ) { 541 if ( doc() ){ 542 doc()->setdebug(0); 543 } 544 cerr << "set attributes: '" << kwargs << "' on " << classname() << endl; 545 // cerr << "required = " << toString(required_attributes()) << endl; 546 // cerr << "optional = " << optional_attributes() << endl; 547 //cerr << "supported = " << supported << endl; 548 // cerr << "ID & supported = " << (ID & supported) << endl; 549 // cerr << "ID & _required = " << (ID & required_attributes() ) << endl; 550 // cerr << "_id=" << _id << endl; 551 // cerr << "AUTH : " << _auth << endl; 552 } 553 #endif 554 if ( doc() && doc()->debug > 2 ) { 555 cerr << "set attributes: " << kwargs << " on " << classname() << endl; 556 } 557 558 string val = kwargs.extract( "generate_id" ); 559 if ( !val.empty() ) { 560 if ( !doc() ) { 561 throw runtime_error( "can't generate an ID without a doc" ); 562 } 563 if ( (!ID) & supported ) { 564 throw ValueError( "generate_id: xml:id is not supported for " 565 + classname() ); 566 } 567 if ( val == "auto()" ){ 568 FoliaElement *par = parent(); 569 if ( par ) { 570 _id = par->generateId( xmltag() ); 571 } 572 else { 573 throw ValueError( "generate_id `auto()' not possible without parent" ); 574 } 575 } 576 else { 577 FoliaElement *e = (*doc())[val]; 578 if ( e ) { 579 _id = e->generateId( xmltag() ); 580 } 581 else { 582 throw ValueError("Unable to generate an id from ID= " + val ); 583 } 584 } 585 } 586 else { 587 val = kwargs.extract( "xml:id" ); 588 if ( val.empty() ) { 589 val = kwargs.extract( "_id" ); // for backward compatibility 590 } 591 if ( !val.empty() ) { 592 if ( (!ID) & supported ) { 593 throw ValueError( "xml:id is not supported for " + classname() ); 594 } 595 else if ( val == "auto()" ){ 596 FoliaElement *par = parent(); 597 if ( par ) { 598 _id = par->generateId( xmltag() ); 599 } 600 else { 601 throw ValueError( "auto-generate of 'xml:id' not possible without parent" ); 602 } 603 } 604 else if ( isNCName( val ) ){ 605 _id = val; 606 } 607 else { 608 throw XmlError( "'" + val + "' is not a valid NCName." ); 609 } 610 } 611 } 612 613 _set.clear(); 614 val = kwargs.extract( "set" ); 615 if ( !val.empty() ) { 616 if ( !doc() ) { 617 throw ValueError( "attribute set=" + val + " is used on a node without a document." ); 618 } 619 if ( !( (CLASS & supported) || setonly() ) ) { 620 throw ValueError("attribute 'set' is not supported for " + classname()); 621 } 622 else { 623 string st = doc()->unalias( annotation_type(), val ); 624 if ( st.empty() ){ 625 _set = val; 626 } 627 else { 628 _set = st; 629 } 630 } 631 } 632 633 check_set_declaration(); 634 635 _class.clear(); 636 val = kwargs.extract( "class" ); 637 if ( !val.empty() ) { 638 if ( !( CLASS & supported ) ) { 639 throw ValueError("Class is not supported for " + classname() ); 640 } 641 if ( element_id() != TextContent_t && element_id() != PhonContent_t ) { 642 if ( !doc() ) { 643 throw ValueError( "Class=" + val + " is used on a node without a document." ); 644 } 645 if ( _set.empty() ){ 646 if ( !doc()->declared( annotation_type(), "None" ) ) { 647 cerr << endl << doc()->annotationdefaults() << endl << endl; 648 throw ValueError( xmltag() +": An empty set is used but that has no declaration " 649 "for " + toString( annotation_type() ) 650 + "-annotation" ); 651 } 652 _set = "None"; 653 } 654 doc()->incrRef( annotation_type(), _set ); 655 } 656 _class = val; 657 } 658 659 if ( element_id() != TextContent_t && element_id() != PhonContent_t ) { 660 if ( !_class.empty() && _set.empty() ) { 661 throw ValueError("Set is required for <" + classname() + 662 " class=\"" + _class + "\"> assigned without set." ); 663 } 664 } 665 666 _annotator.clear(); 667 val = kwargs.extract( "annotator" ); 668 if ( !val.empty() ) { 669 if ( !(ANNOTATOR & supported) ) { 670 throw ValueError("attribute 'annotator' is not supported for " + classname() ); 671 } 672 else { 673 _annotator = val; 674 } 675 } 676 else { 677 string def; 678 if ( doc() && 679 (def = doc()->default_annotator( annotation_type(), _set )) != "" ) { 680 _annotator = def; 681 } 682 } 683 684 _annotator_type = UNDEFINED; 685 val = kwargs.extract( "annotatortype" ); 686 if ( !val.empty() ) { 687 if ( ! (ANNOTATOR & supported) ) { 688 throw ValueError("Annotatortype is not supported for " + classname() ); 689 } 690 else { 691 _annotator_type = stringTo<AnnotatorType>( val ); 692 if ( _annotator_type == UNDEFINED ) { 693 throw ValueError( "annotatortype must be 'auto' or 'manual', got '" 694 + val + "'" ); 695 } 696 } 697 } 698 else { 699 if ( doc() ){ 700 AnnotatorType def = doc()->default_annotatortype( annotation_type(), _set ); 701 if ( def != UNDEFINED ) { 702 _annotator_type = def; 703 } 704 } 705 } 706 707 val = kwargs.extract( "processor" ); 708 if ( !val.empty() ){ 709 if ( doc() && doc()->debug > 2 ) { 710 cerr << "set processor= " << val << " on " << classname() << endl; 711 } 712 if ( annotation_type() == AnnotationType::NO_ANN ){ 713 throw ValueError( "Unable to set processor on " + classname() + ". AnnotationType is None!" ); 714 } 715 if ( _set.empty() ){ 716 _set = "None"; 717 } 718 if ( !(ANNOTATOR & supported) ){ 719 throw ValueError( "attribute 'processor' is not supported for " + classname() ); 720 } 721 else { 722 if ( doc() && doc()->get_processor(val) == 0 ){ 723 throw ValueError("attribute 'processor' has unknown value: " + val ); 724 } 725 if ( doc() 726 && !doc()->declared( annotation_type(), _set, "", _annotator_type, val ) ){ 727 if ( !doc()->version_below( 2, 0 ) 728 && doc()->autodeclare() ) { 729 KWargs args; 730 args["processor"] = val; 731 args["annotatortype"] = _annotator_type; 732 doc()->declare( annotation_type(), _set, args ); 733 } 734 else { 735 throw DeclarationError( "Processor '" + val 736 + "' is used for annotationtype '" 737 + toString( annotation_type() ) 738 + "' with set='" + _set +"'" 739 + " but there is no corresponding <annotator>" 740 + " referring to it in the annotation" 741 + " declaration block." ); 742 } 743 } 744 _processor = val; 745 } 746 } 747 else if ( (ANNOTATOR & supported) && doc() ){ 748 string def; 749 try { 750 def = doc()->default_processor( annotation_type(), _set ); 751 } 752 catch ( const NoDefaultError& e ){ 753 if ( doc()->is_incremental() ){ 754 // when there is NO default processor, AND we are parsing using 755 // folia::Engine, we must check if there WAS a processor originally 756 // which is 'obscured' by newly added declarations 757 def = doc()->original_default_processor( annotation_type() ); 758 if ( doc()->debug > 2 ) { 759 cerr << "from original got default processor='" << def << "'" << endl; 760 } 761 } 762 else { 763 throw; 764 } 765 } 766 _processor = def; 767 } 768 769 _confidence = -1; 770 val = kwargs.extract( "confidence" ); 771 if ( !val.empty() ) { 772 if ( !(CONFIDENCE & supported) ) { 773 throw ValueError("Confidence is not supported for " + classname() ); 774 } 775 else { 776 try { 777 _confidence = stringTo<double>(val); 778 if ( _confidence < 0 || _confidence > 1.0 ){ 779 throw ValueError("Confidence must be a floating point number between 0 and 1, got " + TiCC::toString(_confidence) ); 780 } 781 } 782 catch (...) { 783 throw ValueError( "invalid Confidence value: " + val 784 + " (not a number?)"); 785 } 786 } 787 } 788 789 _n = ""; 790 val = kwargs.extract( "n" ); 791 if ( !val.empty() ) { 792 if ( !(N & supported) ) { 793 throw ValueError("N attribute is not supported for " + classname() ); 794 } 795 else { 796 _n = val; 797 } 798 } 799 _datetime.clear(); 800 val = kwargs.extract( "datetime" ); 801 if ( !val.empty() ) { 802 if ( !(DATETIME & supported) ) { 803 throw ValueError("datetime attribute is not supported for " + classname() ); 804 } 805 else { 806 string time = parseDate( val ); 807 if ( time.empty() ){ 808 throw ValueError( "invalid datetime, must be in YYYY-MM-DDThh:mm:ss format: " + val ); 809 } 810 _datetime = time; 811 } 812 } 813 else { 814 string def; 815 if ( doc() && 816 (def = doc()->default_datetime( annotation_type(), _set )) != "" ) { 817 _datetime = def; 818 } 819 } 820 val = kwargs.extract( "begintime" ); 821 if ( !val.empty() ) { 822 if ( !(BEGINTIME & supported) ) { 823 throw ValueError( "begintime attribute is not supported for " + classname() ); 824 } 825 else { 826 string time = parseTime( val ); 827 if ( time.empty() ) { 828 throw ValueError( "invalid begintime, must be in HH:MM:SS.mmm format: " + val ); 829 } 830 _begintime = time; 831 } 832 } 833 else { 834 _begintime.clear(); 835 } 836 val = kwargs.extract( "endtime" ); 837 if ( !val.empty() ) { 838 if ( !(ENDTIME & supported) ) { 839 throw ValueError( "endtime attribute is not supported for " + classname() ); 840 } 841 else { 842 string time = parseTime( val ); 843 if ( time.empty() ) { 844 throw ValueError( "invalid endtime, must be in HH:MM:SS.mmm format: " + val ); 845 } 846 _endtime = time; 847 } 848 } 849 else { 850 _endtime.clear(); 851 } 852 853 val = kwargs.extract( "src" ); 854 if ( !val.empty() ) { 855 if ( !(SRC & supported) ) { 856 throw ValueError( "src attribute is not supported for " + classname() ); 857 } 858 else { 859 _src = val; 860 } 861 } 862 else { 863 _src.clear(); 864 } 865 val = kwargs.extract( "tag" ); 866 if ( !val.empty() ) { 867 if ( !(TAG & supported) ) { 868 throw ValueError( "tag attribute is not supported for " + classname() ); 869 } 870 else { 871 _tags = val; 872 } 873 } 874 else { 875 _tags.clear(); 876 } 877 878 if ( SPACE & supported ){ 879 _space = true; 880 } 881 val = kwargs.extract( "space" ); 882 if ( !val.empty() ) { 883 if ( !(SPACE & supported) ){ 884 throw ValueError( "space attribute is not supported for " + classname() ); 885 } 886 else { 887 if ( val == "no" ) { 888 _space = false; 889 } 890 else if ( val == "yes" ) { 891 _space = true; 892 } 893 else { 894 throw ValueError( "invalid value for space attribute: '" + val + "'" ); 895 } 896 } 897 } 898 899 val = kwargs.extract( "metadata" ); 900 if ( !val.empty() ) { 901 if ( !(METADATA & supported) ) { 902 throw ValueError( "Metadata attribute is not supported for " + classname() ); 903 } 904 else { 905 _metadata = val; 906 if ( doc() && doc()->get_submetadata( _metadata ) == 0 ){ 907 throw KeyError( "No such metadata defined: " + _metadata ); 908 } 909 } 910 } 911 else { 912 _metadata.clear(); 913 } 914 val = kwargs.extract( "speaker" ); 915 if ( !val.empty() ) { 916 if ( !(SPEAKER & supported) ) { 917 throw ValueError( "speaker attribute is not supported for " + classname() ); 918 } 919 else { 920 _speaker = val; 921 } 922 } 923 else { 924 _speaker.clear(); 925 } 926 927 val = kwargs.extract( "textclass" ); 928 if ( !val.empty() ) { 929 if ( !(TEXTCLASS & supported) ) { 930 throw ValueError( "textclass attribute is not supported for " + classname() ); 931 } 932 else { 933 _textclass = val; 934 } 935 } 936 else { 937 _textclass = "current"; 938 } 939 940 val = kwargs.extract( "auth" ); 941 if ( !val.empty() ){ 942 _auth = stringTo<bool>( val ); 943 } 944 if ( doc() && !_id.empty() ) { 945 try { 946 doc()->add_doc_index( this ); 947 } 948 catch ( const DuplicateIDError& e ){ 949 if ( element_id() != WordReference_t ){ 950 throw; 951 } 952 } 953 } 954 kwargs.erase("typegroup"); //this is used in explicit form only, we can safely discard it 955 addFeatureNodes( kwargs ); 956 #ifdef LOG_SET_ATT 957 if ( doc() ){ 958 doc()->setdebug(db_level); 959 } 960 #endif 961 } 962 addFeatureNodes(const KWargs & kwargs)963 void AbstractElement::addFeatureNodes( const KWargs& kwargs ) { 964 /// add children to the object, based on the set of Key-Value pairs. 965 /*! 966 * \param kwargs a KWargs set of Key-Value pairs 967 * the given keys must be in the AttributeFeatures set. 968 * the values are used as class attribute for the new children 969 * will throw for unexpected attributes, except when in permisive mode 970 */ 971 for ( const auto& it: kwargs ) { 972 string tag = it.first; 973 if ( tag == "head" ) { 974 // "head" is special because the tag is "headfeature" 975 // this to avoid conflicts with the "head" tag! 976 tag = "headfeature"; 977 } 978 if ( AttributeFeatures.find( tag ) == AttributeFeatures.end() ) { 979 string message = "unsupported attribute: " + tag + "='" + it.second 980 + "' for node with tag '" + classname() + "'"; 981 if ( tag == "id" ){ 982 message += "\ndid you mean xml:id?"; 983 } 984 if ( doc() && doc()->permissive() ) { 985 cerr << message << endl; 986 } 987 else { 988 throw XmlError( message ); 989 } 990 } 991 KWargs newa; 992 newa["class"] = it.second; 993 FoliaElement *new_node = createElement( tag, doc() ); 994 new_node->setAttributes( newa ); 995 append( new_node ); 996 } 997 } 998 toDoubleString(double d)999 string toDoubleString( double d ){ 1000 if ( d == 1.0 ){ 1001 return "1.0"; 1002 } 1003 else if ( d == 0.0 ){ 1004 return "0.0"; 1005 } 1006 else { 1007 stringstream ss; 1008 ss.precision(6); 1009 ss << d; 1010 return ss.str(); 1011 } 1012 } 1013 collectAttributes() const1014 KWargs AllowXlink::collectAttributes() const { 1015 KWargs attribs; 1016 auto it = _xlink.find("type"); 1017 if ( it != _xlink.end() ){ 1018 string type = it->second; 1019 if ( type == "simple" || type == "locator" ){ 1020 it = _xlink.find("href"); 1021 if ( it != _xlink.end() ){ 1022 attribs["xlink:href"] = it->second; 1023 attribs["xlink:type"] = type; 1024 } 1025 it = _xlink.find("role"); 1026 if ( it != _xlink.end() ){ 1027 attribs["xlink:role"] = it->second; 1028 } 1029 it = _xlink.find("arcrole"); 1030 if ( it != _xlink.end() ){ 1031 attribs["xlink:arcrole"] = it->second; 1032 } 1033 it = _xlink.find("show"); 1034 if ( it != _xlink.end() ){ 1035 attribs["xlink:show"] = it->second; 1036 } 1037 it = _xlink.find("actuate"); 1038 if ( it != _xlink.end() ){ 1039 attribs["xlink:actuate"] = it->second; 1040 } 1041 it = _xlink.find("title"); 1042 if ( it != _xlink.end() ){ 1043 attribs["xlink:title"] = it->second; 1044 } 1045 it = _xlink.find("label"); 1046 if ( it != _xlink.end() ){ 1047 attribs["xlink:label"] = it->second; 1048 } 1049 } 1050 } 1051 return attribs; 1052 } 1053 set_typegroup(KWargs & attribs) const1054 void AbstractElement::set_typegroup( KWargs& attribs ) const { 1055 if ( isSubClass( AbstractStructureElement_t ) ){ 1056 attribs["typegroup"] = "structure"; 1057 } 1058 else if ( isSubClass( Feature_t ) ){ 1059 attribs["typegroup"] = "feature"; 1060 } 1061 else if ( isSubClass( AbstractInlineAnnotation_t ) ){ 1062 attribs["typegroup"] = "inline"; 1063 } 1064 else if ( isSubClass( AbstractHigherOrderAnnotation_t ) ){ 1065 attribs["typegroup"] = "higherorder"; 1066 } 1067 else if ( isSubClass( AbstractSpanRole_t ) ){ 1068 attribs["typegroup"] = "spanrole"; 1069 } 1070 else if ( isSubClass( AbstractSpanAnnotation_t ) ){ 1071 attribs["typegroup"] = "span"; 1072 } 1073 else if ( isSubClass( AbstractTextMarkup_t ) ){ 1074 attribs["typegroup"] = "textmarkup"; 1075 } 1076 else if ( isSubClass( AbstractContentAnnotation_t ) ){ 1077 attribs["typegroup"] = "content"; 1078 } 1079 else if ( isSubClass( AbstractAnnotationLayer_t ) ){ 1080 attribs["typegroup"] = "layer"; 1081 } 1082 else if ( isSubClass( AbstractSubtokenAnnotation_t ) ){ 1083 attribs["typegroup"] = "subtoken"; 1084 } 1085 else if ( isSubClass( AbstractCorrectionChild_t ) ){ 1086 attribs["typegroup"] = "correctionchild"; 1087 } 1088 else { 1089 cerr << "UNHANDLED " << element_id() << endl; 1090 } 1091 } 1092 collectAttributes() const1093 KWargs AbstractElement::collectAttributes() const { 1094 /// extract all Attribute-Value pairs from the object 1095 /*! 1096 * \return a KWargs set of Attribute-value pairs 1097 * Might also use declaration defaults and alias declarations to extract 1098 * default values 1099 */ 1100 KWargs attribs; 1101 bool Explicit = false; 1102 Attrib supported = required_attributes() | optional_attributes(); 1103 if ( doc() && doc()->has_explicit() ){ 1104 Explicit = true; 1105 set_typegroup( attribs ); 1106 } 1107 if ( !_id.empty() ) { 1108 attribs["xml:id"] = _id; 1109 } 1110 if ( _preserve_spaces == SPACE_FLAGS::PRESERVE ) { 1111 attribs["xml:space"] = "preserve"; 1112 } 1113 if ( doc() ){ 1114 string default_set = doc()->default_set( annotation_type() ); 1115 bool isDefaultSet = (_set == default_set); 1116 if ( Explicit && _set != "None" && !default_set.empty() ){ 1117 if ( _set.empty() ){ 1118 attribs["set"] = default_set; 1119 } 1120 else { 1121 attribs["set"] = _set; 1122 } 1123 } 1124 else if ( _set != "None" 1125 && !_set.empty() 1126 && !isDefaultSet ){ 1127 string ali = doc()->alias( annotation_type(), _set ); 1128 if ( ali.empty() ){ 1129 attribs["set"] = _set; 1130 } 1131 else { 1132 attribs["set"] = ali; 1133 } 1134 } 1135 if ( !_class.empty() ) { 1136 attribs["class"] = _class; 1137 } 1138 if ( !_processor.empty() ){ 1139 string tmp; 1140 try { 1141 tmp = doc()->default_processor( annotation_type(), _set ); 1142 if ( Explicit ){ 1143 attribs["processor"] = tmp; 1144 } 1145 } 1146 catch ( const NoDefaultError& ){ 1147 } 1148 catch ( ... ){ 1149 throw; 1150 } 1151 if ( tmp != _processor ){ 1152 attribs["processor"] = _processor; 1153 } 1154 } 1155 else { 1156 bool isDefaultAnn = true; 1157 if ( !_annotator.empty() && 1158 _annotator != doc()->default_annotator( annotation_type(), _set ) ) { 1159 isDefaultAnn = false; 1160 attribs["annotator"] = _annotator; 1161 } 1162 if ( _annotator_type != UNDEFINED ){ 1163 AnnotatorType at = doc()->default_annotatortype( annotation_type(), _set ); 1164 if ( (!isDefaultSet || !isDefaultAnn) 1165 && _annotator_type != at ) { 1166 if ( _annotator_type == AUTO ) { 1167 attribs["annotatortype"] = "auto"; 1168 } 1169 else if ( _annotator_type == MANUAL ) { 1170 attribs["annotatortype"] = "manual"; 1171 } 1172 } 1173 } 1174 } 1175 } 1176 if ( !_datetime.empty() && 1177 _datetime != doc()->default_datetime( annotation_type(), _set ) ) { 1178 attribs["datetime"] = _datetime; 1179 } 1180 if ( !_begintime.empty() ) { 1181 attribs["begintime"] = _begintime; 1182 } 1183 if ( !_endtime.empty() ) { 1184 attribs["endtime"] = _endtime; 1185 } 1186 if ( !_src.empty() ) { 1187 attribs["src"] = _src; 1188 } 1189 if ( !_tags.empty() ) { 1190 attribs["tag"] = _tags; 1191 } 1192 if ( !_metadata.empty() ) { 1193 attribs["metadata"] = _metadata; 1194 } 1195 if ( !_speaker.empty() ) { 1196 attribs["speaker"] = _speaker; 1197 } 1198 if ( ( TEXTCLASS & supported) 1199 && ( !_textclass.empty() && 1200 ( _textclass != "current" || Explicit ) ) ){ 1201 attribs["textclass"] = _textclass; 1202 } 1203 1204 if ( _confidence >= 0 ) { 1205 attribs["confidence"] = toDoubleString(_confidence); 1206 } 1207 if ( !_n.empty() ) { 1208 attribs["n"] = _n; 1209 } 1210 if ( !_auth ) { 1211 attribs["auth"] = "no"; 1212 } 1213 if ( SPACE & optional_attributes() ){ 1214 if ( !_space ) { 1215 attribs["space"] = "no"; 1216 } 1217 } 1218 return attribs; 1219 } 1220 xmlstring(bool add_ns) const1221 const string FoliaElement::xmlstring( bool add_ns ) const{ 1222 /// serialize a FoLiAElement to a string (XML fragment) 1223 /*! 1224 * \param add_ns Also add the NameSpace declarations 1225 * \return a string representation of the FoLiA XML 1226 */ 1227 return xmlstring( false, 0, add_ns ); 1228 } 1229 xmlstring(bool format,int indent,bool add_ns) const1230 const string FoliaElement::xmlstring( bool format, 1231 int indent, 1232 bool add_ns ) const{ 1233 /// serialize a FoLiAElement to a string (XML fragment) 1234 /*! 1235 * \param format allow output formating 1236 * \param indent number of spaces to indent 1237 * \param add_ns Also add the NameSpace declarations 1238 * \return a string representation of the FoLiA XML 1239 */ 1240 xmlNode *n = xml( true, false ); 1241 if ( add_ns ){ 1242 xmlSetNs( n, xmlNewNs( n, (const xmlChar *)NSFOLIA.c_str(), 0 ) ); 1243 } 1244 xmlBuffer *buf = xmlBufferCreate(); 1245 // xmlKeepBlanksDefault(0); 1246 xmlNodeDump( buf, 0, n, indent, (format?1:0) ); 1247 string result = (const char*)xmlBufferContent( buf ); 1248 xmlBufferFree( buf ); 1249 xmlFreeNode( n ); 1250 return result; 1251 } 1252 tagToAtt(const FoliaElement * c)1253 string tagToAtt( const FoliaElement* c ) { 1254 /// helper function. Given an element of type Feature_t, return the tag value 1255 /*! 1256 * \param c some FoLiAElement 1257 * \return the string value of attribute related to the tag of the parameter 1258 * if the element is of type Feature_t is has an asscociated attribute 1259 * otherwise not, and the empty string is returned. 1260 */ 1261 string att; 1262 if ( c->isSubClass( Feature_t ) ) { 1263 att = c->xmltag(); 1264 if ( att == "feat" ) { 1265 // "feat" is a Feature_t too. exclude! 1266 att = ""; 1267 } 1268 else if ( att == "headfeature" ) { 1269 // "head" is special 1270 att = "head"; 1271 } 1272 } 1273 return att; 1274 } 1275 1276 CheckText(const FoliaElement * parent,const FoliaElement * child,const string & cls)1277 void CheckText( const FoliaElement *parent, 1278 const FoliaElement *child, 1279 const string& cls ){ 1280 if ( parent 1281 && parent->element_id() != Correction_t 1282 && parent->hastext( cls ) ){ 1283 // check text consistency for parents with text 1284 // but SKIP Corrections 1285 UnicodeString s1 = parent->stricttext( cls ); 1286 UnicodeString s2 = child->stricttext( cls ); 1287 // cerr << "check parent: " << s1 << endl; 1288 // cerr << "check child: " << s2 << endl; 1289 // no retain tokenization, strict for both 1290 s1 = normalize_spaces( s1 ); 1291 s2 = normalize_spaces( s2 ); 1292 if ( !s1.isEmpty() && !s2.isEmpty() ){ 1293 bool test_fail; 1294 if ( child->isSubClass( TextContent_t ) 1295 || child->isSubClass( AbstractTextMarkup_t ) 1296 || child->isSubClass( String_t ) 1297 || child->isSubClass( Word_t ) ){ 1298 // Words and Strings are 'per definition' PART of their parents 1299 test_fail = ( s1.indexOf( s2 ) < 0 ); // aren't they? 1300 } 1301 else { 1302 // otherwise an exacte match is needed 1303 test_fail = ( s1 != s2 ); 1304 } 1305 if ( test_fail ){ 1306 throw InconsistentText( "adding text (class=" 1307 + cls + ") from node: " + child->xmltag() 1308 + "(" + child->id() + ")" 1309 + " with value\n'" + TiCC::UnicodeToUTF8(s2) 1310 + "'\n to element: " + parent->xmltag() + 1311 + "(" + parent->id() + ") which already has " 1312 + "text in that class and value: \n'" 1313 + TiCC::UnicodeToUTF8(s1) + "'\n" ); 1314 } 1315 } 1316 } 1317 } 1318 CheckText2(const FoliaElement * parent,const FoliaElement * child,const string & cls,bool trim_spaces)1319 void CheckText2( const FoliaElement *parent, 1320 const FoliaElement *child, 1321 const string& cls, 1322 bool trim_spaces ){ 1323 if ( parent 1324 && parent->hastext( cls ) ){ 1325 // check text consistency for parents with text 1326 // but SKIP Corrections 1327 // no retain tokenization, strict for parent, deeper for child 1328 TextPolicy tp( cls ); 1329 tp.set( TEXT_FLAGS::STRICT ); 1330 if ( !trim_spaces ) { 1331 tp.set( TEXT_FLAGS::NO_TRIM_SPACES ); 1332 } 1333 UnicodeString s1 = parent->text( tp ); 1334 tp.clear( TEXT_FLAGS::STRICT ); 1335 UnicodeString s2 = child->text( tp ); 1336 s1 = normalize_spaces( s1 ); 1337 s2 = normalize_spaces( s2 ); 1338 bool test_fail; 1339 if ( child->isSubClass( Word_t ) 1340 || child->isSubClass( String_t ) 1341 || child->isSubClass( AbstractTextMarkup_t ) ) { 1342 // Words, Strings and AbstractTextMarkup are 'per definition' PART of 1343 // their text parents 1344 test_fail = ( s1.indexOf( s2 ) < 0 ); // aren't they? 1345 } 1346 else { 1347 // otherwise an exacte match is needed 1348 test_fail = ( s1 != s2 ); 1349 } 1350 if ( test_fail ){ 1351 bool warn_only = false; 1352 if ( trim_spaces ) { 1353 //ok, we failed according to the >v2.4.1 rules 1354 //but do we also fail under the old rules? 1355 try { 1356 child->check_text_consistency(false); 1357 warn_only = true; 1358 } catch ( const InconsistentText& ) { 1359 //ignore, we raise the newer error 1360 } 1361 } 1362 string msg = "conflicting text (class=" 1363 + cls + ") from node: " + child->xmltag() 1364 + "(" + child->id() + ")" 1365 + " with value\n'" + TiCC::UnicodeToUTF8(s2) 1366 + "'\n with parent: " + parent->xmltag() + 1367 + "(" + parent->id() + ") which already has " 1368 + "text in that class and value: \n'" 1369 + TiCC::UnicodeToUTF8(s1) + "'\n"; 1370 if ( warn_only ) { 1371 msg += "However, according to the older rules (<v2.4.1) the text is consistent. So we are treating this as a warning rather than an error. We do recommend fixing this if this is a document you intend to publish.\n"; 1372 cerr << "WARNING: inconsistent text: " << msg << endl; 1373 parent->doc()->increment_warn_count(); 1374 } 1375 else { 1376 throw InconsistentText(msg); 1377 } 1378 } 1379 } 1380 } 1381 check_append_text_consistency(const FoliaElement * child) const1382 void AbstractElement::check_append_text_consistency( const FoliaElement *child ) const { 1383 /// check the text consistency of a new child against the Element. 1384 /*! 1385 * \param child the new child 1386 * 1387 * When a document is available AND it has the checktext() property 1388 * the text of the child is checked against the text of the parent. 1389 * 1390 * will throw on error. 1391 * 1392 * For Word, String and TextContent children, we assume that their text is 1393 * embedded in the parents text. 1394 * 1395 * For all other cases, the text of the child should match the parents text. 1396 * \note Matching is opaque to spaces, newlines and tabs 1397 */ 1398 // cerr << "VOOR checkappend I am=" << this << endl; 1399 // cerr << "VOOR checkappend child=" << child << endl; 1400 if ( !doc() || !doc()->checktext() || doc()->fixtext() ){ 1401 return; 1402 } 1403 string cls = child->cls(); 1404 // cerr << "HIER 2 " << cls << endl; 1405 if ( child->size() == 0 1406 || ( child->is_textcontainer() 1407 && !child->hastext( cls ) ) ){ 1408 // no use to proceed. not adding real text 1409 return; 1410 } 1411 // cerr << "HIER 3 " << endl; 1412 const FoliaElement *parent = 0; 1413 if ( child->is_textcontainer() ){ 1414 parent = this->parent(); 1415 } 1416 else { 1417 parent = this; 1418 cls = child->index(0)->cls(); 1419 } 1420 // cerr << "PARENT? " << parent << endl; 1421 CheckText( parent, child, cls ); 1422 } 1423 check_text_consistency(bool trim_spaces) const1424 void AbstractElement::check_text_consistency( bool trim_spaces ) const { 1425 /// check the text consistency of the combined text of the children 1426 /// against the text of the Element. 1427 /*! 1428 * When a document is available AND it has the checktext() property 1429 * the combined text of ALL the children is checked against the text of 1430 * the parent. 1431 * 1432 * will throw on error 1433 * 1434 * For Word and String children, we only assume that their text is 1435 * embedded in the parents text. 1436 * 1437 * For all other cases, the text should exactly match the parents text. 1438 * \note Matching is opaque to spaces, newlines and tabs 1439 */ 1440 if ( !doc() || !doc()->checktext() || !printable() ){ 1441 return; 1442 } 1443 1444 string cls = this->cls(); 1445 FoliaElement *parent = this->parent(); 1446 CheckText2( parent, this, cls, trim_spaces ); 1447 } 1448 check_text_consistency_while_parsing(bool trim_spaces,bool debug)1449 void AbstractElement::check_text_consistency_while_parsing( bool trim_spaces, 1450 bool debug ) { 1451 // this block was moved from parseXml into a separate function 1452 // it remains to be seen how much overlaps with check_text_consistency() 1453 // and whether we can't make do with one function 1454 // 1455 // unlike the other function, this does do some fixing when requested 1456 // 1457 1458 if ( debug ){ 1459 cerr << "DEBUG: BEGIN check_text_consistency_while_parsing(" 1460 << trim_spaces << ")" << endl; 1461 } 1462 vector<TextContent*> tv = select<TextContent>( false ); 1463 // first see which text classes are present 1464 set<string> classes; 1465 for ( const auto& it : tv ){ 1466 classes.insert( it->cls() ); 1467 } 1468 // check the text for every text class 1469 for ( const auto& st : classes ){ 1470 UnicodeString s1, s2; 1471 TextPolicy tp( st ); 1472 tp.set_correction_handling(CORRECTION_HANDLING::EITHER); 1473 tp.set( TEXT_FLAGS::STRICT ); 1474 tp.set_debug( debug ); 1475 if ( !trim_spaces ) { 1476 tp.set( TEXT_FLAGS::NO_TRIM_SPACES ); 1477 } 1478 try { 1479 s1 = text( tp ); // no retain tokenization, strict 1480 } 1481 catch (...){ 1482 } 1483 if ( !s1.isEmpty() ){ 1484 // cerr << "S1: " << s1 << endl; 1485 tp.clear( TEXT_FLAGS::STRICT ); 1486 try { 1487 s2 = text( tp ); // no retain tokenization, no strict 1488 } 1489 catch (...){ 1490 } 1491 // cerr << "S2: " << s2 << endl; 1492 s1 = normalize_spaces( s1 ); 1493 s2 = normalize_spaces( s2 ); 1494 if ( !s2.isEmpty() && s1 != s2 ){ 1495 if ( doc()->fixtext() ){ 1496 // cerr << "FIX: " << s1 << "==>" << s2 << endl; 1497 KWargs args; 1498 args["value"] = TiCC::UnicodeToUTF8(s2); 1499 args["class"] = st; 1500 TextContent *node = new TextContent( args, doc() ); 1501 this->replace( node ); 1502 } 1503 else { 1504 bool warn_only = false; 1505 if ( trim_spaces ) { 1506 //ok, we failed according to the >v2.4.1 rules 1507 //but do we also fail under the old rules? 1508 try { 1509 if ( debug ){ 1510 cerr << "DEBUG: (testing according to older rules now)" << endl; 1511 } 1512 this->check_text_consistency_while_parsing(false); 1513 warn_only = true; 1514 } 1515 catch ( const InconsistentText& e ) { 1516 if ( debug ){ 1517 cerr << "(tested according to older rules (<v2.4.1) as well, but this failed too)" << endl; 1518 } 1519 //ignore, we raise the newer error 1520 } 1521 } 1522 string msg = "node " + xmltag() + "(" + id() 1523 + ") has a mismatch for the text in set:" + st 1524 + "\nthe element text ='" + TiCC::UnicodeToUTF8(s1) 1525 + "'\n" + " the deeper text ='" + TiCC::UnicodeToUTF8(s2) + "'"; 1526 if ( warn_only ) { 1527 msg += "\nHOWEVER, according to the older rules (<v2.4.1) the text is consistent. So we are treating this as a warning rather than an error. We do recommend fixing this if this is a document you intend to publish.\n"; 1528 cerr << "WARNING: inconsistent text: " << msg << endl; 1529 doc()->increment_warn_count(); 1530 } 1531 else { 1532 if ( debug ){ 1533 cerr << "DEBUG: CONSISTENCYERROR check_text_consistency_while_parsing(" << trim_spaces << ")" << endl; 1534 } 1535 throw InconsistentText(msg); 1536 } 1537 } 1538 } 1539 } 1540 } 1541 if ( debug ){ 1542 cerr << "DEBUG: END-OK check_text_consistency_while_parsing(" 1543 << trim_spaces << ")" << endl; 1544 } 1545 } 1546 xml(bool recursive,bool kanon) const1547 xmlNode *AbstractElement::xml( bool recursive, bool kanon ) const { 1548 /// convert an Element to an xmlNode 1549 /*! 1550 * \param recursive Convert the children too, creating a xmlNode tree 1551 * \param kanon Output in a canonical form to make comparions easy 1552 * \return am xmlNode object(-tree) 1553 */ 1554 xmlNode *e = XmlNewNode( foliaNs(), xmltag() ); 1555 KWargs attribs = collectAttributes(); 1556 if ( _preserve_spaces == SPACE_FLAGS::PRESERVE ){ 1557 // we carry an 'xml:space="preserve" flag? 1558 if ( doc()->preserve_spaces() ){ 1559 // if our ancestor did also, clear it here 1560 attribs.extract( "xml:space" ); 1561 } 1562 else { 1563 // otherwise leave it, and notify our document 1564 doc()->set_preserve_spaces(true); 1565 } 1566 } 1567 else if ( doc()->preserve_spaces() ){ 1568 // this subtree should go back to "default" then 1569 attribs["xml:space"] = "default"; 1570 // and the doc needs to know it 1571 doc()->set_preserve_spaces(false); 1572 } 1573 set<FoliaElement *> attribute_elements; 1574 // nodes that can be represented as attributes are converted to atributes 1575 // and excluded of 'normal' output. 1576 1577 if ( !doc()->has_explicit() ){ 1578 map<string,int> af_map; 1579 // first we search al features that can be serialized to an attribute 1580 // and count them! 1581 for ( const auto& el : _data ) { 1582 string at = tagToAtt( el ); 1583 if ( !at.empty() ) { 1584 ++af_map[at]; 1585 } 1586 } 1587 // ok, now we create attributes for those that only occur once 1588 for ( const auto& el : _data ) { 1589 string at = tagToAtt( el ); 1590 if ( !at.empty() && af_map[at] == 1 ) { 1591 attribs[at] = el->cls(); 1592 attribute_elements.insert( el ); 1593 } 1594 } 1595 } 1596 addAttributes( e, attribs ); 1597 if ( _data.empty() ){ 1598 return e; // we are done 1599 } 1600 if ( recursive ) { 1601 // append children: 1602 // we want make sure that text elements are in the right order, 1603 // in front and the 'current' class first 1604 list<FoliaElement *> currenttextelements; 1605 list<FoliaElement *> textelements; 1606 list<FoliaElement *> otherelements; 1607 list<FoliaElement *> commentelements; 1608 multimap<ElementType, FoliaElement *, std::greater<ElementType>> otherelementsMap; 1609 for ( const auto& el : _data ) { 1610 if ( attribute_elements.find(el) == attribute_elements.end() ) { 1611 if ( el->isinstance(TextContent_t) ) { 1612 if ( el->cls() == "current" ) { 1613 currenttextelements.push_back( el ); 1614 } 1615 else { 1616 textelements.push_back( el ); 1617 } 1618 } 1619 else { 1620 if ( kanon ) { 1621 otherelementsMap.insert( make_pair( el->element_id(), el ) ); 1622 } 1623 else { 1624 if ( el->isinstance(XmlComment_t) 1625 && currenttextelements.empty() 1626 && textelements.empty() ) { 1627 commentelements.push_back( el ); 1628 } 1629 else { 1630 otherelements.push_back( el ); 1631 } 1632 } 1633 } 1634 } 1635 } 1636 for ( const auto& cel : commentelements ) { 1637 xmlAddChild( e, cel->xml( recursive, kanon ) ); 1638 } 1639 for ( const auto& tel : currenttextelements ) { 1640 xmlAddChild( e, tel->xml( recursive, false ) ); 1641 // don't change the internal sequences of TextContent elements 1642 } 1643 for ( const auto& tel : textelements ) { 1644 xmlAddChild( e, tel->xml( recursive, false ) ); 1645 // don't change the internal sequences of TextContent elements 1646 } 1647 if ( !kanon ) { 1648 for ( const auto& oem : otherelements ) { 1649 xmlAddChild( e, oem->xml( recursive, kanon ) ); 1650 } 1651 } 1652 else { 1653 for ( const auto& oem : otherelementsMap ) { 1654 xmlAddChild( e, oem.second->xml( recursive, kanon ) ); 1655 } 1656 } 1657 check_text_consistency(); 1658 } 1659 return e; 1660 } 1661 str(const string & cls) const1662 const string AbstractElement::str( const string& cls ) const { 1663 /// return the text value of this element 1664 /*! 1665 * \param cls The desired textclass 1666 * \return the string value (UTF8 encoded) 1667 * 1668 * if this is a TextContent or it may contain TextContent 1669 * then return the associated text() 1670 * 1671 * if this is a PhonContent or it may contain PhonContent 1672 * then return the associated phon() 1673 * 1674 * otherwise return the empty string 1675 */ 1676 UnicodeString us; 1677 try { 1678 us = text(cls); 1679 } 1680 catch( const NoSuchText& ){ 1681 try { 1682 us = phon(cls); 1683 } 1684 catch( const NoSuchPhon&){ 1685 // No TextContent or Phone is allowed 1686 } 1687 } 1688 return TiCC::UnicodeToUTF8( us ); 1689 } 1690 str(const TextPolicy & tp) const1691 const string AbstractElement::str( const TextPolicy& tp ) const { 1692 /// return the text value of this element 1693 /*! 1694 * \param tp the TextPolicy to use 1695 * \return the string value (UTF8 encoded) 1696 * 1697 * if this is a TextContent or it may contain TextContent 1698 * then return the associated text() 1699 * 1700 * if this is a PhonContent or it may contain PhonContent 1701 * then return the associated phon() 1702 * 1703 * otherwise return the empty string 1704 */ 1705 UnicodeString us; 1706 try { 1707 us = text( tp ); 1708 } 1709 catch( const NoSuchText& ){ 1710 try { 1711 us = phon( tp ); 1712 } 1713 catch( const NoSuchPhon&){ 1714 // No TextContent or Phone is allowed 1715 } 1716 } 1717 return TiCC::UnicodeToUTF8( us ); 1718 } 1719 speech_src() const1720 const string AbstractElement::speech_src() const { 1721 /// give the value of the _scr of an element 1722 /*! 1723 * return a (possibly empty) string. 1724 * 1725 * This function recurses upward to the first element which carries _src 1726 */ 1727 if ( !_src.empty() ) { 1728 return _src; 1729 } 1730 if ( _parent ) { 1731 return _parent->speech_src(); 1732 } 1733 return ""; 1734 } 1735 speech_speaker() const1736 const string AbstractElement::speech_speaker() const { 1737 /// give the value of the _speaker of an element 1738 /*! 1739 * return a (possibly empty) string. 1740 * 1741 * This function recurses upward to the first element which carries _speaker 1742 */ 1743 if ( !_speaker.empty() ) { 1744 return _speaker; 1745 } 1746 if ( _parent ) { 1747 return _parent->speech_speaker(); 1748 } 1749 return ""; 1750 } 1751 language(const string & st) const1752 const string AbstractElement::language( const string& st ) const { 1753 /// give the language value of an element 1754 /*! 1755 * \param st the setname to us for searching 1756 * The search will start at the object, and recurse upward until 1757 * the document level, where it will return the Documents language 1758 * Might return "" when no match is found 1759 */ 1760 set<ElementType> exclude; 1761 vector<LangAnnotation*> v = select<LangAnnotation>( st, exclude, false ); 1762 if ( v.size() > 0 ){ 1763 return v[0]->cls(); 1764 } 1765 else if ( _parent ){ 1766 return _parent->language( st ); 1767 } 1768 else { 1769 return doc()->language(); 1770 } 1771 } 1772 hastext(const string & cls) const1773 bool FoliaElement::hastext( const string& cls ) const { 1774 /// check if the element has a TextContent with class 'cls' 1775 /*! 1776 * \param cls The desired textclass 1777 * \return true if there is a TextContent available. Otherwise false 1778 */ 1779 try { 1780 this->text_content(cls); 1781 return true; 1782 } catch ( const NoSuchText& e ) { 1783 return false; 1784 } 1785 } 1786 hasphon(const string & cls) const1787 bool FoliaElement::hasphon( const string& cls ) const { 1788 /// check if the element has a PhonContent with class 'cls' 1789 /*! 1790 * \param cls The desired textclass 1791 * \return true if there is a PhonContent available. Otherwise false 1792 */ 1793 try { 1794 this->phon_content(cls); 1795 return true; 1796 } catch ( const NoSuchPhon& e ) { 1797 return false; 1798 } 1799 } 1800 get_delimiter(const TextPolicy & tp) const1801 const string& AbstractElement::get_delimiter( const TextPolicy& tp ) const { 1802 /// get the default delimiter of this object. 1803 /*! 1804 * \param tp the TextPolicy to use 1805 * \return a string representing the delimiter 1806 * 1807 * If the object has a TEXTDELIMITER property thats is returned 1808 * Otherwise, the last child is taken and its delimiter is returned IF 1809 * it is a Structure Element. 1810 * When this test fails, an empty string is returned, UNLESS the element has 1811 * the SPACE attribute AND retaintok is specified 1812 */ 1813 bool retaintok = tp.is_set( TEXT_FLAGS::RETAIN ); 1814 if ( tp.debug() ){ 1815 cerr << "IN <" << xmltag() << ">:get_delimiter (" << retaintok << ")" 1816 << endl; 1817 } 1818 if ( (SPACE & optional_attributes()) ){ 1819 if ( ! ( _space || retaintok ) ){ 1820 if ( tp.debug() ){ 1821 cerr << " space = NO, return: '" << EMPTY_STRING << "'" << endl; 1822 } 1823 return EMPTY_STRING; 1824 } 1825 } 1826 1827 if ( !_data.empty() ){ 1828 FoliaElement *last = _data.back(); 1829 if ( last && 1830 last->isSubClass(AbstractStructureElement_t) 1831 && !last->space() ){ 1832 return EMPTY_STRING; 1833 } 1834 } 1835 if ( text_delimiter() != "NONE" ) { 1836 return text_delimiter(); 1837 } 1838 else if ( _data.size() > 0 ) { 1839 // attempt to get a delimiter from the last child 1840 FoliaElement *last = _data.back(); 1841 if ( last->isSubClass(AbstractStructureElement_t) ){ 1842 const string& det = last->get_delimiter( tp ); 1843 if ( tp.debug() ){ 1844 cerr << "out <" << xmltag() << ">:get_delimiter ==> '" << det << "'" 1845 << endl; 1846 } 1847 return det; 1848 } 1849 } 1850 if ( tp.debug() ){ 1851 cerr << "out <" << xmltag() << ">:get_delimiter ==> ''" << endl; 1852 } 1853 return EMPTY_STRING; 1854 } 1855 is_space(const UChar32 kar)1856 bool is_space( const UChar32 kar ){ 1857 return ( kar == 0x0020 // space 1858 || kar == 0x0009 // tab 1859 || kar == 0x000a // newline 1860 || kar == 0x000d ); // carriage return 1861 } 1862 text_container_text(const TextPolicy & tp) const1863 UnicodeString AbstractElement::text_container_text( const TextPolicy& tp ) const { 1864 string desired_class = tp.get_class(); 1865 if ( isinstance( TextContent_t ) 1866 && cls() != desired_class ) { 1867 // take a shortcut for TextContent in wrong class 1868 if ( tp.debug() ){ 1869 cerr << "TextContent shortcut, class=" << cls() 1870 << " but looking for: " << desired_class << endl; 1871 } 1872 return ""; 1873 } 1874 UnicodeString result; 1875 bool pendingspace = false; 1876 bool trim_spaces = !tp.is_set( TEXT_FLAGS::NO_TRIM_SPACES); 1877 for ( const auto& d : _data ){ 1878 if (d->isinstance( XmlText_t)) { 1879 // 'true' text child 1880 if (pendingspace) { 1881 result += " "; 1882 pendingspace = false; 1883 } 1884 if ( trim_spaces) { 1885 //This implements https://github.com/proycon/folia/issues/88 1886 //FoLiA >= v2.5 behaviour (introduced earlier in v2.4.1 but modified thereafter) 1887 const int l = result.length(); 1888 UnicodeString text = d->text( tp ); 1889 int begin = 0; 1890 int linenr = 0; 1891 for ( int i = 0; i < text.length(); ++i ) { 1892 if ( text[i] == 0x000a 1893 || (i == text.length() - 1) ) { 1894 //newline or end 1895 UnicodeString line; 1896 if ( text[i] == 0x000a ) { //newline 1897 line = UnicodeString(text, begin, i - begin); 1898 } 1899 else { 1900 line = UnicodeString(text, begin, text.length() - begin); 1901 } 1902 begin = i+1; 1903 1904 UnicodeString subresult; 1905 if ( _preserve_spaces == SPACE_FLAGS::PRESERVE) { 1906 if ( line.length() > 0 1907 && line[line.length() - 1] == 0x000d) { 1908 //carriage return 1909 //remove artefacts of any DOS-style line endings (not sure if still 1910 //needed here but better safe than sorry) 1911 line = UnicodeString(line, 0, line.length() - 1); 1912 } 1913 subresult = line; 1914 } 1915 else { 1916 subresult = normalize_spaces(trim_space(line)); 1917 } 1918 1919 if ( (linenr > 0) 1920 && (subresult.length() > 0) 1921 && (result.length() != l) ) { 1922 //insert spaces between lines that used to be newline separated 1923 result.append((UChar32) 0x0020); 1924 } 1925 else if ( (subresult.length() > 0) 1926 && (line.length() > 0) 1927 && ( is_space(line[0]) ) 1928 && this->_preserve_spaces != SPACE_FLAGS::PRESERVE ) { 1929 //we have leading indentation we may need to collapse or ignore entirely 1930 //we can't be sure yet what to do so we add a temporary placeholder \1 1931 //this will later be handled in postprocess_spaces() (converts to a space only if no space preceeds it) 1932 result.append(0x0001); 1933 } 1934 result += subresult; 1935 linenr++; 1936 } 1937 } 1938 1939 if ( this->_preserve_spaces != SPACE_FLAGS::PRESERVE 1940 && text.length() > 0 1941 && result.length() > 0 1942 && is_space(text[text.length() - 1]) 1943 && !is_space(result[result.length() - 1]) ){ 1944 //this item has trailing spaces but we stripped them 1945 //this may be premature so 1946 //we reserve to output them later in case there is a next item 1947 pendingspace = true; 1948 } 1949 } 1950 else { 1951 //old FoLiA <= v2.4.1 behaviour, we don't trim anything 1952 result += d->text( tp ); 1953 } 1954 } 1955 else if ( d->printable() ){ 1956 // this is some TextMarkup I hope 1957 if (pendingspace) { 1958 if (!d->implicitspace()) result += " "; 1959 pendingspace = false; 1960 } 1961 string tv = d->tag(); 1962 if ( !tv.empty() ){ 1963 vector<string> tvv = TiCC::split(tv); 1964 bool no_match = true; 1965 for ( const auto& v : tvv ){ 1966 TextPolicy::tag_handler match = tp.get_handler( v ); 1967 if ( match ){ 1968 no_match = false; 1969 UnicodeString tmp_result = match( d, tp ); 1970 result += tmp_result; 1971 } 1972 } 1973 if ( no_match ){ 1974 result += d->text( tp ); 1975 } 1976 } 1977 else { 1978 result += d->text( tp ); 1979 } 1980 if ( !result.isEmpty() ){ 1981 const string& delim = d->get_delimiter( tp ); 1982 if ( tp.debug() ){ 1983 cerr << "append delimiter: '" << delim << "'" << endl; 1984 } 1985 result += TiCC::UnicodeFromUTF8(delim); 1986 } 1987 } 1988 else { 1989 // non interesting stuff like <feature>, <comment> etc. 1990 } 1991 } 1992 if (trim_spaces && this->spaces_flag() != SPACE_FLAGS::PRESERVE) { 1993 result = postprocess_spaces(result); 1994 } 1995 if ( tp.debug() ){ 1996 cerr << "TEXT(" << tp.get_class() << ") on a textcontainer :" << xmltag() 1997 << " returned '" << result << "'" << endl; 1998 } 1999 return result; 2000 } 2001 private_text(const TextPolicy & tp) const2002 const UnicodeString AbstractElement::private_text( const TextPolicy& tp ) const { 2003 /// get the UnicodeString value of an element 2004 /*! 2005 * \param tp The TextPolicy to use 2006 * \return the Unicode String representation found. Throws when 2007 * no text can be found 2008 */ 2009 bool strict = tp.is_set( TEXT_FLAGS::STRICT ); 2010 bool show_hidden = tp.is_set( TEXT_FLAGS::HIDDEN ); 2011 bool trim = !tp.is_set( TEXT_FLAGS::NO_TRIM_SPACES ); 2012 if ( tp.debug() ){ 2013 cerr << "TEXT(" << tp.get_class() << ") on node : " << xmltag() << " id=" 2014 << id() << endl; 2015 cerr << "TextPolicy: " << tp << endl; 2016 } 2017 if ( strict ) { 2018 /// WARNING. Don't call text(tp) here. We will get into an infinite 2019 /// recursion. Can't we do better then calling ourself again, sort of? 2020 TextPolicy tmp = tp; 2021 tmp.clear( TEXT_FLAGS::STRICT ); 2022 return text_content(tmp)->text( tmp ); 2023 } 2024 else if ( !printable() || ( hidden() && !show_hidden ) ){ 2025 throw NoSuchText( "NON printable element: " + xmltag() ); 2026 } 2027 else if ( is_textcontainer() ){ 2028 return text_container_text( tp ); 2029 } 2030 else { 2031 // 2032 UnicodeString result = deeptext( tp ); 2033 if ( result.isEmpty() ) { 2034 TextPolicy tmp = tp; 2035 tmp.set( TEXT_FLAGS::STRICT ); 2036 if ( !trim ) { 2037 tmp.set( TEXT_FLAGS::NO_TRIM_SPACES ); 2038 } 2039 result = text( tmp ); 2040 } 2041 if ( result.isEmpty() ) { 2042 throw NoSuchText( "on tag " + xmltag() + " nor it's children" ); 2043 } 2044 return result; 2045 } 2046 } 2047 text(const TextPolicy & tp) const2048 const UnicodeString AbstractElement::text( const TextPolicy& tp ) const { 2049 /// get the UnicodeString text value of an element 2050 /*! 2051 * \param tp a TextPolicy 2052 */ 2053 if ( tp.debug() ){ 2054 cerr << "DEBUG <" << xmltag() << ">.text() Policy=" << tp << endl; 2055 } 2056 return private_text( tp ); 2057 } 2058 text(const string & cls,TEXT_FLAGS flags,bool debug) const2059 const UnicodeString AbstractElement::text( const string& cls, 2060 TEXT_FLAGS flags, 2061 bool debug ) const { 2062 /// get the UnicodeString text value of an element 2063 /*! 2064 * \param cls the textclass the text should be in 2065 * \param flags the search parameters to use. See TEXT_FLAGS. 2066 * \param debug enables debugging when true 2067 */ 2068 TextPolicy tp( cls, flags ); 2069 tp.set_debug( debug ); 2070 if ( debug ){ 2071 cerr << "DEBUG <" << xmltag() << ">.text() Policy=" << tp << endl; 2072 } 2073 return private_text( tp ); 2074 } 2075 setAttributes(KWargs & kwargs)2076 void FoLiA::setAttributes( KWargs& kwargs ){ 2077 /// set the attributes of a FoLiA top node 2078 /*! 2079 * \param kwargs an attribute-value list 2080 * the FoLiA top is special, as it may accept special attributes 2081 * which are stored in the associated document, and NOT in the node 2082 */ 2083 // we store some attributes in the document itself 2084 doc()->setDocumentProps( kwargs ); 2085 // use remaining attributes for the FoLiA node 2086 // probably only the ID 2087 AbstractElement::setAttributes( kwargs ); 2088 } 2089 parseXml(const xmlNode * node)2090 FoliaElement* FoLiA::parseXml( const xmlNode *node ){ 2091 /// 2092 /// recursively parse a complete FoLiA tree 2093 /// \param node an xmlNode that MUST be a FoLiA root node 2094 /// \return the parsed tree. Throws on error. 2095 /*! 2096 * the topnode is special, as it also carries the main document properties 2097 * 2098 */ 2099 KWargs atts = getAttributes( node ); 2100 if ( !doc() ){ 2101 throw XmlError( "FoLiA root without Document" ); 2102 } 2103 setAttributes( atts ); 2104 bool meta_found = false; 2105 xmlNode *p = node->children; 2106 while ( p ){ 2107 if ( p->type == XML_ELEMENT_NODE ){ 2108 if ( TiCC::Name(p) == "metadata" && 2109 checkNS( p, NSFOLIA ) ){ 2110 if ( doc()->debug > 1 ){ 2111 cerr << "Found metadata" << endl; 2112 } 2113 doc()->parse_metadata( p ); 2114 meta_found = true; 2115 } 2116 else if ( p && TiCC::getNS(p) == NSFOLIA ){ 2117 string tag = TiCC::Name( p ); 2118 if ( !meta_found && !doc()->version_below(1,6) ){ 2119 throw XmlError( "Expecting element metadata, got '" + tag + "'" ); 2120 } 2121 FoliaElement *t = AbstractElement::createElement( tag, doc() ); 2122 if ( t ){ 2123 if ( doc()->debug > 2 ){ 2124 cerr << "created " << t << endl; 2125 } 2126 t = t->parseXml( p ); 2127 if ( t ){ 2128 if ( doc()->debug > 2 ){ 2129 cerr << "extend " << this << " met " << tag << endl; 2130 } 2131 this->append( t ); 2132 } 2133 } 2134 } 2135 } 2136 else if ( p->type == XML_TEXT_NODE ){ 2137 // This MUST be 'empty space', so only spaces and tabs formatting 2138 string txt = TextValue(p); 2139 txt = TiCC::trim(txt); 2140 if ( !txt.empty() ){ 2141 if ( p->prev ){ 2142 string tg = "<" + Name(p->prev) + ">"; 2143 throw XmlError( "found extra text '" + txt + "' after element " 2144 + tg + ", NOT allowed there." ); 2145 } 2146 else { 2147 string tg = "<" + Name(p->parent) + ">"; 2148 throw XmlError( "found extra text '" + txt + "' inside element " 2149 + tg + ", NOT allowed there." ); 2150 } 2151 } 2152 } 2153 p = p->next; 2154 } 2155 return this; 2156 } 2157 trim_space(const UnicodeString & in)2158 UnicodeString trim_space( const UnicodeString& in ){ 2159 /// remove leading and traling spaces. KEEP newlines etc. 2160 /*! 2161 * \param in an untrimmed UnicodeString 2162 * \return an UnicodeString with all leading and trailing spaces removed. 2163 * Other 'whitespace' characters like newline and tab are retained! 2164 */ 2165 const char16_t space = 0x0020; 2166 // cerr << "in = '" << in << "'" << endl; 2167 UnicodeString out; 2168 int i = 0; 2169 for( ; i < in.length(); ++i ){ 2170 // cerr << "start: bekijk:" << UnicodeString(in[i]) << endl; 2171 if ( in[i] != space ){ 2172 break; 2173 } 2174 } 2175 int j = in.length()-1; 2176 for( ; j >= 0; --j ){ 2177 // cerr << "end: bekijk:" << UnicodeString(in[j]) << endl; 2178 if ( in[j] != space ){ 2179 break; 2180 } 2181 } 2182 // cerr << "I=" << i << endl; 2183 // cerr << "J=" << j << endl; 2184 if ( j < i ){ 2185 // cerr << "out = LEEG" << endl; 2186 return out; 2187 } 2188 out = UnicodeString( in, i, j-i+1 ); 2189 // cerr << "out = '" << out << "'" << endl; 2190 return out; 2191 } 2192 postprocess_spaces(const UnicodeString & in)2193 UnicodeString postprocess_spaces( const UnicodeString& in ){ 2194 ///Postprocessing for spaces, translates temporary \1 codepoints to spaces 2195 /// if they are are not preceeded by whitespace 2196 bool need_postprocessing = false; 2197 for (int i = 0; i < in.length(); i++) { 2198 if (in[i] == 0x0001) { 2199 need_postprocessing = true; 2200 break; 2201 } 2202 } 2203 if (!need_postprocessing) { 2204 return in; 2205 } 2206 else { 2207 UnicodeString result; 2208 for (int i = 0; i < in.length(); ++i) { 2209 if ( in[i] == 0x0001 ) { 2210 if ( i > 0 2211 && !is_space(in[i-1]) ){ 2212 result.append((UChar32) 0x0020); //add a space 2213 // 1 byte is dropped otherwise 2214 } 2215 } 2216 else { 2217 result.append(in[i]); 2218 } 2219 } 2220 return result; 2221 } 2222 } 2223 check_end(const UnicodeString & us,bool & only)2224 bool check_end( const UnicodeString& us, bool& only ){ 2225 /// check for newline characters at the end 2226 /*! 2227 * \param us the UnicodeString to check for '\n' 2228 * \param only set to true if the whole string consists of only '\n' 2229 * \return true when at least 1 '\n' is found at the end. 2230 */ 2231 only = false; 2232 string tmp = TiCC::UnicodeToUTF8( us ); 2233 int j = tmp.length()-1; 2234 size_t found_nl = 0; 2235 for ( ; j >=0; --j ){ 2236 if ( tmp[j] == '\n' ){ 2237 ++found_nl; 2238 } 2239 else { 2240 break; 2241 } 2242 } 2243 only = found_nl == tmp.length(); 2244 return found_nl > 0; 2245 } 2246 no_space_at_end(FoliaElement * s)2247 bool no_space_at_end( FoliaElement *s ){ 2248 /// given a FoliaElement check if the last Word in it has space() 2249 /*! 2250 * \param s a FoliaElement 2251 * \return true if the element contains Word children and the last 2252 * one has space() 2253 */ 2254 bool result = false; 2255 // cerr << "no space? s: " << s << endl; 2256 if ( s ){ 2257 vector<Word*> words = s->select<Word>(false); 2258 if ( !words.empty() ){ 2259 Word *last = words.back(); 2260 // cerr << "no space? last: " << last << endl; 2261 return !last->space(); 2262 } 2263 } 2264 return result; 2265 } 2266 deeptext(const TextPolicy & tp) const2267 const UnicodeString AbstractElement::deeptext( const TextPolicy& tp ) const { 2268 /// get the UnicodeString text value of underlying elements 2269 /*! 2270 * \param tp the TextPolicy to use 2271 * \return The Unicode Text found. 2272 * Will throw on error. 2273 */ 2274 if ( tp.debug() ){ 2275 cerr << "deeptext, policy: " << tp << ", on node : " << xmltag() << " id=" << id() << ", cls=" << this->cls() << ")" << endl; 2276 cerr << "deeptext: node has " << _data.size() << " children." << endl; 2277 } 2278 vector<UnicodeString> parts; 2279 vector<UnicodeString> seps; 2280 for ( const auto& child : data() ) { 2281 // try to get text dynamically from printable children 2282 // skipping the TextContent elements 2283 if ( tp.debug() ){ 2284 if ( !child->printable() ) { 2285 cerr << "deeptext: node[" << child->xmltag() << "] NOT PRINTABLE! " 2286 << endl; 2287 } 2288 } 2289 if ( child->printable() 2290 && ( is_structure( child ) 2291 || child->isSubClass( AbstractSpanAnnotation_t ) 2292 || child->isinstance( Correction_t ) ) 2293 && !child->isinstance( TextContent_t ) ) { 2294 if ( tp.debug() ){ 2295 cerr << "deeptext:bekijk node[" << child->xmltag() << "]"<< endl; 2296 } 2297 try { 2298 UnicodeString tmp = child->text( tp ); 2299 if ( tp.debug() ){ 2300 cerr << "deeptext found '" << tmp << "'" << endl; 2301 } 2302 parts.push_back(tmp); 2303 if ( child->isinstance( Sentence_t ) 2304 && no_space_at_end(child) ){ 2305 const string& delim = ""; 2306 if ( tp.debug() ){ 2307 cerr << "deeptext: no delimiter van "<< child->xmltag() << " on" 2308 << " last w of s" << endl; 2309 } 2310 seps.push_back(TiCC::UnicodeFromUTF8(delim)); 2311 } 2312 else { 2313 // get the delimiter 2314 const string& delim = child->get_delimiter( tp ); 2315 if ( tp.debug() ){ 2316 cerr << "deeptext:delimiter van "<< child->xmltag() << " ='" 2317 << delim << "'" << endl; 2318 } 2319 seps.push_back(TiCC::UnicodeFromUTF8(delim)); 2320 } 2321 } catch ( const NoSuchText& e ) { 2322 if ( tp.debug() ){ 2323 cerr << "HELAAS" << endl; 2324 } 2325 } 2326 } 2327 } 2328 2329 // now construct the result; 2330 UnicodeString result; 2331 for ( size_t i=0; i < parts.size(); ++i ) { 2332 if ( tp.debug() ){ 2333 cerr << "part[" << i << "]='" << parts[i] << "'" << endl; 2334 cerr << "sep[" << i << "]='" << seps[i] << "'" << endl; 2335 } 2336 bool only_nl = false; 2337 bool end_is_nl = check_end( parts[i], only_nl ); 2338 if ( end_is_nl ){ 2339 if ( tp.debug() ){ 2340 cerr << "a newline after: '" << parts[i] << "'" << endl; 2341 if ( i < parts.size()-1 ){ 2342 cerr << "next sep='" << seps[i+1] << "'" << endl; 2343 } 2344 } 2345 2346 if ( only_nl ){ 2347 // only a newline 2348 result = trim_space( result ); 2349 if ( tp.debug() ){ 2350 cerr << "OK it is only newline(s)" << endl; 2351 cerr << "TRIMMED? '" << result << "'" << endl; 2352 } 2353 } 2354 } 2355 result += parts[i]; 2356 if ( !end_is_nl && i < parts.size()-1 ){ 2357 result += seps[i]; 2358 } 2359 if ( tp.debug() ){ 2360 cerr << "result='" << result << "'" << endl; 2361 } 2362 } 2363 if ( tp.debug() ){ 2364 cerr << "deeptext() for " << xmltag() << " step 3 " << endl; 2365 } 2366 if ( result.isEmpty() ) { 2367 // so no deeper text is found. Well, lets look here then 2368 result = text_content(tp)->text( tp ); 2369 } 2370 if ( tp.debug() ){ 2371 cerr << "deeptext() for " << xmltag() << " result= '" << result << "'" 2372 << endl; 2373 } 2374 if ( result.isEmpty() ) { 2375 throw NoSuchText( xmltag() + ":(class=" + tp.get_class() +"): empty!" ); 2376 } 2377 return result; 2378 } 2379 stricttext(const string & cls) const2380 const UnicodeString FoliaElement::stricttext( const string& cls ) const { 2381 /// get the UnicodeString value of TextContent children only 2382 /*! 2383 * \param cls the textclass 2384 * \return The Unicode Text found. 2385 * Will throw on error. 2386 */ 2387 TextPolicy tp( cls, TEXT_FLAGS::STRICT ); 2388 return this->text( tp ); 2389 } 2390 toktext(const string & cls) const2391 const UnicodeString FoliaElement::toktext( const string& cls ) const { 2392 /// get the UnicodeString value of TextContent children only, retaining 2393 /// tokenization 2394 /*! 2395 * \param cls the textclass 2396 * \return The Unicode Text found. 2397 * Will throw on error. 2398 */ 2399 TextPolicy tp( cls, TEXT_FLAGS::RETAIN ); 2400 return this->text( tp ); 2401 } 2402 text_content(const TextPolicy & tp) const2403 const TextContent *AbstractElement::text_content( const TextPolicy& tp ) const { 2404 /// Get the TextContent explicitly associated with this element. 2405 /*! 2406 * \param tp the TextPolicy to use 2407 * 2408 * Returns the TextContent instance rather than the actual text. 2409 * (so it might return itself.. ;) 2410 * Does not recurse into children with the sole exception of Correction 2411 * might throw NoSuchText exception if not found. 2412 */ 2413 2414 if ( tp.debug() ){ 2415 cerr << "text_content, policy= " << tp << endl; 2416 } 2417 string desired_class = tp.get_class(); 2418 if ( isinstance(TextContent_t) ){ 2419 if ( tp.debug() ){ 2420 cerr << "A textcontent!!" << endl; 2421 } 2422 if ( this->cls() == desired_class ) { 2423 if ( tp.debug() ){ 2424 cerr << "return myself..." << endl; 2425 } 2426 return dynamic_cast<const TextContent*>(this); 2427 } 2428 else { 2429 throw NoSuchText( "TextContent::text_content(" + desired_class + ")" ); 2430 } 2431 } 2432 bool show_hidden = tp.is_set( TEXT_FLAGS::HIDDEN ); 2433 if ( tp.debug() ){ 2434 cerr << (!printable()?"NOT":"") << " printable: " << xmltag() << endl; 2435 cerr << (!hidden()?"NOT":"") << " hidden: " << xmltag() << endl; 2436 } 2437 if ( !printable() || ( hidden() && !show_hidden ) ) { 2438 throw NoSuchText( "non-printable element: " + xmltag() ); 2439 } 2440 if ( tp.debug() ){ 2441 cerr << "recurse into children...." << endl; 2442 } 2443 for ( const auto& el : data() ) { 2444 if ( el->isinstance(TextContent_t) && (el->cls() == desired_class ) ) { 2445 return dynamic_cast<TextContent*>(el); 2446 } 2447 else if ( el->element_id() == Correction_t) { 2448 try { 2449 return el->text_content( tp ); 2450 } catch ( const NoSuchText& e ) { 2451 // continue search for other Corrections or a TextContent 2452 } 2453 } 2454 } 2455 throw NoSuchText( xmltag() + "::text_content(" + desired_class + ")" ); 2456 } 2457 text_content(const string & cls,bool debug) const2458 const TextContent *AbstractElement::text_content( const string& cls, 2459 bool debug ) const { 2460 /// Get the TextContent explicitly associated with this element. 2461 /*! 2462 * \param cls the textclass to search for 2463 * \param debug enables debugging when true 2464 * 2465 * Returns the TextContent instance rather than the actual text. 2466 * (so it might return itself.. ;) 2467 * Does not recurse into children with the sole exception of Correction 2468 * might throw NoSuchText exception if not found. 2469 */ 2470 TextPolicy tp( cls ); 2471 tp.set_debug( debug ); 2472 return text_content( tp ); 2473 } 2474 phon_content(const TextPolicy & tp) const2475 const PhonContent *AbstractElement::phon_content( const TextPolicy& tp ) const { 2476 /// Get the PhonContent explicitly associated with this element. 2477 /*! 2478 * \param tp the TextPolicy to use 2479 * 2480 * Returns the PhonContent instance rather than the actual text. 2481 * (so it might return iself.. ;) 2482 * Does not recurse into children with the sole exception of Correction 2483 * might throw NoSuchPhon exception if not found. 2484 */ 2485 string desired_class = tp.get_class(); 2486 if ( isinstance(PhonContent_t) ){ 2487 if ( cls() == desired_class ){ 2488 return dynamic_cast<const PhonContent*>(this); 2489 } 2490 else { 2491 throw NoSuchPhon( xmltag() + "::phon_content(" + desired_class + ")" ); 2492 } 2493 } 2494 bool show_hidden = tp.is_set( TEXT_FLAGS::HIDDEN ); 2495 if ( !speakable() || ( hidden() && !show_hidden ) ) { 2496 throw NoSuchPhon( "non-speakable element: " + xmltag() ); 2497 } 2498 2499 for ( const auto& el : _data ) { 2500 if ( el->isinstance(PhonContent_t) && ( el->cls() == desired_class ) ) { 2501 return dynamic_cast<PhonContent*>(el); 2502 } 2503 else if ( el->element_id() == Correction_t) { 2504 try { 2505 return el->phon_content(tp); 2506 } catch ( const NoSuchPhon& e ) { 2507 // continue search for other Corrections or a TextContent 2508 } 2509 } 2510 } 2511 throw NoSuchPhon( xmltag() + "::phon_content(" + desired_class + ")" ); 2512 } 2513 phon_content(const string & cls,bool debug) const2514 const PhonContent *AbstractElement::phon_content( const string& cls, 2515 bool debug ) const { 2516 /// Get the PhonContent explicitly associated with this element. 2517 /*! 2518 * \param cls the textclass to search for 2519 * \param debug enable debugging when true 2520 * 2521 * Returns the PhonContent instance rather than the actual text. 2522 * (so it might return iself.. ;) 2523 * Does not recurse into children with the sole exception of Correction 2524 * might throw NoSuchPhon exception if not found. 2525 */ 2526 TextPolicy tp(cls ); 2527 tp.set_debug( debug ); 2528 return phon_content( tp ); 2529 } 2530 phon(const TextPolicy & tp) const2531 const UnicodeString AbstractElement::phon( const TextPolicy& tp ) const { 2532 /// get the UnicodeString phon value of an element 2533 /*! 2534 * \param tp the TextPolic to use 2535 */ 2536 bool hidden = tp.is_set( TEXT_FLAGS::HIDDEN ); 2537 bool strict = tp.is_set( TEXT_FLAGS::STRICT ); 2538 if ( tp.debug() ){ 2539 cerr << "PHON, Policy= " << tp << " on node : " << xmltag() << " id=" 2540 << id() << endl; 2541 } 2542 if ( strict ) { 2543 return phon_content(tp)->phon(); 2544 } 2545 else if ( !speakable() || ( this->hidden() && !hidden ) ) { 2546 throw NoSuchPhon( "NON speakable element: " + xmltag() ); 2547 } 2548 else { 2549 UnicodeString result = deepphon( tp ); 2550 if ( result.isEmpty() ) { 2551 result = phon_content(tp)->phon(); 2552 } 2553 if ( result.isEmpty() ) { 2554 throw NoSuchPhon( "on tag " + xmltag() + " nor it's children" ); 2555 } 2556 return result; 2557 } 2558 } 2559 phon(const string & cls,TEXT_FLAGS flags) const2560 const UnicodeString AbstractElement::phon( const string& cls, 2561 TEXT_FLAGS flags ) const { 2562 /// get the UnicodeString phon value of an element 2563 /*! 2564 * \param cls the textclass the text should be in 2565 * \param flags the search parameters to use. See TEXT_FLAGS. 2566 */ 2567 TextPolicy tp( cls, flags ); 2568 return phon( tp ); 2569 } 2570 deepphon(const TextPolicy & tp) const2571 const UnicodeString AbstractElement::deepphon( const TextPolicy& tp ) const { 2572 /// get the UnicodeString phon value of underlying elements 2573 /*! 2574 * \param tp the TextPolicu to use 2575 * \return The Unicode Text found. 2576 * Will throw on error. 2577 */ 2578 if ( tp.debug() ){ 2579 cerr << "deepPHON, policy= " << tp << ", on node : " << xmltag() 2580 << " id=" << id() << endl; 2581 cerr << "deepphon: node has " << _data.size() << " children." << endl; 2582 } 2583 vector<UnicodeString> parts; 2584 vector<UnicodeString> seps; 2585 for ( const auto& child : _data ) { 2586 // try to get text dynamically from children 2587 // skip PhonContent elements 2588 if ( tp.debug() ){ 2589 if ( !child->speakable() ) { 2590 cerr << "deepphon: node[" << child->xmltag() << "] NOT SPEAKABLE! " 2591 << endl; 2592 } 2593 } 2594 if ( child->speakable() && !child->isinstance( PhonContent_t ) ) { 2595 if ( tp.debug() ){ 2596 cerr << "deepphon:bekijk node[" << child->xmltag() << "]" << endl; 2597 } 2598 try { 2599 UnicodeString tmp = child->phon( tp ); 2600 if ( tp.debug() ){ 2601 cerr << "deepphon found '" << tmp << "'" << endl; 2602 } 2603 parts.push_back(tmp); 2604 // get the delimiter 2605 const string& delim = child->get_delimiter(tp); 2606 if ( tp.debug() ){ 2607 cerr << "deepphon:delimiter van "<< child->xmltag() 2608 << " ='" << delim << "'" << endl; 2609 } 2610 seps.push_back(TiCC::UnicodeFromUTF8(delim)); 2611 } catch ( const NoSuchPhon& e ) { 2612 if ( tp.debug() ){ 2613 cerr << "HELAAS" << endl; 2614 } 2615 } 2616 } 2617 } 2618 2619 // now construct the result; 2620 UnicodeString result; 2621 for ( size_t i=0; i < parts.size(); ++i ) { 2622 result += parts[i]; 2623 if ( i < parts.size()-1 ) { 2624 result += seps[i]; 2625 } 2626 } 2627 if ( tp.debug() ){ 2628 cerr << "deepphon() for " << xmltag() << " step 3 " << endl; 2629 } 2630 if ( result.isEmpty() ) { 2631 try { 2632 result = phon_content(tp)->phon(); 2633 } 2634 catch ( ... ) { 2635 } 2636 } 2637 if ( tp.debug() ){ 2638 cerr << "deepphontext() for " << xmltag() << " result= '" << result 2639 << "'" << endl; 2640 } 2641 if ( result.isEmpty() ) { 2642 throw NoSuchPhon( xmltag() + ":(class=" + tp.get_class() +"): empty!" ); 2643 } 2644 return result; 2645 } 2646 2647 find_replacables(FoliaElement * par) const2648 vector<FoliaElement *>AbstractElement::find_replacables( FoliaElement *par ) const { 2649 // find all children with the same signature as the parameter 2650 /*! 2651 * \param par the FoliaElement to search 2652 * \return a vector of matching elements 2653 * search in the DIRECT children for nodes with the same tag AND set 2654 * as the element par 2655 */ 2656 return par->select( element_id(), sett(), SELECT_FLAGS::LOCAL ); 2657 } 2658 replace(FoliaElement * child)2659 void AbstractElement::replace( FoliaElement *child ) { 2660 /// replace a child element 2661 /*! 2662 * \param child The element to substitute 2663 * This function searches for A child of the same signature (type and set) 2664 * If found, that child is replaced. 2665 * If no such child element exists, this will act the same as append() 2666 */ 2667 vector<FoliaElement*> replace = child->find_replacables( this ); 2668 if ( replace.empty() ) { 2669 // nothing to replace, simply call append 2670 append( child ); 2671 } 2672 else if ( replace.size() > 1 ) { 2673 throw runtime_error( "Unable to replace. Multiple candidates found, unable to choose." ); 2674 } 2675 else { 2676 replace[0]->destroy(); 2677 append( child ); 2678 } 2679 } 2680 replace(FoliaElement * old,FoliaElement * _new)2681 FoliaElement* AbstractElement::replace( FoliaElement *old, 2682 FoliaElement* _new ) { 2683 /// replace in the children old by _new 2684 /*! 2685 * \param old The node to be replacec 2686 * \param _new the new node to add 2687 * \return old 2688 * First old is looked up, if present it is replaced 2689 * 2690 * when not found this function does nothing and returns 0 2691 */ 2692 FoliaElement *result = 0; 2693 auto it = find_if( _data.begin(), 2694 _data.end(), 2695 [&]( FoliaElement *el ){ return el == old; } ); 2696 if ( it != _data.end() ){ 2697 *it = _new; 2698 result = old; 2699 _new->set_parent(this); 2700 } 2701 return result; 2702 } 2703 insert_after(FoliaElement * pos,FoliaElement * add)2704 void AbstractElement::insert_after( FoliaElement *pos, FoliaElement *add ){ 2705 /// append a node after a certain element 2706 /*! 2707 * \param pos The location after which to insert add 2708 * \param add the element to add 2709 * 2710 * throws when pos is not found 2711 */ 2712 auto it = _data.begin(); 2713 while ( it != _data.end() ) { 2714 if ( *it == pos ) { 2715 it = _data.insert( ++it, add ); 2716 break; 2717 } 2718 ++it; 2719 } 2720 if ( it == _data.end() ) { 2721 throw runtime_error( "insert_after(): previous not found" ); 2722 } 2723 } 2724 clear_textcontent(const string & textclass)2725 void FoliaElement::clear_textcontent( const string& textclass ){ 2726 for ( size_t i=0; i < size(); ++i ){ 2727 FoliaElement *p = index(i); 2728 if ( p->element_id() == TextContent_t ) { 2729 if ( p->cls() == textclass ){ 2730 p->destroy(); 2731 break; 2732 } 2733 } 2734 } 2735 } 2736 settext(const string & txt,const string & cls)2737 TextContent *FoliaElement::settext( const string& txt, 2738 const string& cls ){ 2739 /// append a TextContent child of class txt with value txt 2740 /*! 2741 * \param txt the UTF8 text value 2742 * \param cls the textclass of the new TextContent 2743 * \return the new created TextContent 2744 * may throw on error 2745 * 2746 * when the associated document has the checktext mode, (which is the 2747 * default) both text consistency and the offset are checked. 2748 */ 2749 return settext( txt, -1, cls ); 2750 } 2751 setutext(const UnicodeString & txt,const string & cls)2752 TextContent *FoliaElement::setutext( const UnicodeString& txt, 2753 const string& cls ){ 2754 /// append a TextContent child of class cls with value txt 2755 /*! 2756 * \param txt the Unicode text value 2757 * \param cls the textclass of the new TextContent 2758 * \return the new created TextContent 2759 * may throw on error 2760 * 2761 * when the associated document has the checktext mode, (which is the 2762 * default) both text consistency and the offset are checked. 2763 */ 2764 string utf8 = TiCC::UnicodeToUTF8(txt); 2765 return settext( utf8, cls ); 2766 } 2767 settext(const string & txt,int offset,const string & cls)2768 TextContent *FoliaElement::settext( const string& txt, 2769 int offset, 2770 const string& cls ){ 2771 /// append a TextContent child of class cls with value txt 2772 /*! 2773 * \param txt the UTF8 text value 2774 * \param offset offset of the text in the text of the parent, 2775 when offset < 0 it is ignored. 2776 * \param cls the textclass of the new TextContent 2777 * \return the new created TextContent 2778 * may throw on error 2779 * 2780 * when the associated document has the checktext mode, (which is the 2781 * default) both text consistency and the offset are checked. 2782 */ 2783 UnicodeString txt_u = TiCC::UnicodeFromUTF8( txt ); 2784 if ( doc() && doc()->checktext() 2785 && !isSubClass( Morpheme_t ) && !isSubClass( Phoneme_t) ){ 2786 UnicodeString deeper_u; 2787 try { 2788 deeper_u = text( cls ); 2789 // get deep original text: no retain tokenization, no strict 2790 } 2791 catch (...){ 2792 } 2793 deeper_u = normalize_spaces( deeper_u ); 2794 UnicodeString txt_check_u = normalize_spaces( txt_u ); 2795 if ( !deeper_u.isEmpty() 2796 && txt_check_u != deeper_u ){ 2797 throw InconsistentText( "settext(cls=" + cls + "): deeper text differs from attempted\ndeeper='" + TiCC::UnicodeToUTF8(deeper_u) + "'\nattempted='" + TiCC::UnicodeToUTF8(txt_u) + "'" ); 2798 } 2799 } 2800 KWargs args; 2801 args["value"] = TiCC::UnicodeToUTF8(txt_u); 2802 args["class"] = cls; 2803 if ( offset >= 0 ){ 2804 args["offset"] = TiCC::toString(offset); 2805 } 2806 TextContent *node = new TextContent( args, doc() ); 2807 replace( node ); 2808 return node; 2809 } 2810 setutext(const UnicodeString & txt,int offset,const string & cls)2811 TextContent *FoliaElement::setutext( const UnicodeString& txt, 2812 int offset, 2813 const string& cls ){ 2814 /// append a TextContent child of class cls with value txt 2815 /*! 2816 * \param txt the Unicode text value 2817 * \param offset offset of the text in the text of the parent, 2818 when offset < 0 it is ignored. 2819 * \param cls the textclass of the new TextContent 2820 * \return the new created TextContent 2821 * may throw on error 2822 * 2823 * when the associated document has the checktext mode, (which is the 2824 * default) both text consistency and the offset are checked. 2825 */ 2826 string utf8 = TiCC::UnicodeToUTF8(txt); 2827 return settext( utf8, offset, cls ); 2828 } 2829 description() const2830 const string FoliaElement::description() const { 2831 /// return the string value of the description tag (if present) 2832 /*! 2833 * \return a string 2834 * search for Description nodes in this object. 2835 * When 1 or more are found, return the value of the first one 2836 */ 2837 vector<FoliaElement *> v = select( Description_t, SELECT_FLAGS::LOCAL ); 2838 if ( v.size() == 0 ) { 2839 return ""; 2840 } 2841 return v[0]->description(); 2842 } 2843 acceptable(ElementType t) const2844 bool AbstractElement::acceptable( ElementType t ) const { 2845 /// test if this ElementType is acceptable for the current node 2846 /*! 2847 * \param t the ElementType to test 2848 * 2849 * This function tests if t is in the accepted_data list of the node 2850 * OR if it is a SubClass of one of the accepted types 2851 */ 2852 2853 auto it = accepted_data().find( t ); 2854 if ( it == accepted_data().end() ) { 2855 for ( const auto& et : accepted_data() ) { 2856 if ( folia::isSubClass( t, et ) ) { 2857 return true; 2858 } 2859 } 2860 return false; 2861 } 2862 return true; 2863 } 2864 addable(const FoliaElement * parent) const2865 bool AbstractElement::addable( const FoliaElement *parent ) const { 2866 /// test if an element might succesfully appended to \em parent 2867 /*! 2868 * \param parent the node to check 2869 * \return true if it doesn't throw 2870 * 2871 * \note It will allways throw an error, instead of returning false 2872 */ 2873 if ( !parent->acceptable( element_id() ) ) { 2874 string mess = "Unable to append object of type " + classname() 2875 + " to a <" + parent->classname() + ">"; 2876 if ( !parent->id().empty() ){ 2877 mess += " (id=" + parent->id() + ")"; 2878 } 2879 throw ValueError( mess ); 2880 } 2881 if ( occurrences() > 0 ) { 2882 vector<FoliaElement*> v = parent->select( element_id(), 2883 SELECT_FLAGS::LOCAL ); 2884 size_t count = v.size(); 2885 if ( count >= occurrences() ) { 2886 throw DuplicateAnnotationError( "Unable to add another object of type " + classname() + " to " + parent->classname() + ". There are already " + TiCC::toString(count) + " instances of this type, which is the maximum." ); 2887 } 2888 } 2889 if ( occurrences_per_set() > 0 && 2890 (CLASS & required_attributes() || setonly() ) ){ 2891 vector<FoliaElement*> v = select( element_id(), 2892 sett(), 2893 SELECT_FLAGS::LOCAL ); 2894 size_t count = v.size(); 2895 if ( count >= occurrences_per_set() ) { 2896 throw DuplicateAnnotationError( "Unable to add another object of type " + classname() + " to " + parent->classname() + ". There are already " + TiCC::toString(count) + " instances of this type and set (" + sett() + "), which is the maximum." ); 2897 } 2898 } 2899 if ( _parent && 2900 !( element_id() == WordReference_t 2901 || referable() ) ){ 2902 throw XmlError( "attempt to reconnect node " + classname() + "(" 2903 + id() 2904 + ") to a " + parent->classname() + " node, id=" 2905 + parent->id() 2906 + ", it was already connected to a " 2907 + parent->classname() + " id=" + parent->id() ); 2908 } 2909 #ifdef NOT_WORKING 2910 // this fails. needs attention 2911 if ( c->element_id() == WordReference_t ){ 2912 string tval = atts["t"]; 2913 if ( !tval.empty() ){ 2914 string tc = ref->textclass(); 2915 string rtval = ref->str(tc); 2916 if ( tval != rtval ){ 2917 throw XmlError( "WordReference id=" + id + " has another value for " 2918 + "the t attribute than it's reference. (" 2919 + tval + " versus " + rtval + ")" ); 2920 } 2921 } 2922 } 2923 #endif 2924 if ( element_id() == TextContent_t 2925 && parent->element_id() == Word_t ) { 2926 string val = str(cls()); 2927 val = trim( val ); 2928 if ( val.empty() ) { 2929 throw ValueError( "attempt to add an empty <t> to word: " + parent->id() ); 2930 } 2931 } 2932 if ( element_id() == TextContent_t ){ 2933 string cls = this->cls(); 2934 string st = sett(); 2935 vector<TextContent*> tmp = parent->select<TextContent>( st, false ); 2936 if ( any_of( tmp.begin(), 2937 tmp.end(), 2938 [cls]( const TextContent *t) { return ( t->cls() == cls);} ) ){ 2939 throw DuplicateAnnotationError( "attempt to add <t> with class=" 2940 + cls + " to element: " + parent->id() 2941 + " which already has a <t> with that class" ); 2942 } 2943 } 2944 if ( is_textcontainer() || 2945 element_id() == Word_t ){ 2946 parent->check_append_text_consistency( this ); 2947 } 2948 return true; 2949 } 2950 assignDoc(Document * the_doc)2951 void AbstractElement::assignDoc( Document* the_doc ) { 2952 /// attach a document-less FoliaElement (-tree) to a Document the_doc 2953 /*! 2954 * \param the_doc The Document to attach to 2955 * 2956 * if the node already has a Document assigned , nothing is done. 2957 * 2958 * Otherwise: The annotation type is checked. If not set yet and 2959 * the doc has autodeclare mode set, it is attempted to do so. 2960 * For TextContent and PhonContent, a default is added too 2961 * 2962 * Also the ID is registered in the_doc. 2963 * 2964 * Finaly, all children are also assigned to the_doc 2965 */ 2966 if ( !_mydoc ) { 2967 _mydoc = the_doc; 2968 if ( annotation_type() != AnnotationType::NO_ANN 2969 && !the_doc->version_below( 2, 0 ) 2970 && !the_doc->declared( annotation_type() ) ){ 2971 // cerr << "assignDoc: " << this << endl; 2972 // cerr << "ant: " << annotation_type() << endl; 2973 // cerr << "set: " << _set << endl; 2974 // so when appending a document-less child, make sure that 2975 // an annotation declaration is present or added. 2976 if ( annotation_type() == AnnotationType::TEXT ){ 2977 if ( _set.empty() ){ 2978 doc()->declare( AnnotationType::TEXT, DEFAULT_TEXT_SET ); 2979 } 2980 else { 2981 doc()->declare( AnnotationType::TEXT, _set ); 2982 } 2983 } 2984 else if ( annotation_type() == AnnotationType::PHON ){ 2985 if ( _set.empty() ){ 2986 doc()->declare( AnnotationType::PHON, DEFAULT_PHON_SET ); 2987 } 2988 else { 2989 doc()->declare( AnnotationType::PHON, _set ); 2990 } 2991 } 2992 else if ( doc()->autodeclare() ){ 2993 doc()->auto_declare( annotation_type(), _set ); 2994 } 2995 else { 2996 throw DeclarationError( "Encountered an instance of <" 2997 + xmltag() 2998 + "> without a proper declaration for " 2999 + toString(annotation_type()) 3000 + "-annotation" ); 3001 } 3002 } 3003 if ( !_set.empty() 3004 && (CLASS & required_attributes() ) 3005 && !_mydoc->declared( annotation_type(), _set ) ) { 3006 throw DeclarationError( "Set " + _set + " is used in " + xmltag() 3007 + "element: " + _id 3008 + " but has no declaration " 3009 + "for " + toString( annotation_type() ) 3010 + "-annotation" ); 3011 } 3012 if ( !_id.empty() ) { 3013 _mydoc->add_doc_index( this ); 3014 } 3015 // assume that children also might be doc-less 3016 for ( const auto& el : _data ) { 3017 el->assignDoc( _mydoc ); 3018 } 3019 } 3020 } 3021 checkAtts()3022 bool AbstractElement::checkAtts() { 3023 /// check if all the REQUIRED attributes of the node are set 3024 /*! 3025 * \return true, or throws 3026 */ 3027 if ( _id.empty() 3028 && (ID & required_attributes() ) ) { 3029 throw ValueError( "attribute 'ID' is required for " + classname() ); 3030 } 3031 if ( _set.empty() 3032 && (CLASS & required_attributes() ) ) { 3033 throw ValueError( "attribute 'set' is required for " + classname() ); 3034 } 3035 if ( _class.empty() 3036 && ( CLASS & required_attributes() ) ) { 3037 throw ValueError( "attribute 'class' is required for " + classname() ); 3038 } 3039 if ( _annotator.empty() 3040 && ( ANNOTATOR & required_attributes() ) ) { 3041 throw ValueError( "attribute 'annotator' is required for " + classname() ); 3042 } 3043 if ( _annotator_type == UNDEFINED 3044 && ( ANNOTATOR & required_attributes() ) ) { 3045 throw ValueError( "attribute 'Annotatortype' is required for " + classname() ); 3046 } 3047 if ( _confidence == -1 && 3048 ( CONFIDENCE & required_attributes() ) ) { 3049 throw ValueError( "attribute 'confidence' is required for " + classname() ); 3050 } 3051 if ( _n.empty() 3052 && ( N & required_attributes() ) ) { 3053 throw ValueError( "attribute 'n' is required for " + classname() ); 3054 } 3055 if ( _datetime.empty() 3056 && ( DATETIME & required_attributes() ) ) { 3057 throw ValueError( "attribute 'datetime' is required for " + classname() ); 3058 } 3059 if ( _begintime.empty() 3060 && ( BEGINTIME & required_attributes() ) ) { 3061 throw ValueError( "attribute 'begintime' is required for " + classname() ); 3062 } 3063 if ( _endtime.empty() 3064 && ( ENDTIME & required_attributes() ) ) { 3065 throw ValueError( "attribute 'endtime' is required for " + classname() ); 3066 } 3067 if ( _src.empty() 3068 && ( SRC & required_attributes() ) ) { 3069 throw ValueError( "attribute 'src' is required for " + classname() ); 3070 } 3071 if ( _metadata.empty() 3072 && ( METADATA & required_attributes() ) ) { 3073 throw ValueError( "attribute 'metadata' is required for " + classname() ); 3074 } 3075 if ( _speaker.empty() 3076 && ( SPEAKER & required_attributes() ) ) { 3077 throw ValueError( "attribute 'speaker' is required for " + classname() ); 3078 } 3079 return true; 3080 } 3081 classInit()3082 void AbstractElement::classInit(){ 3083 // we could call 'init()' directly, but this is more esthetic 3084 // keep in balance with the next function 3085 init(); // virtual init 3086 } 3087 classInit(const KWargs & a)3088 void AbstractElement::classInit( const KWargs& a ){ 3089 // this funcion is needed because calling the virtual function 3090 // setAttributes from the constructor will NOT call the right version 3091 // THIS IS BY DESIGN in C++ 3092 init(); // virtual init 3093 KWargs a1 = a; 3094 setAttributes( a1 ); // also virtual! 3095 checkAtts(); // check if all needed attributes are set 3096 } 3097 append(FoliaElement * child)3098 FoliaElement *AbstractElement::append( FoliaElement *child ){ 3099 /// append child to this node 3100 /*! 3101 * \param child the node to add 3102 * \return the appended child 3103 * 3104 * will throw on error 3105 */ 3106 if ( !child ){ 3107 throw XmlError( "attempt to append an empty node to a " + classname() ); 3108 } 3109 bool ok = false; 3110 try { 3111 ok = child->addable( this ); 3112 } 3113 catch ( const XmlError& ) { 3114 // don't delete the offending child in case of illegal reconnection 3115 // it will be deleted by the true parent 3116 throw; 3117 } 3118 catch ( const exception& ) { 3119 child->destroy(); 3120 throw; 3121 } 3122 if ( ok ) { 3123 if ( doc() ){ 3124 child->assignDoc( doc() ); 3125 } 3126 _data.push_back(child); 3127 if ( !child->parent() ) { 3128 child->set_parent(this); 3129 } 3130 if ( child->referable() ){ 3131 child->increfcount(); 3132 } 3133 if ( child->spaces_flag() == SPACE_FLAGS::UNSET ){ 3134 child->set_spaces_flag( _preserve_spaces ); 3135 } 3136 return child->postappend(); 3137 } 3138 return 0; 3139 } 3140 postappend()3141 FoliaElement *AbstractElement::postappend( ) { 3142 /// perform some post correction after appending 3143 if ( id().empty() && (ID & required_attributes()) && auto_generate_id() ){ 3144 _id = generateId( xmltag() ); 3145 } 3146 return this; 3147 } 3148 remove(FoliaElement * child)3149 void AbstractElement::remove( FoliaElement *child ) { 3150 /// remove a child from a node 3151 /*! 3152 * \param child the element to remove 3153 */ 3154 #ifdef DE_AND_CONSTRUCT_DEBUG 3155 cerr << "\nremove " << child->xmltag() << " from " << xmltag() 3156 << " adres=" << (void*)this 3157 << " id=" << _id << " class= " << endl; 3158 #endif 3159 auto it = std::remove( _data.begin(), _data.end(), child ); 3160 _data.erase( it, _data.end() ); 3161 } 3162 index(size_t i) const3163 FoliaElement* AbstractElement::index( size_t i ) const { 3164 /// return the child at index i 3165 /*! 3166 * \param i the index 3167 * \return the child at index i 3168 * 3169 * Will throw when the index is out of range 3170 */ 3171 if ( i < _data.size() ) { 3172 return _data[i]; 3173 } 3174 throw range_error( "[] index out of range" ); 3175 } 3176 rindex(size_t ri) const3177 FoliaElement* AbstractElement::rindex( size_t ri ) const { 3178 /// return the child at reversed index ri 3179 /*! 3180 * \param ri the index 3181 * \return the child at index ri 3182 * 3183 * Will throw when the index is out of range 3184 */ 3185 if ( ri < _data.size() ) { 3186 return _data[_data.size()-1-ri]; 3187 } 3188 throw range_error( "[] rindex out of range" ); 3189 } 3190 select(ElementType et,const string & st,const set<ElementType> & exclude,SELECT_FLAGS flag) const3191 vector<FoliaElement*> AbstractElement::select( ElementType et, 3192 const string& st, 3193 const set<ElementType>& exclude, 3194 SELECT_FLAGS flag ) const { 3195 /// The generic 'select()' function on which all other variants are based 3196 /// it searches a FoLiA node for matchins sibblings. 3197 /*! 3198 * \param et which type of element we are looking for 3199 * \param st when not empty ("") we also must match on the 'sett' of the nodes 3200 * \param exclude a set of ElementType to exclude from searching. 3201 * These are skipped, and NOT recursed into. 3202 * \param flag determines special search stategies: 3203 * - RECURSE : recurse the whole FoLia from the given node downwards 3204 * returning all matching nodes, even within matches 3205 * This is the default. 3206 * - LOCAL : just look at the direct sibblings of the node 3207 * - TOP_HIT : like recurse, but do NOT recurse into sibblings 3208 * of matching node 3209 */ 3210 vector<FoliaElement*> res; 3211 for ( const auto& el : _data ) { 3212 if ( el->element_id() == et && 3213 ( st.empty() || el->sett() == st ) ) { 3214 res.push_back( el ); 3215 if ( flag == SELECT_FLAGS::TOP_HIT ){ 3216 flag = SELECT_FLAGS::LOCAL; 3217 } 3218 } 3219 if ( flag != SELECT_FLAGS::LOCAL ){ 3220 // not at this level, search deeper when recurse is true 3221 if ( exclude.find( el->element_id() ) == exclude.end() ) { 3222 vector<FoliaElement*> tmp = el->select( et, st, exclude, flag ); 3223 res.insert( res.end(), tmp.begin(), tmp.end() ); 3224 } 3225 } 3226 } 3227 return res; 3228 } 3229 select(ElementType et,const string & st,SELECT_FLAGS flag) const3230 vector<FoliaElement*> AbstractElement::select( ElementType et, 3231 const string& st, 3232 SELECT_FLAGS flag ) const { 3233 /// wrapper around the the generic select() 3234 /*! 3235 * calls select() with a default ignore set. 3236 */ 3237 return select( et, st, default_ignore, flag ); 3238 } 3239 select(ElementType et,const set<ElementType> & exclude,SELECT_FLAGS flag) const3240 vector<FoliaElement*> AbstractElement::select( ElementType et, 3241 const set<ElementType>& exclude, 3242 SELECT_FLAGS flag ) const { 3243 /// wrapper around the the generic select() 3244 /*! 3245 * calls select() with a default setname. 3246 */ 3247 return select( et, "", exclude, flag ); 3248 } 3249 select(ElementType et,SELECT_FLAGS flag) const3250 vector<FoliaElement*> AbstractElement::select( ElementType et, 3251 SELECT_FLAGS flag ) const { 3252 /// wrapper around the the generic select() 3253 /*! 3254 * calls select() with a default setname and the default ignore set 3255 */ 3256 return select( et, "", default_ignore, flag ); 3257 } 3258 unravel(set<FoliaElement * > & store)3259 void AbstractElement::unravel( set<FoliaElement*>& store ){ 3260 /// split the node and all siblings into a set of nodes 3261 /*! 3262 * \param store 3263 * recursively go through this node and its children an collect all 3264 * node pointers in store. 3265 * Erase the _data array of every node 3266 * 3267 * This function is used when erasing a document. Creating a set avoids 3268 * deleting nodes twice 3269 */ 3270 resetrefcount(); 3271 _parent = 0; 3272 store.insert( this ); 3273 auto dit = _data.begin(); 3274 while ( dit != _data.end() ){ 3275 (*dit)->unravel( store ); 3276 dit = _data.erase(dit); 3277 } 3278 } 3279 parseXml(const xmlNode * node)3280 FoliaElement* AbstractElement::parseXml( const xmlNode *node ) { 3281 /// recursively parse a FoLiA tree starting at node 3282 /*! 3283 * \param node an xmlNode representing a FoLiA subtree 3284 * \return the parsed tree. Throws on error. 3285 */ 3286 KWargs att = getAttributes( node ); 3287 int sp = xmlNodeGetSpacePreserve(node); 3288 if ( sp == 1 ){ 3289 att["xml:space"] = "preserve"; 3290 } 3291 else if ( sp == 0 ){ 3292 att["xml:space"] = "default"; 3293 } 3294 3295 setAttributes( att ); 3296 xmlNode *p = node->children; 3297 while ( p ) { 3298 string pref; 3299 string ns = getNS( p, pref ); 3300 if ( !ns.empty() && ns != NSFOLIA ){ 3301 // skip alien nodes 3302 if ( doc() && doc()->debug > 2 ) { 3303 cerr << "skipping non-FoLiA node: " << pref << ":" << Name(p) << endl; 3304 } 3305 p = p->next; 3306 continue; 3307 } 3308 if ( p->type == XML_ELEMENT_NODE ) { 3309 string tag = Name( p ); 3310 FoliaElement *t = createElement( tag, doc() ); 3311 if ( t ) { 3312 if ( doc() && doc()->debug > 2 ) { 3313 cerr << "created " << t << endl; 3314 } 3315 t = t->parseXml( p ); 3316 if ( t ) { 3317 if ( doc() && doc()->debug > 2 ) { 3318 cerr << "extend " << this << " met " << t << endl; 3319 } 3320 append( t ); 3321 } 3322 } 3323 else if ( doc() && !doc()->permissive() ){ 3324 throw XmlError( "FoLiA parser terminated" ); 3325 } 3326 } 3327 else if ( p->type == XML_COMMENT_NODE ) { 3328 string tag = "_XmlComment"; 3329 FoliaElement *t = createElement( tag, doc() ); 3330 if ( t ) { 3331 if ( doc() && doc()->debug > 2 ) { 3332 cerr << "created " << t << endl; 3333 } 3334 t = t->parseXml( p ); 3335 if ( t ) { 3336 if ( doc() && doc()->debug > 2 ) { 3337 cerr << "extend " << this << " met " << t << endl; 3338 } 3339 append( t ); 3340 } 3341 } 3342 } 3343 else if ( p->type == XML_ENTITY_REF_NODE ){ 3344 string txt = TextValue( p ); 3345 XmlText *t = add_child<XmlText>( txt ); 3346 if ( doc() && doc()->debug > 2 ) { 3347 cerr << "created " << t << "(" << t->text() << ")" << endl; 3348 cerr << "extended " << this << " met " << t << endl; 3349 cerr << "this.size()= " << size() << " t.size()=" << t->size() << endl; 3350 } 3351 } 3352 else if ( p->type == XML_TEXT_NODE ){ 3353 if ( this->is_textcontainer() 3354 || this->is_phoncontainer() ){ 3355 // non empty text is allowed (or even required) here 3356 string txt = TextValue( p ); 3357 if ( !txt.empty() ) { 3358 XmlText *t = add_child<XmlText>( txt ); 3359 if ( doc() && doc()->debug > 2 ) { 3360 cerr << "created " << t << "(" << t->text() << ")" << endl; 3361 cerr << "extended " << this << " met " << t << endl; 3362 cerr << "this.size()= " << size() << " t.size()=" << t->size() << endl; 3363 } 3364 } 3365 } 3366 else { 3367 // This MUST be 'empty space', so only spaces and tabs formatting 3368 string tag = "_XmlText"; 3369 FoliaElement *t = createElement( tag, doc() ); 3370 if ( t ) { 3371 if ( doc() && doc()->debug > 2 ){ 3372 cerr << "created " << t << endl; 3373 } 3374 try { 3375 t = t->parseXml( p ); 3376 } 3377 catch ( const ValueError& e ){ 3378 t->destroy(); 3379 t = 0; 3380 } 3381 } 3382 if ( t ) { 3383 string txt = t->str(); 3384 txt = TiCC::trim(txt); 3385 if ( !txt.empty() ){ 3386 if ( p->prev ){ 3387 string tg = "<" + Name(p->prev) + ">"; 3388 throw XmlError( "found extra text '" + txt + "' after element " 3389 + tg + ", NOT allowed there." ); 3390 } 3391 else { 3392 string tg = "<" + Name(p->parent) + ">"; 3393 throw XmlError( "found extra text '" + txt + "' inside element " 3394 + tg + ", NOT allowed there." ); 3395 } 3396 } 3397 if ( doc() && doc()->debug > 2 ){ 3398 cerr << "created " << t << "(" << t->text() << ")" << endl; 3399 cerr << "extended " << this << " met " << t << endl; 3400 cerr << "this.size()= " << size() << " t.size()=" << t->size() << endl; 3401 } 3402 append( t ); 3403 } 3404 } 3405 } 3406 p = p->next; 3407 } 3408 if ( doc() && ( doc()->checktext() || doc()->fixtext() ) 3409 && this->printable() 3410 && !isSubClass( Morpheme_t ) && !isSubClass( Phoneme_t) ){ 3411 check_text_consistency_while_parsing(); 3412 } 3413 return this; 3414 } 3415 setDateTime(const string & s)3416 void AbstractElement::setDateTime( const string& s ) { 3417 /// set the DATETIME value of a node 3418 /*! 3419 * \param s a date/time in ISO.... format. (YYYY-MM-DDThh:mm:ss) 3420 */ 3421 Attrib supported = required_attributes() | optional_attributes(); 3422 if ( !(DATETIME & supported) ) { 3423 throw ValueError("datetime is not supported for " + classname() ); 3424 } 3425 else { 3426 string time = parseDate( s ); 3427 if ( time.empty() ) { 3428 throw ValueError( "invalid datetime, must be in YYYY-MM-DDThh:mm:ss format: " + s ); 3429 } 3430 _datetime = time; 3431 } 3432 } 3433 getDateTime() const3434 const string AbstractElement::getDateTime() const { 3435 /// return the _datetime value 3436 return _datetime; 3437 } 3438 addPosAnnotation(const KWargs & inargs)3439 PosAnnotation *AllowInlineAnnotation::addPosAnnotation( const KWargs& inargs ) { 3440 /// add a PosAnnotation node given the parameters 3441 /*! 3442 * \param inargs A list of Attribute-Value pairs 3443 * \return the created PosAnnotation node 3444 * 3445 * when the *this node already has a PosAnnotation in the specified set, 3446 * an ALTERNATIVE node is added 3447 */ 3448 KWargs args = inargs; 3449 string st; 3450 auto it = args.find("set" ); 3451 if ( it != args.end() ) { 3452 st = it->second; 3453 } 3454 string newId = args.extract("generate_id" ); 3455 if ( newId.empty() ){ 3456 newId = "alt-pos"; 3457 } 3458 if ( has_annotation<PosAnnotation>( st ) > 0 ) { 3459 // ok, there is already one, so create an Alternative 3460 KWargs kw; 3461 kw["xml:id"] = generateId( newId ); 3462 if ( !doc()->declared( AnnotationType::ALTERNATIVE ) ){ 3463 doc()->declare( AnnotationType::ALTERNATIVE, "" ); 3464 } 3465 Alternative *alt = new Alternative( kw, doc() ); 3466 append( alt ); 3467 return alt->addAnnotation<PosAnnotation>( args ); 3468 } 3469 else { 3470 return addAnnotation<PosAnnotation>( args ); 3471 } 3472 } 3473 getPosAnnotations(const string & st,vector<PosAnnotation * > & alts) const3474 PosAnnotation* AllowInlineAnnotation::getPosAnnotations( const string& st, 3475 vector<PosAnnotation*>& alts ) const { 3476 /// return the PosAnnotation AND all alternatives 3477 /*! 3478 * \param st the annotation set 3479 * \param alts all the alternatives in set st 3480 * \return the PosAnnotation in set st 3481 * 3482 * \note The return value may be 0, even when there ARE alternatives! 3483 */ 3484 PosAnnotation *res = annotation<PosAnnotation>( st ); // may be 0 3485 alts.clear(); 3486 // now search for alternatives 3487 vector<Alternative *> alt_nodes = select<Alternative>( AnnoExcludeSet ); 3488 for ( const auto& alt : alt_nodes ){ 3489 if ( alt->size() > 0 ) { // child elements? 3490 for ( size_t j=0; j < alt->size(); ++j ) { 3491 if ( alt->index(j)->element_id() == PosAnnotation_t && 3492 ( st.empty() || alt->index(j)->sett() == st ) ) { 3493 alts.push_back( dynamic_cast<PosAnnotation*>(alt->index(j)) ); 3494 } 3495 } 3496 } 3497 } 3498 return res; 3499 } 3500 addLemmaAnnotation(const KWargs & inargs)3501 LemmaAnnotation *AllowInlineAnnotation::addLemmaAnnotation( const KWargs& inargs ) { 3502 /// add a LemmaAnnotation node given the parameters 3503 /*! 3504 * \param inargs A list of Attribute-Value pairs 3505 * \return the created LemmaAnnotation node 3506 * 3507 * when the *this node already has a LemmaAnnotation in the specified set, 3508 * an ALTERNATIVE node is added 3509 */ 3510 KWargs args = inargs; 3511 string st; 3512 auto it = args.find("set" ); 3513 if ( it != args.end() ) { 3514 st = it->second; 3515 } 3516 string newId = args.extract("generate_id" ); 3517 if ( newId.empty() ){ 3518 newId = "alt-lem"; 3519 } 3520 if ( has_annotation<LemmaAnnotation>( st ) > 0 ) { 3521 // ok, there is already one, so create an Alternative 3522 KWargs kw; 3523 kw["xml:id"] = generateId( newId ); 3524 if ( !doc()->declared( AnnotationType::ALTERNATIVE ) ){ 3525 doc()->declare( AnnotationType::ALTERNATIVE, "" ); 3526 } 3527 Alternative *alt = new Alternative( kw, doc() ); 3528 append( alt ); 3529 return alt->addAnnotation<LemmaAnnotation>( args ); 3530 } 3531 else { 3532 return addAnnotation<LemmaAnnotation>( args ); 3533 } 3534 } 3535 getLemmaAnnotations(const string & st,vector<LemmaAnnotation * > & alts) const3536 LemmaAnnotation* AllowInlineAnnotation::getLemmaAnnotations( const string& st, 3537 vector<LemmaAnnotation*>& alts ) const { 3538 /// return the LemmaAnnotation AND all alternatives 3539 /*! 3540 * \param st the annotation set 3541 * \param alts all the alternatives in set st 3542 * \return the LemmaAnnotation in set st 3543 * 3544 * \note The return value may be 0, even when there ARE alternatives! 3545 */ 3546 alts.clear(); 3547 LemmaAnnotation *res = annotation<LemmaAnnotation>( st ); // may be 0 ! 3548 // also search alternatives 3549 vector<Alternative *> alt_nodes = select<Alternative>( AnnoExcludeSet ); 3550 for ( const auto& alt : alt_nodes ){ 3551 if ( alt->size() > 0 ) { // child elements? 3552 for ( size_t j =0; j < alt->size(); ++j ) { 3553 if ( alt->index(j)->element_id() == LemmaAnnotation_t && 3554 ( st.empty() || alt->index(j)->sett() == st ) ) { 3555 alts.push_back( dynamic_cast<LemmaAnnotation*>(alt->index(j)) ); 3556 } 3557 } 3558 } 3559 } 3560 return res; 3561 } 3562 addSentence(const KWargs & in_args)3563 Sentence *AbstractElement::addSentence( const KWargs& in_args ) { 3564 /// add a Sentence node given the parameters 3565 /*! 3566 * \param in_args A list of Attribute-Value pairs 3567 * \return the created Sentence 3568 * may throw when the 'xml:id' is nor unique 3569 */ 3570 Sentence *res = 0; 3571 KWargs kw = in_args; 3572 if ( !kw.is_present("xml:id") ){ 3573 string id = generateId( "s" ); 3574 kw["xml:id"] = id; 3575 } 3576 try { 3577 res = new Sentence( kw, doc() ); 3578 } 3579 catch( const DuplicateIDError& e ) { 3580 res->destroy(); 3581 throw; 3582 } 3583 append( res ); 3584 return res; 3585 } 3586 addWord(const KWargs & in_args)3587 Word *AbstractElement::addWord( const KWargs& in_args ) { 3588 /// add a Word node given the parameters 3589 /*! 3590 * \param in_args A list of Attribute-Value pairs 3591 * \return the created Word 3592 * may throw when the 'xml:id' is nor unique, or when appending fails 3593 */ 3594 Word *res = new Word( doc() ); 3595 KWargs kw = in_args; 3596 if ( !kw.is_present("xml:id") ){ 3597 string id = generateId( "w" ); 3598 kw["xml:id"] = id; 3599 } 3600 try { 3601 res->setAttributes( kw ); 3602 } 3603 catch( const DuplicateIDError& e ) { 3604 res->destroy(); 3605 throw; 3606 } 3607 append( res ); 3608 return res; 3609 } 3610 addWord(const string & s)3611 Word *AbstractElement::addWord( const string& s ){ 3612 /// add a Word given the string s 3613 /*! 3614 \param s a string with text OR an encode KWargs list 3615 \return a new Word 3616 3617 If the string is a KWargs properties list it is used to create a word 3618 with those properties. Otherwise it is assumed that \em s represents 3619 the text value for the Word 3620 */ 3621 KWargs args = getArgs(s); 3622 if ( args.empty() ){ 3623 args["text"] = s; 3624 } 3625 return addWord( args ); 3626 } 3627 generateId(const string & tag)3628 const string AllowGenerateID::generateId( const string& tag ){ 3629 /// generate an new xml:id 3630 /*! 3631 * \param tag an extra string to use in the result 3632 * \return a string with an unique id 3633 * 3634 * The new id is constructed from the elements id, or from a parent id 3635 */ 3636 string nodeId = id(); 3637 // cerr << "node: " << this << endl; 3638 // cerr << "ID=" << nodeId << endl; 3639 if ( nodeId.empty() ){ 3640 // if no ID, look upward. 3641 FoliaElement *par = parent(); 3642 if ( !par ){ 3643 throw XmlError( "unable to generate an ID. No StructureElement parent found?" ); 3644 } 3645 // cerr << "call on parent:" << par << endl; 3646 return par->generateId( tag ); 3647 } 3648 else { 3649 int max = 0; 3650 if ( !tag.empty() ) { 3651 max = ++id_map[tag]; 3652 } 3653 // cerr << "MAX = " << max << endl; 3654 string id = nodeId + '.' + tag + '.' + TiCC::toString( max ); 3655 // cerr << "new id = " << id << endl; 3656 return id; 3657 } 3658 } 3659 setMaxId(FoliaElement * child)3660 void AllowGenerateID::setMaxId( FoliaElement *child ) { 3661 /// register the child id for later use 3662 /*! 3663 * \param child 3664 * if the child has an id, try to extract the last part as a number 3665 * if so, check the registration of that numer for the childs tag 3666 */ 3667 if ( !child->id().empty() && !child->xmltag().empty() ) { 3668 vector<string> parts = TiCC::split_at( child->id(), "." ); 3669 if ( !parts.empty() ) { 3670 string val = parts.back(); 3671 int i; 3672 try { 3673 i = stringTo<int>( val ); 3674 } 3675 catch ( const exception& ) { 3676 // no number, so assume some user defined id 3677 return; 3678 } 3679 const auto& it = id_map.find( child->xmltag() ); 3680 if ( it == id_map.end() ) { 3681 id_map[child->xmltag()] = i; 3682 } 3683 else { 3684 if ( it->second < i ) { 3685 it->second = i; 3686 } 3687 } 3688 } 3689 } 3690 } 3691 3692 //#define DEBUG_CORRECT 1 3693 correct(const vector<FoliaElement * > & _original,const vector<FoliaElement * > & _current,const vector<FoliaElement * > & _newv,const vector<FoliaElement * > & _suggestions,const KWargs & args_in)3694 Correction * AllowCorrections::correct( const vector<FoliaElement*>& _original, 3695 const vector<FoliaElement*>& _current, 3696 const vector<FoliaElement*>& _newv, 3697 const vector<FoliaElement*>& _suggestions, 3698 const KWargs& args_in ) { 3699 /// generic function to correct a group of FoliaElements into a Correction 3700 /*! 3701 * \param _original a group of nodes to correct and add to the Original 3702 * \param _current a group of nodes to add to the Current 3703 * \param _newv a group of nodes to replace _original, added to New 3704 * \param _suggestions a group of nodes to add to Suggestions 3705 * \param args_in additional arguments 3706 * \return the Correction node. Might throw on problems 3707 */ 3708 #ifdef DEBUG_CORRECT 3709 cerr << "correct " << this << endl; 3710 cerr << "original= " << _original << endl; 3711 cerr << "current = " << _current << endl; 3712 cerr << "new = " << _newv << endl; 3713 cerr << "suggestions = " << _suggestions << endl; 3714 cerr << "args in = " << args_in << endl; 3715 #endif 3716 // Apply a correction 3717 Document *doc = this->doc(); 3718 Correction *corr = 0; 3719 bool hooked = false; 3720 New *addnew = 0; 3721 KWargs args = args_in; 3722 vector<FoliaElement*> original = _original; 3723 vector<FoliaElement*> _new = _newv; 3724 vector<FoliaElement*> suggestions = _suggestions; 3725 auto it = args.find("new"); 3726 if ( it != args.end() ) { 3727 KWargs my_args; 3728 my_args["value"] = it->second; 3729 TextContent *t = new TextContent( my_args, doc ); 3730 _new.push_back( t ); 3731 args.erase( it ); 3732 } 3733 it = args.find("suggestion"); 3734 if ( it != args.end() ) { 3735 KWargs my_args; 3736 my_args["value"] = it->second; 3737 TextContent *t = new TextContent( my_args, doc ); 3738 suggestions.push_back( t ); 3739 args.erase( it ); 3740 } 3741 it = args.find("reuse"); 3742 if ( it != args.end() ) { 3743 // reuse an existing correction instead of making a new one 3744 try { 3745 corr = dynamic_cast<Correction*>(doc->index(it->second)); 3746 } 3747 catch ( const exception& e ) { 3748 throw ValueError("reuse= must point to an existing correction id!"); 3749 } 3750 if ( !corr->isinstance( Correction_t ) ) { 3751 throw ValueError("reuse= must point to an existing correction id!"); 3752 } 3753 hooked = true; 3754 if ( !_new.empty() && corr->hasCurrent() ) { 3755 // can't add new if there's current, so first set original to current, and then delete current 3756 3757 if ( !_current.empty() ) { 3758 throw runtime_error( "Can't set both new= and current= !"); 3759 } 3760 if ( original.empty() ) { 3761 // move the current to Original 3762 FoliaElement *cur = corr->getCurrent(); 3763 original.push_back( cur ); 3764 corr->remove( cur ); 3765 } 3766 } 3767 } 3768 else { 3769 KWargs args2 = args; 3770 args2.erase("suggestion" ); 3771 args2.erase("suggestions" ); 3772 string id = generateId( "correction" ); 3773 args2["xml:id"] = id; 3774 corr = new Correction( args2, doc ); 3775 } 3776 #ifdef DEBUG_CORRECT 3777 cerr << "now corr= " << corr << endl; 3778 #endif 3779 if ( !_current.empty() ) { 3780 if ( !original.empty() || !_new.empty() ) { 3781 throw runtime_error("When setting current=, original= and new= can not be set!"); 3782 } 3783 for ( const auto& cur : _current ) { 3784 FoliaElement *add = new Current( doc ); 3785 cur->set_parent(0); 3786 add->append( cur ); 3787 corr->replace( add ); 3788 if ( !hooked ) { 3789 for ( size_t i=0; i < size(); ++i ) { 3790 if ( index(i) == cur ) { 3791 replace( index(i), corr ); 3792 hooked = true; 3793 } 3794 } 3795 } 3796 } 3797 #ifdef DEBUG_CORRECT 3798 cerr << "now corr= " << corr << endl; 3799 #endif 3800 } 3801 if ( !_new.empty() ) { 3802 #ifdef DEBUG_CORRECT 3803 cerr << "there is new! " << endl; 3804 #endif 3805 vector<New*> old_new = corr->select<New>(); 3806 if ( !old_new.empty() && old_new[0]->size() == 0 ){ 3807 // there is an EMPTY <new> tag! 3808 // use it to expand 3809 addnew = old_new[0]; 3810 } 3811 else { 3812 // create a <new> tag, might throw is there is a non-empty one 3813 addnew = new New( doc ); 3814 corr->append(addnew); 3815 } 3816 for ( const auto& nw : _new ) { 3817 nw->set_parent(0); 3818 addnew->append( nw ); 3819 } 3820 #ifdef DEBUG_CORRECT 3821 cerr << "after adding NEW: " << corr->xmlstring() << endl; 3822 #endif 3823 vector<Current*> v = corr->FoliaElement::select<Current>(); 3824 //delete current if present 3825 for ( const auto& cur:v ) { 3826 corr->remove( cur ); 3827 } 3828 #ifdef DEBUG_CORRECT 3829 cerr << "after removing CUR: " << corr->xmlstring() << endl; 3830 #endif 3831 } 3832 else if ( !original.empty() ){ 3833 vector<New*> old_new = corr->select<New>(); 3834 if ( !old_new.empty() && old_new[0]->size() == 0 ){ 3835 // there is aleady an EMPTY <new> tag! 3836 } 3837 else { 3838 // create a <new> tag, might throw is there is a non-empty one 3839 New *add_new = new New( doc ); 3840 corr->append(add_new); 3841 } 3842 } 3843 if ( !original.empty() ) { 3844 #ifdef DEBUG_CORRECT 3845 cerr << "there is original! " << endl; 3846 #endif 3847 FoliaElement *add = new Original( doc ); 3848 corr->replace(add); 3849 #ifdef DEBUG_CORRECT 3850 cerr << " corr after replacing original " << corr->xmlstring() << endl; 3851 cerr << " new original= " << add << endl; 3852 #endif 3853 for ( const auto& org: original ) { 3854 #ifdef DEBUG_CORRECT 3855 cerr << " examine org " << org << endl; 3856 #endif 3857 bool dummyNode = ( org->id() == "dummy" ); 3858 if ( !dummyNode ) { 3859 org->set_parent(0); 3860 add->append( org ); 3861 } 3862 #ifdef DEBUG_CORRECT 3863 cerr << " NOW original= " << add << endl; 3864 #endif 3865 for ( size_t i=0; i < size(); ++i ) { 3866 #ifdef DEBUG_CORRECT 3867 cerr << "in loop, bekijk " << index(i) << endl; 3868 #endif 3869 if ( index(i) == org ) { 3870 #ifdef DEBUG_CORRECT 3871 cerr << "OK hit on ORG :" << org << endl; 3872 #endif 3873 if ( !hooked ) { 3874 #ifdef DEBUG_CORRECT 3875 cerr << "it isn't hooked!" << endl; 3876 FoliaElement *tmp = replace( index(i), corr ); 3877 cerr << " corr after replace " << corr->xmlstring() << endl; 3878 cerr << " replaced " << tmp << endl; 3879 #else 3880 replace( index(i), corr ); 3881 #endif 3882 hooked = true; 3883 } 3884 else { 3885 #ifdef DEBUG_CORRECT 3886 cerr << " corr before remove " << corr << endl; 3887 cerr << " remove " << org << endl; 3888 #endif 3889 this->remove( org ); 3890 #ifdef DEBUG_CORRECT 3891 cerr << " corr after remove " << corr << endl; 3892 #endif 3893 } 3894 } 3895 } 3896 } 3897 if ( add->size() == 0 ){ 3898 corr->remove( add ); 3899 } 3900 } 3901 else if ( addnew ) { 3902 // original not specified, find automagically: 3903 vector<FoliaElement *> orig; 3904 #ifdef DEBUG_CORRECT 3905 cerr << "start to look for original " << endl; 3906 #endif 3907 for ( size_t i=0; i < len(addnew); ++ i ) { 3908 FoliaElement *p = addnew->index(i); 3909 #ifdef DEBUG_CORRECT 3910 cerr << "bekijk " << p << endl; 3911 #endif 3912 vector<FoliaElement*> v = p->find_replacables( this ); 3913 // for ( const auto& el: v ) { 3914 // orig.push_back( el ); 3915 // } 3916 copy( v.begin(), v.end(), back_inserter(orig) ); 3917 } 3918 if ( orig.empty() ) { 3919 throw runtime_error( "No original= specified and unable to automatically infer"); 3920 } 3921 else { 3922 #ifdef DEBUG_CORRECT 3923 cerr << "we seem to have some originals! " << endl; 3924 #endif 3925 FoliaElement *add = new Original( doc ); 3926 #ifdef DEBUG_CORRECT 3927 cerr << "corr before adding new original! " << corr << endl; 3928 #endif 3929 corr->replace(add); 3930 #ifdef DEBUG_CORRECT 3931 cerr << "corr after adding new original! " << corr << endl; 3932 cerr << "now parent = " << add->parent() << endl; 3933 #endif 3934 3935 for ( const auto& org: orig ) { 3936 #ifdef DEBUG_CORRECT 3937 cerr << " examine original : " << org << endl; 3938 cerr << "with parent = " << org->parent() << endl; 3939 #endif 3940 // first we lookup org in our data and remove it there 3941 for ( size_t i=0; i < size(); ++i ) { 3942 #ifdef DEBUG_CORRECT 3943 cerr << "in loop, bekijk " << index(i) << endl; 3944 #endif 3945 if ( index(i) == org ) { 3946 #ifdef DEBUG_CORRECT 3947 cerr << "found original " << endl; 3948 #endif 3949 if ( !hooked ) { 3950 #ifdef DEBUG_CORRECT 3951 cerr << "it isn't hooked!" << endl; 3952 FoliaElement *tmp = replace( index(i), corr ); 3953 cerr << " corr after replace " << corr << endl; 3954 cerr << " replaced " << tmp << endl; 3955 #else 3956 replace( index(i), corr ); 3957 #endif 3958 3959 hooked = true; 3960 } 3961 else { 3962 #ifdef DEBUG_CORRECT 3963 cerr << " corr before remove " << corr << endl; 3964 cerr << " remove " << org << endl; 3965 #endif 3966 this->remove( org ); 3967 #ifdef DEBUG_CORRECT 3968 cerr << " corr after remove " << corr << endl; 3969 #endif 3970 } 3971 } 3972 } 3973 // now we conect org to the new original node 3974 org->set_parent( 0 ); 3975 add->append( org ); 3976 #ifdef DEBUG_CORRECT 3977 cerr << " add after append : " << add << endl; 3978 cerr << "parent = " << org->parent() << endl; 3979 #endif 3980 } 3981 vector<Current*> v = corr->FoliaElement::select<Current>(); 3982 //delete current if present 3983 for ( const auto& cur: v ) { 3984 #ifdef DEBUG_CORRECT 3985 cerr << " remove cur=" << cur << endl; 3986 #endif 3987 this->remove( cur ); 3988 } 3989 } 3990 } 3991 #ifdef DEBUG_CORRECT 3992 cerr << " corr after edits " << corr->xmlstring() << endl; 3993 #endif 3994 if ( addnew ) { 3995 for ( const auto& org : original ) { 3996 #ifdef DEBUG_CORRECT 3997 cerr << " remove " << org << endl; 3998 #endif 3999 bool dummyNode = ( org->id() == "dummy" ); 4000 corr->remove( org ); 4001 if ( dummyNode ){ 4002 org->destroy(); 4003 } 4004 } 4005 } 4006 #ifdef DEBUG_CORRECT 4007 cerr << " corr after removes " << corr->xmlstring() << endl; 4008 #endif 4009 if ( !suggestions.empty() ) { 4010 if ( !hooked ) { 4011 append(corr); 4012 } 4013 for ( const auto& sug : suggestions ) { 4014 if ( sug->isinstance( Suggestion_t ) ) { 4015 sug->set_parent(0); 4016 corr->append( sug ); 4017 } 4018 else { 4019 FoliaElement *add = new Suggestion( doc ); 4020 sug->set_parent(0); 4021 add->append( sug ); 4022 corr->append( add ); 4023 } 4024 } 4025 } 4026 4027 it = args.find("reuse"); 4028 if ( it != args.end() ) { 4029 it = args.find("annotator"); 4030 if ( it != args.end() ) { 4031 corr->annotator( it->second ); 4032 } 4033 it = args.find("annotatortype"); 4034 if ( it != args.end() ){ 4035 corr->annotatortype( stringTo<AnnotatorType>(it->second) ); 4036 } 4037 it = args.find("confidence"); 4038 if ( it != args.end() ) { 4039 corr->confidence( stringTo<double>(it->second) ); 4040 } 4041 } 4042 return corr; 4043 } 4044 correct(const string & s)4045 Correction *AllowCorrections::correct( const string& s ) { 4046 /// use an Attribute-Value list to create a Correction 4047 /*! 4048 * \param s a string representation of a Attribute-Value list 4049 * \return the created Correcion 4050 * The parameter is converted to a KWargs list which is handled over 4051 * to correct() 4052 */ 4053 vector<FoliaElement*> nil1; 4054 vector<FoliaElement*> nil2; 4055 vector<FoliaElement*> nil3; 4056 vector<FoliaElement*> nil4; 4057 KWargs args = getArgs( s ); 4058 // cerr << xmltag() << "::correct() <== " << this << endl; 4059 Correction *tmp = correct( nil1, nil2, nil3, nil4, args ); 4060 // cerr << xmltag() << "::correct() ==> " << this << endl; 4061 return tmp; 4062 } 4063 correct(FoliaElement * _old,FoliaElement * _new,const vector<FoliaElement * > & sugg,const KWargs & args)4064 Correction *AllowCorrections::correct( FoliaElement *_old, 4065 FoliaElement *_new, 4066 const vector<FoliaElement*>& sugg, 4067 const KWargs& args ) { 4068 /// create a correction using the parameters 4069 /*! 4070 * \param _old the node to correct 4071 * \param _new the corrected node 4072 * \param sugg a list of possible suggestions 4073 * \param args additonal arguments 4074 * \return the created Correcion 4075 */ 4076 vector<FoliaElement *> nv; 4077 nv.push_back( _new ); 4078 vector<FoliaElement *> ov; 4079 ov.push_back( _old ); 4080 vector<FoliaElement *> nil; 4081 // cerr << xmltag() << "::correct() <== " << this << endl; 4082 Correction *tmp = correct( ov, nil, nv, sugg, args ); 4083 // cerr << xmltag() << "::correct() ==> " << this << endl; 4084 return tmp; 4085 } 4086 correct(FoliaElement * _old,FoliaElement * _new,const KWargs & args)4087 Correction *AllowCorrections::correct( FoliaElement* _old, 4088 FoliaElement* _new, 4089 const KWargs& args ) { 4090 /// create a correction using the parameters 4091 /*! 4092 * \param _old the node to correct 4093 * \param _new the corrected node 4094 * \param args additonal arguments 4095 * \return the created Correcion 4096 */ 4097 const vector<FoliaElement*> sugg; 4098 return correct( _old, _new, sugg, args ); 4099 } 4100 feats(const string & s) const4101 vector<string> AbstractElement::feats( const string& s ) const { 4102 /// return all classes of the given subset 4103 /*! 4104 * \param s a subset name 4105 * \return a list of all classes in the subset of the Feature nodes 4106 * The function loops through all children and for Feature_t children 4107 * it check the subset and collects the matching ones 4108 */ 4109 vector<string> result; 4110 for ( const auto& el : data() ) { 4111 if ( el->isSubClass( Feature_t ) && 4112 el->subset() == s ) { 4113 result.push_back( el->cls() ); 4114 } 4115 } 4116 return result; 4117 } 4118 feat(const string & s) const4119 const string AbstractElement::feat( const string& s ) const { 4120 /// return the class of the first matching Feature with subset s 4121 /*! 4122 * \param s a subset name 4123 * \return the first class of the first Feature node in subset s 4124 */ 4125 const auto& it = find_if( _data.begin(), _data.end(), 4126 [s]( const FoliaElement *e ){ 4127 return ( e->isSubClass( Feature_t ) 4128 && e->subset() == s ); } ); 4129 if ( it == _data.end() ){ 4130 return ""; 4131 } 4132 else { 4133 return (*it)->cls(); 4134 } 4135 } get_metadata() const4136 const MetaData* AbstractElement::get_metadata() const { 4137 /// Get the MetaData node related to this element 4138 /*! 4139 * \return the _metadata or 0 if not available 4140 * may recurse upwards through the parent nodes 4141 */ 4142 if ( !_metadata.empty() && doc() ){ 4143 return doc()->get_submetadata(_metadata); 4144 } 4145 else if ( parent() ){ 4146 return parent()->get_metadata(); 4147 } 4148 else { 4149 return 0; 4150 } 4151 } 4152 get_metadata(const string & key) const4153 const string AbstractElement::get_metadata( const string& key ) const { 4154 /// Get the metadata value for this key 4155 /*! 4156 * \param key which metadata field do we want? 4157 * \return the metadata value for this key 4158 */ 4159 if ( !_metadata.empty() && doc() ){ 4160 const MetaData *what = doc()->get_submetadata(_metadata); 4161 if ( what && what->datatype() == "NativeMetaData" && !key.empty() ){ 4162 return what->get_val( key ); 4163 } 4164 return ""; 4165 } 4166 else if ( parent() ){ 4167 return parent()->get_metadata( key ); 4168 } 4169 else { 4170 return ""; 4171 } 4172 } 4173 selectSpan() const4174 vector<AbstractSpanAnnotation*> AbstractElement::selectSpan() const { 4175 /// select all SpanAnnotation nodes in the FoliaElement 4176 /*! 4177 * \return a list of SpanAnnotation nodes. 4178 * All possible Span types are collected in this list. (see SpanSet) 4179 */ 4180 vector<AbstractSpanAnnotation*> res; 4181 for ( const auto& el : SpanSet ) { 4182 vector<FoliaElement*> tmp = select( el ); 4183 transform( tmp.begin(), tmp.end(), 4184 back_inserter(res), 4185 [&]( FoliaElement *e ){ 4186 return dynamic_cast<AbstractSpanAnnotation*>( e ); } ); 4187 } 4188 return res; 4189 } 4190 4191 } // namespace folia 4192