1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of libfolia 7 8 libfolia is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 libfolia is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ticcutils/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 26 */ 27 28 #ifndef FOLIA_DOCUMENT_H 29 #define FOLIA_DOCUMENT_H 30 31 #include <list> 32 #include <set> 33 #include <map> 34 #include <vector> 35 #include <string> 36 #include <iostream> 37 #include "unicode/unistr.h" 38 #include "unicode/regex.h" 39 #include "libxml/tree.h" 40 #include "libxml/xpath.h" 41 #include "libfolia/folia.h" 42 43 using namespace icu; 44 45 namespace folia { 46 47 extern const std::string NSFOLIA; 48 49 enum ElementType : unsigned int; 50 51 class Pattern { 52 friend std::ostream& operator<<( std::ostream&, const Pattern& ); 53 public: 54 Pattern( const std::vector<std::string>&, 55 const ElementType = BASE, 56 const std::string& = "" ); 57 Pattern( const std::vector<std::string>&, const std::string& ); 58 59 ~Pattern(); 60 bool match( const UnicodeString& , size_t&, int&, bool&, bool& ) const; size()61 size_t size() const { return sequence.size(); }; 62 void unsetwild(); 63 bool variablesize() const; 64 std::set<int> variablewildcards() const; 65 ElementType matchannotation; 66 bool regexp; 67 private: 68 bool case_sensitive; 69 int maxgapsize; 70 std::vector<UnicodeString> sequence; 71 std::vector<RegexMatcher*> matchers; 72 std::string matchannotationset; 73 }; 74 75 class FoliaElement; 76 class Word; 77 class Sentence; 78 class Paragraph; 79 class processor; 80 class Provenance; 81 82 class Document { 83 friend std::ostream& operator<<( std::ostream& os, const Document *d ); 84 /// enum Mode determines runtime characteristic of the document 85 /*! 86 The default settings are CHECKTEXT and AUTODECLARE 87 */ 88 enum Mode { 89 NOMODE=0, //!< no special mode is set. 90 PERMISSIVE=1, //!< be permissive for certain incompatablities 91 CHECKTEXT=2, //!< check text consistency 92 FIXTEXT=4, //!< try to fix text inconsistencies in the fly 93 STRIP=8, //!< on output, strip 94 CANONICAL=16, //!< sort ouput in a reproducable way. 95 AUTODECLARE=32, //!< Automagicly add missing Annotation Declarations 96 EXPLICIT=64 //!< add all set information 97 }; 98 friend class Engine; 99 public: 100 Document(); 101 explicit Document( const KWargs& ); 102 explicit Document( const std::string& ); 103 ~Document(); 104 void init(); 105 void init_args( const KWargs& ); 106 bool read_from_string( const std::string& ); readFromString(const std::string & s)107 bool readFromString( const std::string& s ){ 108 /// backward compatability. read_from_string() is preferred 109 return read_from_string( s ); 110 } 111 bool read_from_file( const std::string& ); readFromFile(const std::string & s)112 bool readFromFile( const std::string& s ){ 113 /// backward compatability. read_from_file() is preferred 114 return read_from_file( s ); 115 } 116 bool save( std::ostream&, const std::string&, bool = false ) const; 117 bool save( std::ostream& os, bool canonical = false ) const { 118 /// save a Document to a stream without using a namespace name 119 return save( os, "", canonical ); 120 } 121 bool save( const std::string&, const std::string&, bool = false ) const ; 122 bool save( const std::string& s, bool canonical = false ) const { 123 /// save a Document to a file without using a namespace name 124 return save( s, "", canonical ); 125 } 126 std::string xmlstring( bool = false ) const; 127 doc()128 FoliaElement* doc() const { 129 /// return a pointer to the internal FoLiA tree 130 return foliadoc; 131 } 132 133 template <typename T> create_root(const KWargs &)134 T *create_root( const KWargs& ){ 135 throw std::logic_error( "create_root() only possible for 'Text' and 'Speech'" ); 136 } 137 template <typename T> create_root()138 T *create_root(){ 139 throw std::logic_error( "create_root() only possible for 'Text' and 'Speech'" ); 140 } 141 142 FoliaElement* append( FoliaElement *t ); 143 Text* setTextRoot(); 144 Text* setTextRoot( const KWargs& ); 145 Speech* setSpeechRoot(); 146 Speech* setSpeechRoot( const KWargs& ); 147 FoliaElement *getRoot(); 148 // backward compatible: addText(KWargs & a)149 Text* addText( KWargs& a ){ return setTextRoot( a ); }; addText(Text * t)150 Text* addText( Text *t ){ return dynamic_cast<Text*>( append(t) ); }; addSpeech(KWargs & a)151 Speech* addSpeech( KWargs& a ){ return setSpeechRoot( a ); }; addSpeech(Speech * s)152 Speech* addSpeech( Speech *s ){ return dynamic_cast<Speech*>( append(s) ); }; 153 154 void set_foreign_metadata( xmlNode * ); 155 void addStyle( const std::string&, const std::string& ); 156 void replaceStyle( const std::string&, const std::string& ); 157 UnicodeString text( const std::string& = "current", 158 bool = false, 159 bool = false ) const; 160 UnicodeString text( const TextPolicy& ) const; 161 std::vector<Paragraph*> paragraphs() const; 162 std::vector<Sentence*> sentences() const; 163 std::vector<Sentence*> sentenceParts() const; 164 std::vector<Word*> words() const; 165 std::vector<std::vector<Word*> > findwords( const Pattern&, 166 const std::string& ="" ) const; 167 std::vector<std::vector<Word*> > findwords( std::list<Pattern>&, 168 const std::string& = "" ) const; 169 Word *words( size_t ) const; 170 Word *rwords( size_t ) const; 171 Paragraph *paragraphs( size_t ) const; 172 Paragraph *rparagraphs( size_t ) const; 173 Sentence *sentences( size_t ) const; 174 Sentence *rsentences( size_t ) const; 175 std::string toXml( const std::string& ="" ) const; 176 bool toXml( const std::string&, 177 const std::string& ) const; 178 std::string metadata_type() const; 179 std::string metadata_file() const; 180 std::string annotation_type_to_string( AnnotationType ) const; 181 void set_metadata( const std::string&, const std::string& ); 182 const std::string get_metadata( const std::string&) const; 183 processor *get_default_processor() const; 184 processor *get_processor( const std::string& ) const; 185 std::vector<processor*> get_processors_by_name( const std::string& ) const; 186 void add_doc_index( FoliaElement * ); 187 void del_doc_index( const std::string& ); 188 189 FoliaElement *index( const std::string& ) const; //retrieve element with specified ID 190 FoliaElement* operator []( const std::string& ) const ; //index as operator 191 bool declared( const AnnotationType&, 192 const std::string&, 193 const std::string&, 194 const AnnotatorType&, 195 const std::string& ) const; 196 bool declared( const AnnotationType&, 197 const std::string&, 198 const std::string&, 199 const AnnotatorType&, 200 const std::set<std::string>& ) const; 201 bool declared( const AnnotationType&, 202 const std::string& = "" ) const; 203 bool declared( ElementType, const std::string& = "" ) const; 204 std::string unalias( AnnotationType, 205 const std::string& ) const; 206 std::string alias( AnnotationType, 207 const std::string& ) const; 208 209 processor *add_processor( const KWargs&, processor * =0 ); 210 std::vector<std::string> get_annotators( AnnotationType, 211 const std::string& ="" ) const; 212 std::vector<const processor *> get_processors( AnnotationType, 213 const std::string& ="" ) const; 214 215 std::string default_set( AnnotationType ) const; 216 std::string original_default_set( AnnotationType ) const; 217 218 std::string default_annotator( AnnotationType, 219 const std::string& ="" ) const; 220 AnnotatorType default_annotatortype( AnnotationType, 221 const std::string& ="" ) const; 222 223 std::string default_datetime( AnnotationType, 224 const std::string& ="" ) const; 225 std::string default_processor( AnnotationType, 226 const std::string& ="" ) const; 227 std::string original_default_processor( AnnotationType ) const; 228 229 FoliaElement* parseXml( ); 230 id()231 std::string id() const { 232 /// return the Document id value 233 return _id; 234 }; 235 std::string language() const; 236 void auto_declare( AnnotationType, 237 const std::string& = "" ); 238 void declare( AnnotationType, 239 const std::string&, 240 const std::string& = "" ); 241 void declare( AnnotationType, 242 const std::string&, 243 const KWargs& ); 244 void declare( AnnotationType, 245 const std::string&, const std::string&, const std::string&, 246 const std::string&, const std::string&, 247 const std::set<std::string>&, 248 const std::string& = "" ); 249 void un_declare( AnnotationType, 250 const std::string& ); XmlDoc()251 const xmlDoc *XmlDoc() const { 252 /// return a pointer to the internal xmlDoc. handle with care. 253 return _xmldoc; 254 }; foliaNs()255 xmlNs *foliaNs() const { 256 /// return a pointer to the output namespace structure 257 return _foliaNsOut; 258 }; keepForDeletion(FoliaElement * p)259 void keepForDeletion( FoliaElement *p ) { 260 /// add FoliaElement \e p to the delSet 261 /*! 262 \param p the FoliaElement to keep for later annihilation 263 the delSet is kept until the destruction of the Document 264 */ 265 delSet.insert( p ); 266 }; addExternal(External * p)267 void addExternal( External *p ) { 268 /// add a node to the _externals list 269 /*! 270 \param p The node to add 271 */ 272 _externals.push_back( p ); 273 }; 274 void resolveExternals(); 275 int debug; //!< the debug level. 0 means NO debugging. 276 277 /// is the PERMISSIVE mode set? permissive()278 bool permissive() const { return mode & PERMISSIVE; }; 279 /// is the CHECKTEXT mode set? checktext()280 bool checktext() const { return mode & CHECKTEXT; }; 281 /// is the FIXTEXT mode set? fixtext()282 bool fixtext() const { return mode & FIXTEXT; }; 283 /// is the STRIP mode set? strip()284 bool strip() const { return mode & STRIP; }; 285 /// is the CANONICAL mode set? canonical()286 bool canonical() const { return mode & CANONICAL; }; 287 /// is the AUTODECLARE mode set? autodeclare()288 bool autodeclare() const { return mode & AUTODECLARE; }; has_explicit()289 bool has_explicit() const { return mode & EXPLICIT; }; 290 bool set_permissive( bool ) const; // defined const, but the mode is mutable! 291 bool set_checktext( bool ) const; // defined const, but the mode is mutable! 292 bool set_fixtext( bool ) const; // defined const, but the mode is mutable! 293 bool set_strip( bool ) const; // defined const, but the mode is mutable! 294 bool set_canonical( bool ) const; // defined const, but the mode is mutable! 295 bool set_autodeclare( bool ) const; // defined const, but the mode is mutable! 296 bool set_explicit( bool ) const; // defined const, but the mode is mutable! 297 /// this class holds annotation declaration information 298 class at_t { 299 friend std::ostream& operator<<( std::ostream& os, const at_t& at ); 300 public: at_t(const std::string & a,const AnnotatorType & t,const std::string & d,const std::string & f,const std::set<std::string> & p)301 at_t( const std::string& a, 302 const AnnotatorType& t, 303 const std::string& d, 304 const std::string& f, 305 const std::set<std::string>& p ): 306 _annotator(a), 307 _ann_type(t), 308 _date(d), 309 _format(f), 310 _processors(p){}; 311 std::string _annotator; ///< the annotator as a string 312 AnnotatorType _ann_type; ///< the annotator type 313 std::string _date; ///< the timestamp as a string 314 std::string _format; ///< the format 315 std::set<std::string> _processors; ///< the id's of all associated processors 316 }; 317 318 void incrRef( AnnotationType, const std::string& ); 319 void decrRef( AnnotationType, const std::string& ); 320 void setmode( const std::string& ) const; 321 std::string getmode() const; setdebug(int val)322 int setdebug( int val ){ 323 /// set the debug level 324 /*! 325 \param val the new debug value 326 \return the old debug value 327 */ 328 int ret=debug; debug=val; return ret; 329 }; 330 std::multimap<AnnotationType,std::string> unused_declarations( ) const; get_submetadata(const std::string & m)331 const MetaData *get_submetadata( const std::string& m ){ 332 /// get the metadata structure with value \e m 333 /*! 334 \param m the value we search 335 \return the found MetaData element, or 0 336 */ 337 const auto& it = submetadata.find( m ); 338 if ( it == submetadata.end() ){ 339 return 0; 340 } 341 else { 342 return it->second; 343 } 344 } cache_textcontent(TextContent * tc)345 void cache_textcontent( TextContent *tc ){ 346 /// add a TextContent to the validation buffer 347 /*! 348 \param tc the TextContent to add to the buffer 349 on a call to validate_offsets() this buffer is used to validate 350 all offsets. 351 */ 352 t_offset_validation_buffer.push_back( tc ); 353 } cache_phoncontent(PhonContent * pc)354 void cache_phoncontent( PhonContent *pc ){ 355 /// add a PhonContent to the validation buffer 356 /*! 357 \param pc the PhonContent to add 358 on a call to validate_offsets() this buffer is used to validate 359 all offsets. 360 */ 361 p_offset_validation_buffer.push_back( pc ); 362 } 363 bool validate_offsets() const; 364 int compare_to_build_version() const; version()365 const std::string& version() const { 366 /// return the version string 367 return _version_string; 368 }; 369 std::string doc_version() const; 370 std::string update_version(); 371 bool version_below( int, int ) const; annotationdefaults()372 const std::map<AnnotationType,std::multimap<std::string,at_t>>& annotationdefaults() const { return _annotationdefaults; }; 373 void parse_metadata( const xmlNode * ); 374 void setDocumentProps( KWargs& ); provenance()375 Provenance *provenance() const { 376 /// return a pointer to the Provenance data 377 return _provenance; 378 }; filename()379 const std::string& filename() const { 380 /// return the filename the Document was created from 381 return _source_filename; 382 }; 383 void save_orig_ann_defaults(); set_incremental(bool b)384 void set_incremental( bool b ) { 385 /// set/unset the incremental_parse flag 386 _incremental_parse = b; 387 }; is_incremental()388 bool is_incremental() const { 389 /// return the value of the incremental_parse flag 390 return _incremental_parse; 391 }; set_preserve_spaces(bool b)392 void set_preserve_spaces( bool b ) { 393 /// set/unset the preserve_spaces flag 394 _preserve_spaces = b; 395 }; preserve_spaces()396 bool preserve_spaces() const { 397 /// return the value of the preserve_spaces flag 398 return _preserve_spaces; 399 } get_warn_count()400 int get_warn_count( ) const { 401 /// return the number of warnings 402 return _warn_count; 403 } reset_warn_count()404 void reset_warn_count( ) { 405 /// reset the number of warnings to 0 406 _warn_count = 0; 407 } increment_warn_count()408 void increment_warn_count() const { 409 /// increment the warning count 410 // NOTE: function is defined const, but the _warn_count is mutable 411 ++_warn_count; 412 } 413 414 private: 415 void adjustTextMode(); 416 std::map<AnnotationType,std::multimap<std::string,at_t> > _annotationdefaults; ///< stores all declared annotations per AnnotationType 417 ///< every AnnotationType can have multiple annotations even with the same 418 ///< setnames. hence a multimap 419 std::map<AnnotationType,std::map<std::string,bool> > _groupannotations; ///< 420 ///< register which annotations are GROUP annotations 421 std::vector<std::pair<AnnotationType,std::string>> _anno_sort; ///< 422 ///< register the original sorting of the annotation declarations in the 423 ///< input, so we can use that for output in the same order. (cannonical 424 ///< mode 425 std::map<AnnotationType,std::map<std::string,int> > _annotationrefs; ///< 426 ///< register the number of references to this AnnotationType/setname 427 std::map<AnnotationType,std::map<std::string,std::string>> _alias_set; ///< 428 ///< register the mapping from aliases to setnames per AnnotationType 429 std::map<AnnotationType,std::map<std::string,std::string>> _set_alias; ///< 430 ///< register the mapping from setname to aliases per AnnotationType 431 std::map<AnnotationType,std::string> _orig_ann_default_sets; ///< 432 ///< for folia::Engine we need to register the original mapping from a 433 ///< AnnoationType to a setname, because in the process more mappings 434 ///< can be added, loosing the default. 435 std::map<AnnotationType,std::string> _orig_ann_default_procs;///< 436 ///< for folia::Engine we need to register the original mapping from a 437 ///< AnnoationType to a processor name, because in the process more mappings 438 ///< can be added, loosing the default. 439 440 std::vector<TextContent*> t_offset_validation_buffer; ///< we register all 441 ///< TextContent nodes here to quickly access them for offset checks 442 ///< that check is performed directly after parsing 443 std::vector<PhonContent*> p_offset_validation_buffer; ///< we register all 444 ///< PhonContent nodes here to quickly access them for offset checks 445 ///< that check is performed directly after parsing 446 void parse_imdi( const xmlNode * ); 447 void parse_annotations( const xmlNode * ); 448 void parse_provenance( const xmlNode * ); 449 void parse_submeta( const xmlNode * ); 450 void parse_styles(); 451 void add_annotations( xmlNode * ) const; 452 void add_provenance( xmlNode * ) const; 453 void add_metadata( xmlNode * ) const; 454 void add_submetadata( xmlNode *) const; 455 void add_styles( xmlDoc* ) const; 456 void append_processor( xmlNode *, const processor * ) const; 457 xmlDoc *to_xmlDoc( const std::string& ="" ) const; 458 void add_one_anno( const std::pair<AnnotationType,std::string>&, 459 xmlNode *, 460 std::set<std::string>& ) const; 461 std::map<std::string, FoliaElement* > sindex; ///< the lookup table 462 ///< for FoliaElements by index (xml:id) (not all nodes do have an index) 463 // std::vector<FoliaElement*> data; 464 std::vector<External*> _externals; 465 std::string _id; 466 std::set<FoliaElement *> delSet; 467 FoliaElement *foliadoc; 468 xmlDoc *_xmldoc; 469 const xmlChar* _foliaNsIn_href; 470 const xmlChar* _foliaNsIn_prefix; 471 mutable xmlNs *_foliaNsOut; 472 Provenance *_provenance; 473 MetaData *_metadata; 474 ForeignMetaData *_foreign_metadata; 475 std::map<std::string,MetaData *> submetadata; 476 std::multimap<std::string,std::string> styles; 477 mutable Mode mode; 478 std::string _source_filename; 479 std::string _version_string; 480 int _major_version; 481 int _minor_version; 482 int _sub_version; 483 std::string _patch_version; 484 bool _external_document; 485 bool _incremental_parse; 486 bool _preserve_spaces; 487 mutable int _warn_count; 488 Document( const Document& ); // inhibit copies 489 Document& operator=( const Document& ); // inhibit copies 490 }; 491 492 template <> inline create_root(const KWargs & args)493 Text *Document::create_root( const KWargs& args ){ 494 return setTextRoot( args ); 495 } 496 497 template <> inline create_root(const KWargs & args)498 Speech *Document::create_root( const KWargs& args ){ 499 return setSpeechRoot( args ); 500 } 501 502 template <> inline create_root()503 Text *Document::create_root(){ 504 return setTextRoot(); 505 } 506 template <> inline create_root()507 Speech *Document::create_root(){ 508 return setSpeechRoot(); 509 } 510 511 std::ostream& operator<<( std::ostream& os, const Document *d ); 512 inline std::ostream& operator<<( std::ostream& os, const Document& d ){ 513 os << &d; 514 return os; 515 } 516 517 std::ostream& operator<<( std::ostream& os, const Document::at_t& at ); 518 519 void expand_version_string( const std::string&, 520 int&, int&, int&, std::string& ); 521 522 std::string library_version(); 523 std::string folia_version(); 524 525 } // namespace folia 526 527 #endif // FOLIA_DOCUMENT_H 528