1 /*
2   Copyright (c) 2006 - 2021
3   CLST  - Radboud University
4   ILK   - Tilburg University
5 
6   This file is part of libfolia
7 
8   libfolia is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   libfolia is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program; if not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
      https://github.com/LanguageMachines/libfolia/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 
26 */
27 
28 #ifndef FOLIA_DOCUMENT_H
29 #define FOLIA_DOCUMENT_H
30 
#include <iostream>
#include <list>
#include <map>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>
#include "unicode/unistr.h"
#include "unicode/regex.h"
#include "libxml/tree.h"
#include "libxml/xpath.h"
#include "libfolia/folia.h"
42 
43 using namespace icu;
44 
45 namespace folia {
46 
47   extern const std::string NSFOLIA;
48 
49   enum ElementType : unsigned int;
50 
51   class Pattern {
52     friend std::ostream& operator<<( std::ostream&, const Pattern& );
53   public:
54     Pattern( const std::vector<std::string>&,
55 	     const ElementType = BASE,
56 	     const std::string& = "" );
57     Pattern( const std::vector<std::string>&,  const std::string& );
58 
59     ~Pattern();
60     bool match( const UnicodeString& , size_t&, int&, bool&, bool& ) const;
size()61     size_t size() const { return sequence.size(); };
62     void unsetwild();
63     bool variablesize() const;
64     std::set<int> variablewildcards() const;
65     ElementType matchannotation;
66     bool regexp;
67   private:
68     bool case_sensitive;
69     int maxgapsize;
70     std::vector<UnicodeString> sequence;
71     std::vector<RegexMatcher*> matchers;
72     std::string matchannotationset;
73   };
74 
75   class FoliaElement;
76   class Word;
77   class Sentence;
78   class Paragraph;
79   class processor;
80   class Provenance;
81 
82   class Document {
83     friend std::ostream& operator<<( std::ostream& os, const Document *d );
84     /// enum Mode determines runtime characteristic of the document
85     /*!
86       The default settings are CHECKTEXT and AUTODECLARE
87      */
88     enum Mode {
89       NOMODE=0,        //!< no special mode is set.
90       PERMISSIVE=1,    //!< be permissive for certain incompatablities
91       CHECKTEXT=2,     //!< check text consistency
92       FIXTEXT=4,       //!< try to fix text inconsistencies in the fly
93       STRIP=8,         //!< on output, strip
94       CANONICAL=16,    //!< sort ouput in a reproducable way.
95       AUTODECLARE=32,  //!< Automagicly add missing Annotation Declarations
96       EXPLICIT=64      //!< add all set information
97     };
98     friend class Engine;
99   public:
100     Document();
101     explicit Document( const KWargs& );
102     explicit Document( const std::string& );
103     ~Document();
104     void init();
105     void init_args( const KWargs& );
106     bool read_from_string( const std::string& );
readFromString(const std::string & s)107     bool readFromString( const std::string& s ){
108       /// backward compatability. read_from_string() is preferred
109       return read_from_string( s );
110     }
111     bool read_from_file( const std::string& );
readFromFile(const std::string & s)112     bool readFromFile( const std::string& s ){
113       /// backward compatability. read_from_file() is preferred
114       return read_from_file( s );
115     }
116     bool save( std::ostream&, const std::string&, bool = false ) const;
117     bool save( std::ostream& os, bool canonical = false ) const {
118       /// save a Document to a stream without using a namespace name
119       return save( os, "", canonical );
120     }
121     bool save( const std::string&, const std::string&, bool = false ) const ;
122     bool save( const std::string& s, bool canonical = false ) const {
123       /// save a Document to a file without using a namespace name
124       return save( s, "", canonical );
125     }
126     std::string xmlstring( bool = false ) const;
127 
doc()128     FoliaElement* doc() const {
129       /// return a pointer to the internal FoLiA tree
130       return foliadoc;
131     }
132 
133     template <typename T>
create_root(const KWargs &)134       T *create_root( const KWargs& ){
135       throw std::logic_error( "create_root() only possible for 'Text' and 'Speech'" );
136     }
137     template <typename T>
create_root()138       T *create_root(){
139       throw std::logic_error( "create_root() only possible for 'Text' and 'Speech'" );
140     }
141 
142     FoliaElement* append( FoliaElement *t );
143     Text* setTextRoot();
144     Text* setTextRoot( const KWargs& );
145     Speech* setSpeechRoot();
146     Speech* setSpeechRoot( const KWargs& );
147     FoliaElement *getRoot();
148     // backward compatible:
addText(KWargs & a)149     Text* addText( KWargs& a ){ return setTextRoot( a ); };
addText(Text * t)150     Text* addText( Text *t ){ return dynamic_cast<Text*>( append(t) ); };
addSpeech(KWargs & a)151     Speech* addSpeech( KWargs& a ){ return setSpeechRoot( a ); };
addSpeech(Speech * s)152     Speech* addSpeech( Speech *s ){ return dynamic_cast<Speech*>( append(s) ); };
153 
154     void set_foreign_metadata( xmlNode * );
155     void addStyle( const std::string&, const std::string& );
156     void replaceStyle( const std::string&, const std::string& );
157     UnicodeString text( const std::string& = "current",
158 			bool = false,
159 			bool = false ) const;
160     UnicodeString text( const TextPolicy& ) const;
161     std::vector<Paragraph*> paragraphs() const;
162     std::vector<Sentence*> sentences() const;
163     std::vector<Sentence*> sentenceParts() const;
164     std::vector<Word*> words() const;
165     std::vector<std::vector<Word*> > findwords( const Pattern&,
166 						const std::string& ="" ) const;
167     std::vector<std::vector<Word*> > findwords( std::list<Pattern>&,
168 						const std::string& = "" ) const;
169     Word *words( size_t ) const;
170     Word *rwords( size_t ) const;
171     Paragraph *paragraphs( size_t ) const;
172     Paragraph *rparagraphs( size_t ) const;
173     Sentence *sentences( size_t ) const;
174     Sentence *rsentences( size_t ) const;
175     std::string toXml( const std::string& ="" ) const;
176     bool toXml( const std::string&,
177 		const std::string& ) const;
178     std::string metadata_type() const;
179     std::string metadata_file() const;
180     std::string annotation_type_to_string( AnnotationType ) const;
181     void set_metadata( const std::string&, const std::string& );
182     const std::string get_metadata( const std::string&) const;
183     processor *get_default_processor() const;
184     processor *get_processor( const std::string& ) const;
185     std::vector<processor*> get_processors_by_name( const std::string& ) const;
186     void add_doc_index( FoliaElement * );
187     void del_doc_index( const std::string& );
188 
189     FoliaElement *index( const std::string& ) const; //retrieve element with specified ID
190     FoliaElement* operator []( const std::string& ) const ; //index as operator
191     bool declared( const AnnotationType&,
192 		   const std::string&,
193 		   const std::string&,
194 		   const AnnotatorType&,
195 		   const std::string& ) const;
196     bool declared( const AnnotationType&,
197 		   const std::string&,
198 		   const std::string&,
199 		   const AnnotatorType&,
200 		   const std::set<std::string>& ) const;
201     bool declared( const AnnotationType&,
202 		   const std::string& = "" ) const;
203     bool declared( ElementType, const std::string& = "" ) const;
204     std::string unalias( AnnotationType,
205 			 const std::string& ) const;
206     std::string alias( AnnotationType,
207 		       const std::string& ) const;
208 
209     processor *add_processor( const KWargs&, processor * =0 );
210     std::vector<std::string> get_annotators( AnnotationType,
211 					    const std::string& ="" ) const;
212     std::vector<const processor *> get_processors( AnnotationType,
213 						   const std::string& ="" ) const;
214 
215     std::string default_set( AnnotationType ) const;
216     std::string original_default_set( AnnotationType ) const;
217 
218     std::string default_annotator( AnnotationType,
219 				   const std::string& ="" ) const;
220     AnnotatorType default_annotatortype( AnnotationType,
221 					const std::string& ="" ) const;
222 
223     std::string default_datetime( AnnotationType,
224 				  const std::string& ="" ) const;
225     std::string default_processor( AnnotationType,
226 				   const std::string& ="" ) const;
227     std::string original_default_processor( AnnotationType ) const;
228 
229     FoliaElement* parseXml( );
230 
id()231     std::string id() const {
232       /// return the Document id value
233       return _id;
234     };
235     std::string language() const;
236     void auto_declare( AnnotationType,
237 		       const std::string& = "" );
238     void declare( AnnotationType,
239 		  const std::string&,
240 		  const std::string& = "" );
241     void declare( AnnotationType,
242 		  const std::string&,
243 		  const KWargs& );
244     void declare( AnnotationType,
245 		  const std::string&, const std::string&, const std::string&,
246 		  const std::string&, const std::string&,
247 		  const std::set<std::string>&,
248 		  const std::string& = "" );
249     void un_declare( AnnotationType,
250 		     const std::string& );
XmlDoc()251     const xmlDoc *XmlDoc() const {
252       /// return a pointer to the internal xmlDoc. handle with care.
253       return _xmldoc;
254     };
foliaNs()255     xmlNs *foliaNs() const {
256       /// return a pointer to the output namespace structure
257       return _foliaNsOut;
258     };
keepForDeletion(FoliaElement * p)259     void keepForDeletion( FoliaElement *p ) {
260       /// add FoliaElement \e p to the delSet
261       /*!
262 	\param p the FoliaElement to keep for later annihilation
263 	the delSet is kept until the destruction of the Document
264        */
265       delSet.insert( p );
266     };
addExternal(External * p)267     void addExternal( External *p ) {
268       /// add a node to the _externals list
269       /*!
270 	\param p The node to add
271       */
272       _externals.push_back( p );
273     };
274     void resolveExternals();
275     int debug; //!< the debug level. 0 means NO debugging.
276 
277     /// is the PERMISSIVE mode set?
permissive()278     bool permissive() const { return mode & PERMISSIVE; };
279     /// is the CHECKTEXT mode set?
checktext()280     bool checktext() const { return mode & CHECKTEXT; };
281     /// is the FIXTEXT mode set?
fixtext()282     bool fixtext() const { return mode & FIXTEXT; };
283     /// is the STRIP mode set?
strip()284     bool strip() const { return mode & STRIP; };
285     /// is the CANONICAL mode set?
canonical()286     bool canonical() const { return mode & CANONICAL; };
287     /// is the AUTODECLARE mode set?
autodeclare()288     bool autodeclare() const { return mode & AUTODECLARE; };
has_explicit()289     bool has_explicit() const { return mode & EXPLICIT; };
290     bool set_permissive( bool ) const; // defined const, but the mode is mutable!
291     bool set_checktext( bool ) const; // defined const, but the mode is mutable!
292     bool set_fixtext( bool ) const; // defined const, but the mode is mutable!
293     bool set_strip( bool ) const; // defined const, but the mode is mutable!
294     bool set_canonical( bool ) const; // defined const, but the mode is mutable!
295     bool set_autodeclare( bool ) const; // defined const, but the mode is mutable!
296     bool set_explicit( bool ) const; // defined const, but the mode is mutable!
297     /// this class holds annotation declaration information
298     class at_t {
299       friend std::ostream& operator<<( std::ostream& os, const at_t& at );
300     public:
at_t(const std::string & a,const AnnotatorType & t,const std::string & d,const std::string & f,const std::set<std::string> & p)301     at_t( const std::string& a,
302 	  const AnnotatorType& t,
303 	  const std::string& d,
304 	  const std::string& f,
305 	  const std::set<std::string>& p ):
306       _annotator(a),
307 	_ann_type(t),
308 	_date(d),
309 	_format(f),
310 	_processors(p){};
311       std::string _annotator;   ///< the annotator as a string
312       AnnotatorType _ann_type; ///< the annotator type
313       std::string _date;   ///< the timestamp as a string
314       std::string _format;   ///< the format
315       std::set<std::string> _processors; ///< the id's of all associated processors
316     };
317 
318     void incrRef( AnnotationType, const std::string& );
319     void decrRef( AnnotationType, const std::string& );
320     void setmode( const std::string& ) const;
321     std::string getmode() const;
setdebug(int val)322     int setdebug( int val ){
323       /// set the debug level
324       /*!
325 	\param val the new debug value
326 	\return the old debug value
327       */
328       int ret=debug; debug=val; return ret;
329     };
330     std::multimap<AnnotationType,std::string> unused_declarations( ) const;
get_submetadata(const std::string & m)331     const MetaData *get_submetadata( const std::string& m ){
332       /// get the metadata structure with value \e m
333       /*!
334 	\param m the value we search
335 	\return the found MetaData element, or 0
336        */
337       const auto& it = submetadata.find( m );
338       if ( it == submetadata.end() ){
339 	return 0;
340       }
341       else {
342 	return it->second;
343       }
344     }
cache_textcontent(TextContent * tc)345     void cache_textcontent( TextContent *tc ){
346       /// add a TextContent to the validation buffer
347       /*!
348 	\param tc the TextContent to add to the buffer
349 	on a call to validate_offsets() this buffer is used to validate
350 	all offsets.
351       */
352       t_offset_validation_buffer.push_back( tc );
353     }
cache_phoncontent(PhonContent * pc)354     void cache_phoncontent( PhonContent *pc ){
355       /// add a PhonContent to the validation buffer
356       /*!
357 	\param pc the PhonContent to add
358 	on a call to validate_offsets() this buffer is used to validate
359 	all offsets.
360       */
361       p_offset_validation_buffer.push_back( pc );
362     }
363     bool validate_offsets() const;
364     int compare_to_build_version() const;
version()365     const std::string& version() const {
366       /// return the version string
367       return _version_string;
368     };
369     std::string doc_version() const;
370     std::string update_version();
371     bool version_below( int, int ) const;
annotationdefaults()372     const std::map<AnnotationType,std::multimap<std::string,at_t>>& annotationdefaults() const { return _annotationdefaults; };
373     void parse_metadata( const xmlNode * );
374     void setDocumentProps( KWargs& );
provenance()375     Provenance *provenance() const {
376       /// return a pointer to the Provenance data
377       return _provenance;
378     };
filename()379     const std::string& filename() const {
380       /// return the filename the Document was created from
381       return _source_filename;
382     };
383     void save_orig_ann_defaults();
set_incremental(bool b)384     void set_incremental( bool b ) {
385       /// set/unset the incremental_parse flag
386       _incremental_parse = b;
387     };
is_incremental()388     bool is_incremental() const {
389       /// return the value of the incremental_parse flag
390       return _incremental_parse;
391     };
set_preserve_spaces(bool b)392     void set_preserve_spaces( bool b ) {
393       /// set/unset the preserve_spaces flag
394       _preserve_spaces = b;
395     };
preserve_spaces()396     bool preserve_spaces() const {
397       /// return the value of the preserve_spaces flag
398       return _preserve_spaces;
399     }
get_warn_count()400     int get_warn_count( ) const {
401       /// return the number of warnings
402       return _warn_count;
403     }
reset_warn_count()404     void reset_warn_count( ) {
405       /// reset the number of warnings to 0
406       _warn_count = 0;
407     }
increment_warn_count()408     void increment_warn_count() const {
409       /// increment the warning count
410       // NOTE: function is defined const, but the _warn_count is mutable
411       ++_warn_count;
412     }
413 
414   private:
415     void adjustTextMode();
416     std::map<AnnotationType,std::multimap<std::string,at_t> > _annotationdefaults;   ///< stores all declared annotations per AnnotationType
417     ///< every AnnotationType can have multiple annotations even with the same
418     ///< setnames. hence a multimap
419     std::map<AnnotationType,std::map<std::string,bool> > _groupannotations; ///<
420     ///< register which annotations are GROUP annotations
421     std::vector<std::pair<AnnotationType,std::string>> _anno_sort; ///<
422     ///< register the original sorting of the annotation declarations in the
423     ///< input, so we can use that for output in the same order. (cannonical
424     ///< mode
425     std::map<AnnotationType,std::map<std::string,int> > _annotationrefs; ///<
426     ///< register the number of references to this AnnotationType/setname
427     std::map<AnnotationType,std::map<std::string,std::string>> _alias_set; ///<
428     ///< register the mapping from aliases to setnames per AnnotationType
429     std::map<AnnotationType,std::map<std::string,std::string>> _set_alias; ///<
430     ///< register the mapping from setname to aliases per AnnotationType
431     std::map<AnnotationType,std::string> _orig_ann_default_sets; ///<
432     ///< for folia::Engine we need to register the original mapping from a
433     ///< AnnoationType to a setname, because in the process more mappings
434     ///< can be added, loosing the default.
435     std::map<AnnotationType,std::string> _orig_ann_default_procs;///<
436     ///< for folia::Engine we need to register the original mapping from a
437     ///< AnnoationType to a processor name, because in the process more mappings
438     ///< can be added, loosing the default.
439 
440     std::vector<TextContent*> t_offset_validation_buffer; ///< we register all
441     ///< TextContent nodes here to quickly access them for offset checks
442     ///< that check is performed directly after parsing
443     std::vector<PhonContent*> p_offset_validation_buffer; ///< we register all
444     ///< PhonContent nodes here to quickly access them for offset checks
445     ///< that check is performed directly after parsing
446     void parse_imdi( const xmlNode * );
447     void parse_annotations( const xmlNode * );
448     void parse_provenance( const xmlNode * );
449     void parse_submeta( const xmlNode * );
450     void parse_styles();
451     void add_annotations( xmlNode * ) const;
452     void add_provenance( xmlNode * ) const;
453     void add_metadata( xmlNode * ) const;
454     void add_submetadata( xmlNode *) const;
455     void add_styles( xmlDoc* ) const;
456     void append_processor( xmlNode *, const processor * ) const;
457     xmlDoc *to_xmlDoc( const std::string& ="" ) const;
458     void add_one_anno( const std::pair<AnnotationType,std::string>&,
459 		       xmlNode *,
460 		       std::set<std::string>& ) const;
461     std::map<std::string, FoliaElement* > sindex; ///< the lookup table
462     ///< for FoliaElements by index (xml:id) (not all nodes do have an index)
463     //    std::vector<FoliaElement*> data;
464     std::vector<External*> _externals;
465     std::string _id;
466     std::set<FoliaElement *> delSet;
467     FoliaElement *foliadoc;
468     xmlDoc *_xmldoc;
469     const xmlChar* _foliaNsIn_href;
470     const xmlChar* _foliaNsIn_prefix;
471     mutable xmlNs *_foliaNsOut;
472     Provenance *_provenance;
473     MetaData *_metadata;
474     ForeignMetaData *_foreign_metadata;
475     std::map<std::string,MetaData *> submetadata;
476     std::multimap<std::string,std::string> styles;
477     mutable Mode mode;
478     std::string _source_filename;
479     std::string _version_string;
480     int _major_version;
481     int _minor_version;
482     int _sub_version;
483     std::string _patch_version;
484     bool _external_document;
485     bool _incremental_parse;
486     bool _preserve_spaces;
487     mutable int _warn_count;
488     Document( const Document& ); // inhibit copies
489     Document& operator=( const Document& ); // inhibit copies
490   };
491 
492   template <> inline
create_root(const KWargs & args)493     Text *Document::create_root( const KWargs& args ){
494     return setTextRoot( args );
495   }
496 
497   template <> inline
create_root(const KWargs & args)498     Speech *Document::create_root( const KWargs& args ){
499     return setSpeechRoot( args );
500   }
501 
502   template <> inline
create_root()503     Text *Document::create_root(){
504     return setTextRoot();
505   }
506   template <> inline
create_root()507     Speech *Document::create_root(){
508     return setSpeechRoot();
509   }
510 
511   std::ostream& operator<<( std::ostream& os, const Document *d );
512   inline std::ostream& operator<<( std::ostream& os, const Document& d ){
513     os << &d;
514     return os;
515   }
516 
517   std::ostream& operator<<( std::ostream& os, const Document::at_t& at );
518 
519   void expand_version_string( const std::string&,
520 			      int&, int&, int&, std::string& );
521 
522   std::string library_version();
523   std::string folia_version();
524 
525 } // namespace folia
526 
527 #endif // FOLIA_DOCUMENT_H
528