1 /*
2   Copyright (c) 2006 - 2021
3   CLST  - Radboud University
4   ILK   - Tilburg University
5 
6   This file is part of libfolia
7 
8   libfolia is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   libfolia is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program; if not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ticcutils/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 */
26 
27 #ifndef TYPES_H
28 #define TYPES_H
29 #include <string>
30 #include "ticcutils/StringOps.h"
31 
32 namespace folia {
33 
34   //foliaspec:elementtype
35   enum ElementType : unsigned int { BASE=0,AbstractAnnotationLayer_t, AbstractContentAnnotation_t, AbstractCorrectionChild_t, AbstractHigherOrderAnnotation_t, AbstractInlineAnnotation_t, AbstractSpanAnnotation_t, AbstractSpanRole_t, AbstractStructureElement_t, AbstractSubtokenAnnotation_t, AbstractTextMarkup_t, ActorFeature_t, Alternative_t, AlternativeLayers_t, BegindatetimeFeature_t, Caption_t, Cell_t, Chunk_t, ChunkingLayer_t, Comment_t, Content_t, CoreferenceChain_t, CoreferenceLayer_t, CoreferenceLink_t, Correction_t, Cue_t, Current_t, Definition_t, DependenciesLayer_t, Dependency_t, DependencyDependent_t, Description_t, Division_t, DomainAnnotation_t, EnddatetimeFeature_t, EntitiesLayer_t, Entity_t, Entry_t, ErrorDetection_t, Event_t, Example_t, External_t, Feature_t, Figure_t, FontFeature_t, ForeignData_t, FunctionFeature_t, Gap_t, Head_t, HeadFeature_t, Headspan_t, Hiddenword_t, Hyphbreak_t, Label_t, LangAnnotation_t, LemmaAnnotation_t, LevelFeature_t, Linebreak_t, LinkReference_t, List_t, ListItem_t, Metric_t, ModalitiesLayer_t, Modality_t, ModalityFeature_t, Morpheme_t, MorphologyLayer_t, New_t, Note_t, Observation_t, ObservationLayer_t, Original_t, Paragraph_t, Part_t, PhonContent_t, Phoneme_t, PhonologyLayer_t, PolarityFeature_t, PosAnnotation_t, Predicate_t, Quote_t, Reference_t, Relation_t, Row_t, Scope_t, SemanticRole_t, SemanticRolesLayer_t, SenseAnnotation_t, Sentence_t, Sentiment_t, SentimentLayer_t, SizeFeature_t, Source_t, SpanRelation_t, SpanRelationLayer_t, Speech_t, Statement_t, StatementLayer_t, StatementRelation_t, StrengthFeature_t, String_t, StyleFeature_t, SubjectivityAnnotation_t, Suggestion_t, SynsetFeature_t, SyntacticUnit_t, SyntaxLayer_t, Table_t, TableHead_t, Target_t, Term_t, Text_t, TextContent_t, TextMarkupCorrection_t, TextMarkupError_t, TextMarkupGap_t, TextMarkupHSpace_t, TextMarkupLanguage_t, TextMarkupReference_t, TextMarkupString_t, TextMarkupStyle_t, TextMarkupWhitespace_t, TimeFeature_t, TimeSegment_t, TimingLayer_t, Utterance_t, ValueFeature_t, Whitespace_t, Word_t, WordReference_t, PlaceHolder_t, XmlComment_t, XmlText_t,  LastElement };
36 
37   inline ElementType& operator++( ElementType &et ){
38     return et = ( LastElement == et )
39       ? BASE
40       : ElementType(et+1);
41   }
42 
43   /** AnnotatorType is the Internal representation of the Annatator attribute
44    *
45    */
46   enum AnnotatorType: int {
47     UNDEFINED = 0,   ///< The value is (yet) unknown
48       AUTO = 1,      ///< The value is automaticly assigned
49       MANUAL = 2,    ///< The value is manually assigned
50       GENERATOR = 3, ///< The value is added by a generator
51       DATASOURCE = 4 ///< The value comes from an external datasource
52       };
53 
54   //foliaspec:attributes
55   //Defines all common FoLiA attributes (as part of the Attrib enumeration)
56   enum Attrib : int { NO_ATT=0, ///<No attribute
57 ID=1,  ///<xml:id: The ID of the element; this has to be a unique in the entire document or collection of documents (corpus). All identifiers in FoLiA are of the `XML NCName <https://www.w3.org/TR/1999/WD-xmlschema-2-19990924/#NCName>`_ datatype, which roughly means it is a unique string that has to start with a letter (not a number or symbol), may contain numbers, but may never contain colons or spaces. FoLiA does not define any naming convention for IDs.
58 CLASS=2,  ///<class: The class of the annotation, i.e. the annotation tag in the vocabulary defined by ``set``.
59 ANNOTATOR=4,  ///<annotator: This is an older alternative to the ``processor`` attribute, without support for full provenance. The annotator attribute simply refers to the name o ID of the system or human annotator that made the annotation.
60 CONFIDENCE=8,  ///<confidence: A floating point value between zero and one; expresses the confidence the annotator places in his annotation.
61 N=16,  ///<n: A number in a sequence, corresponding to a number in the original document, for example chapter numbers, section numbers, list item numbers. This this not have to be an actual number but other sequence identifiers are also possible (think alphanumeric characters or roman numerals).
62 DATETIME=32,  ///<datetime: The date and time when this annotation was recorded, the format is ``YYYY-MM-DDThh:mm:ss`` (note the literal T in the middle to separate date from time), as per the XSD Datetime data type.
63 BEGINTIME=64,  ///<begintime: A timestamp in ``HH:MM:SS.MMM`` format, indicating the begin time of the speech. If a sound clip is specified (``src``); the timestamp refers to a location in the soundclip.
64 ENDTIME=128,  ///<endtime: A timestamp in ``HH:MM:SS.MMM`` format, indicating the end time of the speech. If a sound clip is specified (``src``); the timestamp refers to a location in the soundclip.
65 SRC=256,  ///<src: Points to a file or full URL of a sound or video file. This attribute is inheritable.
66 SPEAKER=512,  ///<speaker: A string identifying the speaker. This attribute is inheritable. Multiple speakers are not allowed, simply do not specify a speaker on a certain level if you are unable to link the speech to a specific (single) speaker.
67 TEXTCLASS=1024,  ///<textclass: Refers to the text class this annotation is based on. This is an advanced attribute, if not specified, it defaults to ``current``. See :ref:`textclass_attribute`.
68 METADATA=2048,
69 IDREF=4096,  ///<id: A reference to the ID of another element. This is a reference and not an assignment, unlike xml:id, so do not confuse the two! It is only supported on certain elements that are referential in nature.
70 SPACE=8192,  ///<space: This attribute indicates whether spacing should be inserted after this element (it's default value is always ``yes``, so it does not need to be specified in that case), but if tokens or other structural elements are glued together then the value should be set to ``no``. This allows for reconstruction of the detokenised original text.
71 TAG=16384,  ///<tag: Contains a space separated list of processing tags associated with the element. A processing tag carries arbitrary user-defined information that may aid in processing a document. It may carry cues on how a specific tool should treat a specific element. The tag vocabulary is specific to the tool that processes the document. Tags carry no instrinsic meaning for the data representation and should not be used except to inform/aid processors in their task. Processors are encouraged to clean up the tags they use. Ideally, published FoLiA documents at the end of a processing pipeline carry no further tags. For encoding actual data, use ``class`` and optionally features instead.
72 ALL=32768 };
73 
74   inline Attrib& operator++( Attrib & a ){
75     return a = ( ALL == a )
76       ? NO_ATT
77       : ( NO_ATT == a ? ID : Attrib(a<<1) );
78   }
79 
80   inline Attrib operator|( Attrib a1, Attrib a2 ){
81     return (Attrib) ((int)a1|(int)a2) ;
82   }
83 
84   inline Attrib& operator|=( Attrib& a1, Attrib& a2 ){
85     a1 = (a1 | a2);
86     return a1;
87   }
88 
89   std::string toString( const Attrib );
90   std::ostream& operator<<( std::ostream&, const Attrib& );
91 
92 #undef DOMAIN // ugly hack but Clang defines DOMAIN in math.h
93 
94   /*
95    * Annotation types tie FoLiA elements to a particular kind of annotation.
96    * Especially declarations make use of this.
97    *  static const annotation_type = {AnnotationType}
98    */
99 
100   //foliaspec:annotationtype
101   //Defines all annotation types (as part of the AnnotationType enumeration)
102   enum AnnotationType : int { NO_ANN, ///<No type dummy
103     TEXT, ///<Text Annotation: Text annotation associates actual textual content with structural elements, without it a document would be textless. FoLiA treats it as an annotation like any other.
104     TOKEN, ///<Token Annotation: This annotation type introduces a tokenisation layer for the document. The terms **token** and **word** are used interchangeably in FoLiA as FoLiA itself does not commit to a specific tokenisation paradigm. Tokenisation is a prerequisite for the majority of linguistic annotation types offered by FoLiA and it is one of the most fundamental types of Structure Annotation. The words/tokens are typically embedded in other types of structure elements, such as sentences or paragraphs.
105     DIVISION, ///<Division Annotation: Structure annotation representing some kind of division, typically used for chapters, sections, subsections (up to the set definition). Divisions may be nested at will, and may include almost all kinds of other structure elements.
106     PARAGRAPH, ///<Paragraph Annotation: Represents a paragraph and holds further structure annotation such as sentences.
107     HEAD, ///<Head Annotation: The ``head`` element is used to provide a header or title for the structure element in which it is embedded, usually a division (``<div>``)
108     LIST, ///<List Annotation: Structure annotation for enumeration/itemisation, e.g. bulleted lists.
109     FIGURE, ///<Figure Annotation: Structure annotation for including pictures, optionally captioned, in documents.
110     WHITESPACE, ///<Vertical Whitespace: Structure annotation introducing vertical whitespace
111     LINEBREAK, ///<Linebreak: Structure annotation representing a single linebreak and with special facilities to denote pagebreaks.
112     SENTENCE, ///<Sentence Annotation: Structure annotation representing a sentence. Sentence detection is a common stage in NLP alongside tokenisation.
113     POS, ///<Part-of-Speech Annotation: Part-of-Speech Annotation, one of the most common types of linguistic annotation. Assigns a lexical class to words.
114     LEMMA, ///<Lemmatisation: Lemma Annotation, one of the most common types of linguistic annotation. Represents the canonical form of a word.
115     DOMAIN, ///<Domain/topic Annotation: Domain/topic Annotation. A form of inline annotation used to assign a certain domain or topic to a structure element.
116     SENSE, ///<Sense Annotation: Sense Annotation allows to assign a lexical semantic sense to a word.
117     SYNTAX, ///<Syntactic Annotation: Assign grammatical categories to spans of words. Syntactic units are nestable and allow representation of complete syntax trees that are usually the result of consistuency parsing.
118     CHUNKING, ///<Chunking: Assigns shallow grammatical categories to spans of words. Unlike syntax annotation, chunks are not nestable. They are often produced by a process called Shallow Parsing, or alternatively, chunking.
119     ENTITY, ///<Entity Annotation: Entity annotation is a broad and common category in FoLiA. It is used for specifying all kinds of multi-word expressions, including but not limited to named entities. The set definition used determines the vocabulary and therefore the precise nature of the entity annotation.
120     CORRECTION, ///<Correction Annotation: Corrections are one of the most complex annotation types in FoLiA. Corrections can be applied not just over text, but over any type of structure annotation, inline annotation or span annotation. Corrections explicitly preserve the original, and recursively so if corrections are done over other corrections.
121     ERRORDETECTION, ///<Error Detection: This annotation type is deprecated in favour of `Observation Annotation` and only exists for backward compatibility.
122     PHON, ///<Phonetic Annotation: This is the phonetic analogy to text content (``<t>``) and allows associating a phonetic transcription with any structural element, it is often used in a speech context. Note that for actual segmentation into phonemes, FoLiA has another related type: ``Phonological Annotation``
123     SUBJECTIVITY, ///<Subjectivity Annotation: This annotation type is deprecated in favour of `Sentiment Annotation` and only exists for backward compatibility.
124     MORPHOLOGICAL, ///<Morphological Annotation: Morphological Annotation allows splitting a word/token into morphemes, morphemes itself may be nested. It is embedded within a layer ``morphology`` which can be embedded within word/tokens.
125     EVENT, ///<Event Annotation: Structural annotation type representing events, often used in new media contexts for things such as tweets, chat messages and forum posts (as defined by a user-defined set definition). Note that a more linguistic kind of event annotation can be accomplished with `Entity Annotation` or even `Time Segmentation` rather than this one.
126     DEPENDENCY, ///<Dependency Annotation: Dependency relations are syntactic relations between spans of tokens. A dependency relation takes a particular class and consists of a single head component and a single dependent component.
127     TIMESEGMENT, ///<Time Segmentation: FoLiA supports time segmentation to allow for more fine-grained control of timing information by associating spans of words/tokens with exact timestamps. It can provide a more linguistic alternative to `Event Annotation`.
128     GAP, ///<Gap Annotation: Sometimes there are parts of a document you want to skip and not annotate at all, but include as is. This is where gap annotation comes in, the user-defined set may indicate the kind of gap. Common omissions in books are for example front-matter and back-matter, i.e. the cover.
129     QUOTE, ///<Quote Annotation: Structural annotation used to explicitly mark quoted speech, i.e. that what is reported to be said and appears in the text in some form of quotation marks.
130     NOTE, ///<Note Annotation: Structural annotation used for notes, such as footnotes or warnings or notice blocks.
131     REFERENCE, ///<Reference Annotation: Structural annotation for referring to other annotation types. Used e.g. for referring to bibliography entries (citations) and footnotes.
132     RELATION, ///<Relation Annotation: FoLiA provides a facility to relate arbitrary parts of your document with other parts of your document, or even with parts of other FoLiA documents or external resources, even in other formats. It thus allows linking resources together. Within this context, the ``xref`` element is used to refer to the linked FoLiA elements.
133     SPANRELATION, ///<Span Relation Annotation: Span relations are a stand-off extension of relation annotation that allows for more complex relations, such as word alignments that include many-to-one, one-to-many or many-to-many alignments. One of its uses is in the alignment of multiple translations of (parts) of a text.
134     COREFERENCE, ///<Coreference Annotation: Relations between words that refer to the same referent (anaphora) are expressed in FoLiA using Coreference Annotation. The co-reference relations are expressed by specifying the entire chain in which all links are coreferent.
135     SEMROLE, ///<Semantic Role Annotation: This span annotation type allows for the expression of semantic roles, or thematic roles. It is often used together with `Predicate Annotation`
136     METRIC, ///<Metric Annotation: Metric Annotation is a form of higher-order annotation that allows annotation of some kind of measurement. The type of measurement is defined by the class, which in turn is defined by the set as always. The metric element has a ``value`` attribute that stores the actual measurement, the value is often numeric but this needs not be the case.
137     LANG, ///<Language Annotation: Language Annotation simply identifies the language a part of the text is in. Though this information is often part of the metadata, this form is considered an actual annotation.
138     STRING, ///<String Annotation: This is a form of higher-order annotation for selecting an arbitrary substring of a text, even untokenised, and allows further forms of higher-order annotation on the substring. It is also tied to a form of text markup annotation.
139     TABLE, ///<Table Annotation: Structural annotation type for creating a simple tabular environment, i.e. a table with rows, columns and cells and an optional header.
140     STYLE, ///<Style Annotation: This is a text markup annotation type for applying styling to text. The actual styling is defined by the user-defined set definition and can for example included classes such as italics, bold, underline
141     PART, ///<Part Annotation: The structure element ``part`` is a fairly abstract structure element that should only be used when a more specific structure element is not available. Most notably, the part element should never be used for representation of morphemes or phonemes! Part can be used to divide a larger structure element, such as a division, or a paragraph into arbitrary subparts.
142     UTTERANCE, ///<Utterance Annotation: An utterance is a structure element that may consist of words or sentences, which in turn may contain words. The opposite is also true, a sentence may consist of multiple utterances. Utterances are often used in the absence of sentences in a speech context, where neat grammatical sentences can not always be distinguished.
143     ENTRY, ///<Entry Annotation: FoLiA has a set of structure elements that can be used to represent collections such as glossaries, dictionaries, thesauri, and wordnets. `Entry annotation` defines the entries in such collections, `Term annotation` defines the terms, and `Definition Annotation` provides the definitions.
144     TERM, ///<Term Annotation: FoLiA has a set of structure elements that can be used to represent collections such as glossaries, dictionaries, thesauri, and wordnets. `Entry annotation` defines the entries in such collections, `Term annotation` defines the terms, and `Definition Annotation` provides the definitions.
145     DEFINITION, ///<Definition Annotation: FoLiA has a set of structure elements that can be used to represent collections such as glossaries, dictionaries, thesauri, and wordnets. `Entry annotation` defines the entries in such collections, `Term annotation` defines the terms, and `Definition Annotation` provides the definitions.
146     EXAMPLE, ///<Example Annotation: FoLiA has a set of structure elements that can be used to represent collections such as glossaries, dictionaries, thesauri, and wordnets. `Examples annotation` defines examples in such collections.
147     PHONOLOGICAL, ///<Phonological Annotation: The smallest unit of annotatable speech in FoLiA is the phoneme level. The phoneme element is a form of structure annotation used for phonemes.  Alike to morphology, it is embedded within a layer ``phonology`` which can be embedded within word/tokens.
148     PREDICATE, ///<Predicate Annotation: Allows annotation of predicates, this annotation type is usually used together with Semantic Role Annotation. The types of predicates are defined by a user-defined set definition.
149     OBSERVATION, ///<Observation Annotation: Observation annotation is used to make an observation pertaining to one or more word tokens.  Observations offer a an external qualification on part of a text. The qualification is expressed by the class, in turn defined by a set. The precise semantics of the observation depends on the user-defined set.
150     SENTIMENT, ///<Sentiment Annotation: Sentiment analysis marks subjective information such as sentiments or attitudes expressed in text. The sentiments/attitudes are defined by a user-defined set definition.
151     STATEMENT, ///<Statement Annotation: Statement annotation, sometimes also refered to as attribution, allows to decompose statements into the source of the statement, the content of the statement, and the way these relate, provided these are made explicit in the text.
152     ALTERNATIVE, ///<Alternative Annotation: This form of higher-order annotation encapsulates alternative annotations, i.e. annotations that are posed as an alternative option rather than the authoratitive chosen annotation
153     RAWCONTENT, ///<Raw Content: This associates raw text content which can not carry any further annotation. It is used in the context of :ref:`gap_annotation`
154     COMMENT, ///<Comment Annotation: This is a form of higher-order annotation that allows you to associate comments with almost all other annotation elements
155     DESCRIPTION, ///<Description Annotation: This is a form of higher-order annotation that allows you to associate descriptions with almost all other annotation elements
156     HYPHENATION, ///<Hyphenation Annotation: This is a text-markup annotation form that indicates where in the original text a linebreak was inserted and a word was hyphenised.
157     HIDDENTOKEN, ///<Hidden Token Annotation: This annotation type allows for a hidden token layer in the document. Hidden tokens are ignored for most intents and purposes but may serve a purpose when annotations on implicit tokens is required, for example as targets for syntactic movement annotation.
158     MODALITY, ///<Modality Annotation: Modality annotation is used to describe the relationship between cue word(s) and the scope it covers. It is primarily used for the annotation of negation, but also for the annotation of factuality, certainty and truthfulness:.
159     EXTERNAL, ///<External Annotation: External annotation makes a reference to an external FoLiA document whose structure is inserted at the exact place the external element occurs.
160     HSPACE, ///<Horizontal Whitespace: Markup annotation introducing horizontal whitespace
161 LAST_ANN };
162 
163   inline AnnotationType& operator++( AnnotationType &at ){
164     return at = ( LAST_ANN == at )
165       ? NO_ANN
166       : AnnotationType(at+1);
167   }
168 
169   std::string toString( const AnnotationType& );
170   AnnotationType stringToAnnotationType( const std::string& );
171 
172   AnnotatorType stringToAnnotatorType( const std::string& );
173   std::string toString( const AnnotatorType& );
174 
175   std::string toString( const ElementType& );
176   ElementType stringToElementType( const std::string& );
177 
178   ElementType layertypeof( ElementType );
179 
180 } // namespace folia
181 
182 namespace TiCC {
183   // add specializations to the TiCC stringTo() and toString() family
184 
185   template<>
stringTo(const std::string & str)186     inline folia::AnnotationType stringTo( const std::string& str ) {
187     return folia::stringToAnnotationType( str );
188   }
189 
190   template<>
stringTo(const std::string & str)191     inline folia::ElementType stringTo( const std::string& str ) {
192     return folia::stringToElementType( str );
193   }
194 
195   template<>
stringTo(const std::string & str)196     inline folia::AnnotatorType stringTo( const std::string& str ) {
197     return folia::stringToAnnotatorType( str );
198   }
199 
200   inline std::ostream& operator<<( std::ostream& os,
201 				   const folia::ElementType& el ){
202     os << folia::toString( el );
203     return os;
204   }
205 
206   inline std::ostream& operator<<( std::ostream& os,
207 				   const folia::AnnotatorType& at ){
208     os << folia::toString(at);
209     return os;
210   }
211 
212   inline std::ostream& operator<<( std::ostream& os,
213 				   const folia::AnnotationType& at ){
214     os << folia::toString( at );
215     return os;
216   }
217 
218 } // namespace TiCC
219 
220 #endif
221