1 /*
2   Copyright (c) 2006 - 2021
3   CLST - Radboud University
4   ILK  - Tilburg University
5 
6   This file is part of Ucto
7 
8   Ucto is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   Ucto is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ucto/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 */
26 
27 #ifndef UCTO_TOKENIZE_H
28 #define UCTO_TOKENIZE_H
29 
30 #include <vector>
31 #include <set>
32 #include <map>
33 #include <sstream>
34 #include <stdexcept>
35 #include "libfolia/folia.h"
36 #include "ticcutils/LogStream.h"
37 #include "ticcutils/Unicode.h"
38 #include "ucto/setting.h"
39 
40 class TextCat;
41 
42 namespace Tokenizer {
43 
44   using namespace icu;
45 
  // Version information for this library, returned by value as strings.
  // NOTE(review): the exact contents/format of both strings are determined by
  // the definitions in the implementation file — confirm there.
  const std::string Version();
  const std::string VersionName();
48 
49   enum TokenRole {
50     NOROLE                      = 0,
51     NOSPACE                     = 1,
52     BEGINOFSENTENCE             = 2,
53     ENDOFSENTENCE               = 4,
54     NEWPARAGRAPH                = 8,
55     BEGINQUOTE                  = 16,
56     ENDQUOTE                    = 32,
57     TEMPENDOFSENTENCE           = 64,
58     LINEBREAK                   = 128
59   };
60 
61   std::ostream& operator<<( std::ostream&, const TokenRole& );
62 
63   // setter
64   inline TokenRole operator|( TokenRole T1, TokenRole T2 ){
65     return (TokenRole)( (int)T1|(int)T2 );
66   }
67 
68   inline TokenRole& operator|= ( TokenRole& T1, TokenRole T2 ){
69     T1 = (T1 | T2);
70     return T1;
71   }
72 
73   // invert
74   inline TokenRole operator~( TokenRole T1 ){
75     return (TokenRole)~(int)T1;
76   }
77 
78   // union
79   inline TokenRole operator&( TokenRole T1, TokenRole T2 ){
80     return (TokenRole)( (int)T1 & (int)T2 );
81   }
82 
83   inline TokenRole& operator&=( TokenRole& T1, TokenRole T2 ){
84     T1 = (T1 & T2);
85     return T1;
86   }
87 
  // A single token: two UnicodeString fields (type and us), a TokenRole
  // bit set and a language code. All data members are public.
  class Token {
    // stream output for a Token (defined outside this header)
    friend std::ostream& operator<< (std::ostream&, const Token& );
  public:
    UnicodeString type;   // presumably the type/class label of the token — confirm
    UnicodeString us;     // presumably the token text itself — confirm
    TokenRole role;       // combination of TokenRole flags
    // Construct a token from two UnicodeStrings, an optional role (default
    // NOROLE) and an optional std::string (default "").
    // NOTE(review): the unnamed parameters presumably map to type, us and
    // lang_code in that order — confirm against the definition.
    Token( const UnicodeString&,
	   const UnicodeString&,
	   TokenRole role = NOROLE,
	   const std::string& = "" );
    std::string lang_code;                // ISO 639-3 language code
    // NOTE(review): presumably return 'us' resp. 'type' converted to a
    // std::string (UTF-8?) — confirm against the definitions.
    std::string texttostring();
    std::string typetostring();
  };
102 
103   class TokenizerClass{
104   protected:
105     int linenum;
106   public:
107     TokenizerClass();
108     ~TokenizerClass();
109     bool init( const std::string&,
110 	       const std::string& ="" ); // init from a configfile
111     bool init( const std::vector<std::string>&,
112 	       const std::string& ="" ); // init 1 or more languages
113     bool reset( const std::string& = "default" );
114     void setErrorLog( TiCC::LogStream *os );
115 
116     // Tokenize from input stream with text OR FoLiA to a FoLiA document
117     folia::Document *tokenize_folia( const std::string& );
118     // Tokenize from input stream with text to a FoLiA document (
119     folia::Document *tokenize( std::istream& );
120 
121     // Tokenize from input stream with text OR FoLiA to a FoLiA document and
122     //   save it
123     void tokenize_folia( const std::string&, const std::string&  );
124 
125     // Tokenize from an input text stream to a token vector
126     // (representing a sentence)
127     // non greedy. Stops after the first full sentence is returned.
128     // may be called multiple times until EOF
129     std::vector<Token> tokenizeOneSentence( std::istream& );
130 
131     // tokenize from file to file
132     void tokenize( const std::string&, const std::string& );
133 
134     //Tokenize from input stream to output stream
135     void tokenize( std::istream&, std::ostream& );
136 
137     // Tokenize a line (a line is NOT just a sentence, but an arbitrary string
138     //                  of characters, inclusive EOS markers, Newlines etc.)
139     //
140     // OR use popSentence() repeatedly to extract all sentences as vectors
141     //    using getString() to extract the UTF8 value of that sentence
142     // OR getSentences() to get ALL sentences as UTF8 strings in a vector
143     void tokenizeLine( const UnicodeString&, const std::string& = "" );
144     void tokenizeLine( const std::string&, const std::string& = "" );
145 
146     // extract 1 sentence from Token vector;
147     std::vector<Token> popSentence();
148 
149     // convert the sentence in a token vector to a UnicodeString
150     icu::UnicodeString getString( const std::vector<Token>& );
151     // convert the sentence in a token vector to a string (UTF-8 encoded)
152     std::string getUTF8String( const std::vector<Token>& );
153 
154     // extract all sentences as a vector of UnicodeStrings
155     std::vector<icu::UnicodeString> getSentences();
156 
157     // extract all sentences as a vector of strings (UTF-8 encoded)
158     std::vector<std::string> getUTF8Sentences();
159 
160     //Enable verbose mode
161     bool setVerbose( bool b=true ) { bool t = verbose; verbose = b; return t; };
getVerbose()162     bool getVerbose() const { return verbose; }
163 
164     //set debug value
setDebug(int d)165     int setDebug( int d ) { int dd = tokDebug; tokDebug = d; return dd; };
getDebug()166     int getDebug() const { return tokDebug; }
167 
168     // set the commandline used
set_command(const std::string & c)169     void set_command( const std::string& c ){ _command =  c; };
170 
171     //set textcat debug value
172     bool set_tc_debug( bool b );
173 
174     //Enable conversion of all output to lowercase
175     bool setLowercase( bool b=true ) { bool t = lowercase; lowercase = b; if (b) uppercase = false; return t; };
getLowercase()176     bool getLowercase() const { return lowercase; }
177 
178     //Enable passtru mode
179     bool setPassThru( bool b=true ) { bool t = passthru; passthru = b; return t; };
getPassThru()180     bool getPassThru() const { return passthru; }
181 
182     //Disable tag hints
183     bool setNoTags( bool b=true ) { bool t = ignore_tag_hints;
184       ignore_tag_hints = b;
185       return t; };
getNoTags()186     bool getNoTags() const { return ignore_tag_hints; }
187 
188     //Enable conversion of all output to uppercase
189     bool setUppercase( bool b=true ) { bool t = uppercase; uppercase = b; if (b) lowercase = false; return t; };
getUppercase()190     bool getUppercase() const { return uppercase; }
191 
192     //Enable sentence splitting only
193     bool setSentenceSplit( bool b=true ) { bool t = splitOnly; splitOnly = b; return t; }
getSentenceSplit()194     bool getSentenceSplit() const { return splitOnly; }
195 
196     //Enable paragraph detection
197     bool setParagraphDetection( bool b=true ) { bool t = detectPar; detectPar = b; return t; }
getParagraphDetection()198     bool getParagraphDetection() const { return detectPar; }
199 
200     //Enable quote detection
201     bool setQuoteDetection( bool b=true ) { bool t = detectQuotes; detectQuotes = b; return t; }
getQuoteDetection()202     bool getQuoteDetection() const { return detectQuotes; }
203 
204     //Enable language detection
205     bool setLangDetection( bool b=true ) { bool t = doDetectLang; doDetectLang = b; return t; }
getLangDetection()206     bool getLangDetection() const { return doDetectLang; }
207 
208     //Enable filtering
209     bool setFiltering( bool b=true ) {
210       bool t = doFilter; doFilter = b; return t;
211     }
getFiltering()212     bool getFiltering() const { return doFilter; };
213 
214     //Enable word corrections (FoLiA only)
215     bool setWordCorrection( bool b=true ) {
216       bool t = doWordCorrection; doWordCorrection = b; return t;
217     }
getWordCorrection()218     bool getWordCorrection() const { return doWordCorrection; };
219 
220     //Enable punctuation filtering
221     bool setPunctFilter( bool b=true ) {
222       bool t = doPunctFilter; doPunctFilter = b; return t;
223     }
getPunctFilter()224     bool getPunctFilter() const { return doPunctFilter; };
225 
226     std::string setTextRedundancy( const std::string& );
227 
228     // set normalization mode
setNormalization(const std::string & s)229     std::string setNormalization( const std::string& s ) {
230       return normalizer.setMode( s );
231     }
getNormalization()232     std::string getNormalization() const { return normalizer.getMode(); };
233 
234     // set input encoding
235     std::string setInputEncoding( const std::string& );
getInputEncoding()236     std::string getInputEncoding() const { return inputEncoding; };
237 
setLanguage(const std::string & l)238     void setLanguage( const std::string& l ){ default_language = l; };
getLanguage()239     std::string getLanguage() const { return default_language; };
240 
241     // set eos marker
242     UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark = TiCC::UnicodeFromUTF8(s); return t; };
getEosMarker()243     UnicodeString getEosMarker( ) const { return eosmark; }
244 
245     bool setNormSet( const std::string& );
246 
247     bool setSentencePerLineOutput( bool b=true ) { bool t = sentenceperlineoutput; sentenceperlineoutput = b; return t; };
getSentencePerLineOutput()248     bool getSentencePerLineOutput() const { return sentenceperlineoutput; }
249 
250     bool setSentencePerLineInput( bool b=true ) { bool t = sentenceperlineinput; sentenceperlineinput = b; return t; };
getSentencePerLineInput()251     bool getSentencePerLineInput() const { return sentenceperlineinput; }
252 
setXMLOutput(bool b)253     bool setXMLOutput( bool b ) {
254       bool t = xmlout; xmlout = b; return t; }
setXMLOutput(bool b,const std::string & id)255     bool setXMLOutput( bool b, const std::string& id ) {
256       setDocID( id ); return setXMLOutput(b); }
getXMLOutput()257     bool getXMLOutput() const { return xmlout; }
258 
setXMLInput(bool b)259     bool setXMLInput( bool b ) { bool t = xmlin; xmlin = b; return t; }
getXMLInput()260     bool getXMLInput() const { return xmlin; }
261 
262 
getInputClass()263     const std::string getInputClass( ) const { return inputclass; }
setInputClass(const std::string & cls)264     const std::string setInputClass( const std::string& cls) {
265       std::string res = inputclass;
266       inputclass = cls;
267       return res;
268     }
getOutputClass()269     const std::string getOutputClass( ) const { return outputclass; }
setOutputClass(const std::string & cls)270     const std::string setOutputClass( const std::string& cls) {
271       std::string res = outputclass;
272       outputclass = cls;
273       return res;
274     }
275 
getDocID()276     std::string getDocID() const { return docid; }
setDocID(const std::string & id)277     std::string setDocID( const std::string& id ) {
278       const std::string s = docid; docid = id; return s; }
279 
280     bool get_setting_info( const std::string&,
281 			   std::string&,
282 			   std::string& ) const;
283     std::string get_data_version() const;
284 
285     folia::processor *init_provenance( folia::Document *,
286 				       folia::processor * =0 ) const;
287     folia::processor *add_provenance_passthru( folia::Document *,
288 					       folia::processor * =0 ) const;
289     folia::processor *add_provenance_data( folia::Document *,
290 					   folia::processor * =0 ) const;
291     folia::processor *add_provenance_setting( folia::Document *,
292 					      folia::processor * =0 ) const;
293     folia::processor *add_provenance_structure( folia::Document *,
294 						folia::processor * =0 ) const;
295     folia::processor *add_provenance_structure( folia::Document *,
296 						const folia::AnnotationType,
297 						folia::processor * =0 ) const;
ucto_re_run()298     bool ucto_re_run() const { return already_tokenized; };
299     std::vector<Token> correct_elements( folia::FoliaElement *,
300 					 const std::vector<folia::FoliaElement*>& );
301 
302   private:
303 
304     TokenizerClass( const TokenizerClass& ); // inhibit copies
305     TokenizerClass& operator=( const TokenizerClass& ); // inhibit copies
306 
307     void passthruLine( const UnicodeString&, bool& );
308     void passthruLine( const std::string&, bool& );
309 
310     folia::Document *start_document( const std::string& ) const;
311     folia::FoliaElement *append_to_folia( folia::FoliaElement *root,
312 					  const std::vector<Token>& tv,
313 					  int& p_count ) const;
314 
315     std::vector<folia::Word*> append_to_sentence( folia::Sentence *,
316 						  const std::vector<Token>& ) const;
317     void correct_element( folia::FoliaElement *,
318 			  const std::vector<Token>&,
319 			  const std::string& ) const;
320 
321     void handle_one_sentence( folia::Sentence *, int& );
322     void handle_one_paragraph( folia::Paragraph *, int& );
323     void handle_one_text_parent( folia::FoliaElement *, int& );
324 
325     //Processes tokens and initialises the sentence buffer. Returns the amount of sentences found
326     int countSentences(bool forceentirebuffer = false);
327     //count the number of sentences (only after detectSentenceBounds) (does some extra validation as well)
328     int flushSentences( int, const std::string& = "default" );
329     //Flush n sentences from buffer (does some extra validation as well)
330 
331     icu::UnicodeString outputTokens( const std::vector<Token>&,
332 				     const bool=false ) const;
333     void add_rule( const UnicodeString&,
334 		   const std::vector<UnicodeString>& );
335     void tokenizeWord( const UnicodeString&,
336 		       bool,
337 		       const std::string&,
338 		       const UnicodeString& ="" );
339     int internal_tokenize_line( const UnicodeString&,
340 				const std::string& );
341 
342     void tokenize_one_line( const UnicodeString&,
343 			    bool&,
344 			    const std::string& = "" );
345 
346     bool detectEos( size_t, const UnicodeString&, const Quoting& ) const;
347     void detectSentenceBounds( const int offset,
348 			       const std::string& = "default" );
349     void detectQuotedSentenceBounds( const int offset,
350 				     const std::string& = "default" );
351     void detectQuoteBounds( const int,
352 			    Quoting& );
353 
354     bool resolveQuote( int, const UnicodeString&, Quoting& );
355     bool u_isquote( UChar32,
356 		    const Quoting& ) const;
357     std::string checkBOM( std::istream& );
358     void outputTokensDoc_init( folia::Document& ) const;
359 
360     TiCC::UnicodeNormalizer normalizer;
361     std::string inputEncoding;
362 
363     UnicodeString eosmark;
364     std::vector<Token> tokens;
365     std::set<UnicodeString> norm_set;
366     TiCC::LogStream *theErrLog;
367 
368     std::string default_language;
369     std::string document_language; // in case of an input FoLiA document
370     std::map<std::string,Setting*> settings;
371     std::string _command; // original commandline
372     //debug flag
373     int tokDebug;
374 
375     //verbose tokenisation mode
376     bool verbose;
377 
378     //detect quotes?
379     bool detectQuotes;
380 
381     //filter special characters (default on)
382     bool doFilter;
383 
384     //filter all punctuation characters (default off)
385     bool doPunctFilter;
386 
387     //allow correction of FoLiA Word elements
388     bool doWordCorrection;
389 
390     // only sentence spliiting?
391     bool splitOnly;
392 
393     //detect paragraphs?
394     bool detectPar;
395 
396     //has a paragraph been signaled?
397     bool paragraphsignal;
398     bool paragraphsignal_next;
399 
400     //has do we attempt to assign languages?
401     bool doDetectLang;
402 
403     //has do we percolate text up from <w> to <s> and <p> nodes? (FoLiA)
404     // values should be: 'full', 'minimal' or 'none'
405     std::string text_redundancy;
406 
407     //one sentence per line output
408     bool sentenceperlineoutput;
409     bool sentenceperlineinput;
410 
411 
412     bool lowercase;
413     bool uppercase;
414     bool xmlout;
415     bool xmlin;
416     bool passthru;
417     bool ignore_tag_hints;
418     mutable folia::processor *ucto_processor;
419     mutable bool already_tokenized; // set when ucto is called again on tokenized FoLiA
420     std::string docid; //document ID (UTF-8), necessary for XML output
421     std::string inputclass; // class for folia text
422     std::string outputclass; // class for folia text
423     std::string data_version; // the version of uctodata
424     TextCat *text_cat;
425     folia::TextPolicy text_policy;
426   };
427 
428   template< typename T >
stringTo(const std::string & str)429     T stringTo( const std::string& str ) {
430     T result;
431     std::stringstream dummy ( str );
432     if ( !( dummy >> result ) ) {
433       throw( std::runtime_error( "conversion from '" + str + "' failed" ) );
434     }
435     return result;
436   }
437 
438   template< typename T >
toString(const T val)439     std::string toString( const T val ) {
440     std::stringstream dummy;
441     if ( !( dummy << val ) ) {
442       throw( std::runtime_error( "conversion failed" ) );
443     }
444     return dummy.str();
445   }
446 
  // Extract the language assigned to this token vector, if any.
  // Returns "" when the language is undetermined.
  std::string get_language( const std::vector<Token>& );
  // Set the language (second argument) on a FoliaElement.
  void set_language( folia::FoliaElement*, const std::string& );
452 }
453 #endif
454