1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of Ucto 7 8 Ucto is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 Ucto is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ucto/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 */ 26 27 #ifndef UCTO_TOKENIZE_H 28 #define UCTO_TOKENIZE_H 29 30 #include <vector> 31 #include <set> 32 #include <map> 33 #include <sstream> 34 #include <stdexcept> 35 #include "libfolia/folia.h" 36 #include "ticcutils/LogStream.h" 37 #include "ticcutils/Unicode.h" 38 #include "ucto/setting.h" 39 40 class TextCat; 41 42 namespace Tokenizer { 43 44 using namespace icu; 45 46 const std::string Version(); 47 const std::string VersionName(); 48 49 enum TokenRole { 50 NOROLE = 0, 51 NOSPACE = 1, 52 BEGINOFSENTENCE = 2, 53 ENDOFSENTENCE = 4, 54 NEWPARAGRAPH = 8, 55 BEGINQUOTE = 16, 56 ENDQUOTE = 32, 57 TEMPENDOFSENTENCE = 64, 58 LINEBREAK = 128 59 }; 60 61 std::ostream& operator<<( std::ostream&, const TokenRole& ); 62 63 // setter 64 inline TokenRole operator|( TokenRole T1, TokenRole T2 ){ 65 return (TokenRole)( (int)T1|(int)T2 ); 66 } 67 68 inline TokenRole& operator|= ( TokenRole& T1, TokenRole T2 ){ 69 T1 = (T1 | T2); 70 return T1; 71 } 72 73 // invert 74 inline TokenRole operator~( TokenRole T1 ){ 75 return (TokenRole)~(int)T1; 76 } 77 78 // union 79 inline TokenRole operator&( TokenRole T1, TokenRole T2 ){ 80 return (TokenRole)( (int)T1 & (int)T2 ); 81 } 82 83 inline TokenRole& operator&=( TokenRole& T1, TokenRole T2 ){ 84 T1 = (T1 & T2); 85 return T1; 86 } 87 88 class Token { 89 friend std::ostream& operator<< (std::ostream&, const Token& ); 90 public: 91 UnicodeString type; 92 UnicodeString us; 93 TokenRole role; 94 Token( const UnicodeString&, 95 const UnicodeString&, 96 TokenRole role = NOROLE, 97 const std::string& = "" ); 98 std::string lang_code; // ISO 639-3 language code 99 std::string texttostring(); 100 std::string typetostring(); 101 }; 102 103 class TokenizerClass{ 104 protected: 105 int linenum; 106 public: 107 TokenizerClass(); 108 ~TokenizerClass(); 109 bool init( const std::string&, 110 const std::string& ="" ); // init from a configfile 111 bool init( const std::vector<std::string>&, 112 const std::string& ="" ); // init 1 or more languages 113 bool reset( const std::string& = "default" ); 114 void setErrorLog( TiCC::LogStream *os ); 115 116 // Tokenize from input stream with text OR FoLiA to a FoLiA document 117 folia::Document *tokenize_folia( const std::string& ); 118 // Tokenize from input stream with text to a FoLiA document ( 119 folia::Document *tokenize( std::istream& ); 120 121 // Tokenize from input stream with text OR FoLiA to a FoLiA document and 122 // save it 123 void tokenize_folia( const std::string&, const std::string& ); 124 125 // Tokenize from an input text stream to a token vector 126 // (representing a sentence) 127 // non greedy. Stops after the first full sentence is returned. 128 // may be called multiple times until EOF 129 std::vector<Token> tokenizeOneSentence( std::istream& ); 130 131 // tokenize from file to file 132 void tokenize( const std::string&, const std::string& ); 133 134 //Tokenize from input stream to output stream 135 void tokenize( std::istream&, std::ostream& ); 136 137 // Tokenize a line (a line is NOT just a sentence, but an arbitrary string 138 // of characters, inclusive EOS markers, Newlines etc.) 139 // 140 // OR use popSentence() repeatedly to extract all sentences as vectors 141 // using getString() to extract the UTF8 value of that sentence 142 // OR getSentences() to get ALL sentences as UTF8 strings in a vector 143 void tokenizeLine( const UnicodeString&, const std::string& = "" ); 144 void tokenizeLine( const std::string&, const std::string& = "" ); 145 146 // extract 1 sentence from Token vector; 147 std::vector<Token> popSentence(); 148 149 // convert the sentence in a token vector to a UnicodeString 150 icu::UnicodeString getString( const std::vector<Token>& ); 151 // convert the sentence in a token vector to a string (UTF-8 encoded) 152 std::string getUTF8String( const std::vector<Token>& ); 153 154 // extract all sentences as a vector of UnicodeStrings 155 std::vector<icu::UnicodeString> getSentences(); 156 157 // extract all sentences as a vector of strings (UTF-8 encoded) 158 std::vector<std::string> getUTF8Sentences(); 159 160 //Enable verbose mode 161 bool setVerbose( bool b=true ) { bool t = verbose; verbose = b; return t; }; getVerbose()162 bool getVerbose() const { return verbose; } 163 164 //set debug value setDebug(int d)165 int setDebug( int d ) { int dd = tokDebug; tokDebug = d; return dd; }; getDebug()166 int getDebug() const { return tokDebug; } 167 168 // set the commandline used set_command(const std::string & c)169 void set_command( const std::string& c ){ _command = c; }; 170 171 //set textcat debug value 172 bool set_tc_debug( bool b ); 173 174 //Enable conversion of all output to lowercase 175 bool setLowercase( bool b=true ) { bool t = lowercase; lowercase = b; if (b) uppercase = false; return t; }; getLowercase()176 bool getLowercase() const { return lowercase; } 177 178 //Enable passtru mode 179 bool setPassThru( bool b=true ) { bool t = passthru; passthru = b; return t; }; getPassThru()180 bool getPassThru() const { return passthru; } 181 182 //Disable tag hints 183 bool setNoTags( bool b=true ) { bool t = ignore_tag_hints; 184 ignore_tag_hints = b; 185 return t; }; getNoTags()186 bool getNoTags() const { return ignore_tag_hints; } 187 188 //Enable conversion of all output to uppercase 189 bool setUppercase( bool b=true ) { bool t = uppercase; uppercase = b; if (b) lowercase = false; return t; }; getUppercase()190 bool getUppercase() const { return uppercase; } 191 192 //Enable sentence splitting only 193 bool setSentenceSplit( bool b=true ) { bool t = splitOnly; splitOnly = b; return t; } getSentenceSplit()194 bool getSentenceSplit() const { return splitOnly; } 195 196 //Enable paragraph detection 197 bool setParagraphDetection( bool b=true ) { bool t = detectPar; detectPar = b; return t; } getParagraphDetection()198 bool getParagraphDetection() const { return detectPar; } 199 200 //Enable quote detection 201 bool setQuoteDetection( bool b=true ) { bool t = detectQuotes; detectQuotes = b; return t; } getQuoteDetection()202 bool getQuoteDetection() const { return detectQuotes; } 203 204 //Enable language detection 205 bool setLangDetection( bool b=true ) { bool t = doDetectLang; doDetectLang = b; return t; } getLangDetection()206 bool getLangDetection() const { return doDetectLang; } 207 208 //Enable filtering 209 bool setFiltering( bool b=true ) { 210 bool t = doFilter; doFilter = b; return t; 211 } getFiltering()212 bool getFiltering() const { return doFilter; }; 213 214 //Enable word corrections (FoLiA only) 215 bool setWordCorrection( bool b=true ) { 216 bool t = doWordCorrection; doWordCorrection = b; return t; 217 } getWordCorrection()218 bool getWordCorrection() const { return doWordCorrection; }; 219 220 //Enable punctuation filtering 221 bool setPunctFilter( bool b=true ) { 222 bool t = doPunctFilter; doPunctFilter = b; return t; 223 } getPunctFilter()224 bool getPunctFilter() const { return doPunctFilter; }; 225 226 std::string setTextRedundancy( const std::string& ); 227 228 // set normalization mode setNormalization(const std::string & s)229 std::string setNormalization( const std::string& s ) { 230 return normalizer.setMode( s ); 231 } getNormalization()232 std::string getNormalization() const { return normalizer.getMode(); }; 233 234 // set input encoding 235 std::string setInputEncoding( const std::string& ); getInputEncoding()236 std::string getInputEncoding() const { return inputEncoding; }; 237 setLanguage(const std::string & l)238 void setLanguage( const std::string& l ){ default_language = l; }; getLanguage()239 std::string getLanguage() const { return default_language; }; 240 241 // set eos marker 242 UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark = TiCC::UnicodeFromUTF8(s); return t; }; getEosMarker()243 UnicodeString getEosMarker( ) const { return eosmark; } 244 245 bool setNormSet( const std::string& ); 246 247 bool setSentencePerLineOutput( bool b=true ) { bool t = sentenceperlineoutput; sentenceperlineoutput = b; return t; }; getSentencePerLineOutput()248 bool getSentencePerLineOutput() const { return sentenceperlineoutput; } 249 250 bool setSentencePerLineInput( bool b=true ) { bool t = sentenceperlineinput; sentenceperlineinput = b; return t; }; getSentencePerLineInput()251 bool getSentencePerLineInput() const { return sentenceperlineinput; } 252 setXMLOutput(bool b)253 bool setXMLOutput( bool b ) { 254 bool t = xmlout; xmlout = b; return t; } setXMLOutput(bool b,const std::string & id)255 bool setXMLOutput( bool b, const std::string& id ) { 256 setDocID( id ); return setXMLOutput(b); } getXMLOutput()257 bool getXMLOutput() const { return xmlout; } 258 setXMLInput(bool b)259 bool setXMLInput( bool b ) { bool t = xmlin; xmlin = b; return t; } getXMLInput()260 bool getXMLInput() const { return xmlin; } 261 262 getInputClass()263 const std::string getInputClass( ) const { return inputclass; } setInputClass(const std::string & cls)264 const std::string setInputClass( const std::string& cls) { 265 std::string res = inputclass; 266 inputclass = cls; 267 return res; 268 } getOutputClass()269 const std::string getOutputClass( ) const { return outputclass; } setOutputClass(const std::string & cls)270 const std::string setOutputClass( const std::string& cls) { 271 std::string res = outputclass; 272 outputclass = cls; 273 return res; 274 } 275 getDocID()276 std::string getDocID() const { return docid; } setDocID(const std::string & id)277 std::string setDocID( const std::string& id ) { 278 const std::string s = docid; docid = id; return s; } 279 280 bool get_setting_info( const std::string&, 281 std::string&, 282 std::string& ) const; 283 std::string get_data_version() const; 284 285 folia::processor *init_provenance( folia::Document *, 286 folia::processor * =0 ) const; 287 folia::processor *add_provenance_passthru( folia::Document *, 288 folia::processor * =0 ) const; 289 folia::processor *add_provenance_data( folia::Document *, 290 folia::processor * =0 ) const; 291 folia::processor *add_provenance_setting( folia::Document *, 292 folia::processor * =0 ) const; 293 folia::processor *add_provenance_structure( folia::Document *, 294 folia::processor * =0 ) const; 295 folia::processor *add_provenance_structure( folia::Document *, 296 const folia::AnnotationType, 297 folia::processor * =0 ) const; ucto_re_run()298 bool ucto_re_run() const { return already_tokenized; }; 299 std::vector<Token> correct_elements( folia::FoliaElement *, 300 const std::vector<folia::FoliaElement*>& ); 301 302 private: 303 304 TokenizerClass( const TokenizerClass& ); // inhibit copies 305 TokenizerClass& operator=( const TokenizerClass& ); // inhibit copies 306 307 void passthruLine( const UnicodeString&, bool& ); 308 void passthruLine( const std::string&, bool& ); 309 310 folia::Document *start_document( const std::string& ) const; 311 folia::FoliaElement *append_to_folia( folia::FoliaElement *root, 312 const std::vector<Token>& tv, 313 int& p_count ) const; 314 315 std::vector<folia::Word*> append_to_sentence( folia::Sentence *, 316 const std::vector<Token>& ) const; 317 void correct_element( folia::FoliaElement *, 318 const std::vector<Token>&, 319 const std::string& ) const; 320 321 void handle_one_sentence( folia::Sentence *, int& ); 322 void handle_one_paragraph( folia::Paragraph *, int& ); 323 void handle_one_text_parent( folia::FoliaElement *, int& ); 324 325 //Processes tokens and initialises the sentence buffer. Returns the amount of sentences found 326 int countSentences(bool forceentirebuffer = false); 327 //count the number of sentences (only after detectSentenceBounds) (does some extra validation as well) 328 int flushSentences( int, const std::string& = "default" ); 329 //Flush n sentences from buffer (does some extra validation as well) 330 331 icu::UnicodeString outputTokens( const std::vector<Token>&, 332 const bool=false ) const; 333 void add_rule( const UnicodeString&, 334 const std::vector<UnicodeString>& ); 335 void tokenizeWord( const UnicodeString&, 336 bool, 337 const std::string&, 338 const UnicodeString& ="" ); 339 int internal_tokenize_line( const UnicodeString&, 340 const std::string& ); 341 342 void tokenize_one_line( const UnicodeString&, 343 bool&, 344 const std::string& = "" ); 345 346 bool detectEos( size_t, const UnicodeString&, const Quoting& ) const; 347 void detectSentenceBounds( const int offset, 348 const std::string& = "default" ); 349 void detectQuotedSentenceBounds( const int offset, 350 const std::string& = "default" ); 351 void detectQuoteBounds( const int, 352 Quoting& ); 353 354 bool resolveQuote( int, const UnicodeString&, Quoting& ); 355 bool u_isquote( UChar32, 356 const Quoting& ) const; 357 std::string checkBOM( std::istream& ); 358 void outputTokensDoc_init( folia::Document& ) const; 359 360 TiCC::UnicodeNormalizer normalizer; 361 std::string inputEncoding; 362 363 UnicodeString eosmark; 364 std::vector<Token> tokens; 365 std::set<UnicodeString> norm_set; 366 TiCC::LogStream *theErrLog; 367 368 std::string default_language; 369 std::string document_language; // in case of an input FoLiA document 370 std::map<std::string,Setting*> settings; 371 std::string _command; // original commandline 372 //debug flag 373 int tokDebug; 374 375 //verbose tokenisation mode 376 bool verbose; 377 378 //detect quotes? 379 bool detectQuotes; 380 381 //filter special characters (default on) 382 bool doFilter; 383 384 //filter all punctuation characters (default off) 385 bool doPunctFilter; 386 387 //allow correction of FoLiA Word elements 388 bool doWordCorrection; 389 390 // only sentence spliiting? 391 bool splitOnly; 392 393 //detect paragraphs? 394 bool detectPar; 395 396 //has a paragraph been signaled? 397 bool paragraphsignal; 398 bool paragraphsignal_next; 399 400 //has do we attempt to assign languages? 401 bool doDetectLang; 402 403 //has do we percolate text up from <w> to <s> and <p> nodes? (FoLiA) 404 // values should be: 'full', 'minimal' or 'none' 405 std::string text_redundancy; 406 407 //one sentence per line output 408 bool sentenceperlineoutput; 409 bool sentenceperlineinput; 410 411 412 bool lowercase; 413 bool uppercase; 414 bool xmlout; 415 bool xmlin; 416 bool passthru; 417 bool ignore_tag_hints; 418 mutable folia::processor *ucto_processor; 419 mutable bool already_tokenized; // set when ucto is called again on tokenized FoLiA 420 std::string docid; //document ID (UTF-8), necessary for XML output 421 std::string inputclass; // class for folia text 422 std::string outputclass; // class for folia text 423 std::string data_version; // the version of uctodata 424 TextCat *text_cat; 425 folia::TextPolicy text_policy; 426 }; 427 428 template< typename T > stringTo(const std::string & str)429 T stringTo( const std::string& str ) { 430 T result; 431 std::stringstream dummy ( str ); 432 if ( !( dummy >> result ) ) { 433 throw( std::runtime_error( "conversion from '" + str + "' failed" ) ); 434 } 435 return result; 436 } 437 438 template< typename T > toString(const T val)439 std::string toString( const T val ) { 440 std::stringstream dummy; 441 if ( !( dummy << val ) ) { 442 throw( std::runtime_error( "conversion failed" ) ); 443 } 444 return dummy.str(); 445 } 446 447 // extract the language assigned to this vector, if any... 448 // will return "" if indetermined. 449 std::string get_language( const std::vector<Token>& ); 450 // set the language on a FoliaElement 451 void set_language( folia::FoliaElement*, const std::string& ); 452 } 453 #endif 454