1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of Ucto 7 8 Ucto is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 Ucto is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ucto/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 26 */ 27 28 #include "ucto/tokenize.h" 29 30 #include <cassert> 31 #include <unistd.h> 32 #include <iostream> 33 #include <fstream> 34 #include <vector> 35 #include "config.h" 36 #include "unicode/schriter.h" 37 #include "unicode/ucnv.h" 38 #include "ticcutils/StringOps.h" 39 #include "ticcutils/PrettyPrint.h" 40 #include "ticcutils/Unicode.h" 41 #include "ticcutils/Timer.h" 42 #include "ucto/my_textcat.h" 43 44 #define DO_READLINE 45 #ifdef HAVE_LIBREADLINE 46 # if defined(HAVE_READLINE_READLINE_H) 47 # include <readline/readline.h> 48 # elif defined(HAVE_READLINE_H) 49 # include <readline.h> 50 # else 51 # undef DO_READLINE 52 # endif /* !defined(HAVE_READLINE_H) */ 53 #else 54 # undef DO_READLINE 55 #endif /* HAVE_LIBREADLINE */ 56 57 #ifdef HAVE_READLINE_HISTORY 58 # if defined(HAVE_READLINE_HISTORY_H) 59 # include <readline/history.h> 60 # elif defined(HAVE_HISTORY_H) 61 # include <history.h> 62 # endif /* defined(HAVE_READLINE_HISTORY_H) */ 63 #endif /* HAVE_READLINE_HISTORY */ 64 65 using namespace std; 66 67 #define LOG *TiCC::Log(theErrLog) 68 69 namespace Tokenizer { 70 71 
using namespace icu; 72 using TiCC::operator<<; 73 74 const string ISO_SET = "http://raw.github.com/proycon/folia/master/setdefinitions/iso639_3.foliaset.ttl"; 75 76 const string UCTO_SET_PREFIX = "https://raw.githubusercontent.com/LanguageMachines/uctodata/master/setdefinitions/"; 77 Version()78 const std::string Version() { return VERSION; } VersionName()79 const std::string VersionName() { return PACKAGE_STRING; } 80 81 class uRangeError: public std::out_of_range { 82 public: uRangeError(const string & s)83 explicit uRangeError( const string& s ): out_of_range( "ucto: out of range:" + s ){}; 84 }; 85 86 class uLogicError: public std::logic_error { 87 public: uLogicError(const string & s)88 explicit uLogicError( const string& s ): logic_error( "ucto: logic error:" + s ){}; 89 }; 90 91 class uCodingError: public std::runtime_error { 92 public: uCodingError(const string & s)93 explicit uCodingError( const string& s ): runtime_error( "ucto: coding problem:" + s ){}; 94 }; 95 96 convert(const string & line,const string & inputEncoding)97 UnicodeString convert( const string& line, 98 const string& inputEncoding ){ 99 UnicodeString result; 100 if ( !line.empty() ){ 101 try { 102 result = UnicodeString( line.c_str(), 103 line.length(), 104 inputEncoding.c_str() ); 105 } 106 catch ( exception &e) { 107 throw uCodingError( "Unexpected character found in input. 
" + 108 string(e.what()) + "Make sure input is valid: " + 109 inputEncoding ); 110 } 111 if ( result.isBogus() ){ 112 throw uCodingError( "string decoding failed: (invalid inputEncoding '" 113 + inputEncoding + "' ?)" ); 114 } 115 } 116 return result; 117 } 118 119 const UnicodeString type_space = "SPACE"; 120 const UnicodeString type_currency = "CURRENCY"; 121 const UnicodeString type_emoticon = "EMOTICON"; 122 const UnicodeString type_picto = "PICTOGRAM"; 123 const UnicodeString type_word = "WORD"; 124 const UnicodeString type_symbol = "SYMBOL"; 125 const UnicodeString type_punctuation = "PUNCTUATION"; 126 const UnicodeString type_number = "NUMBER"; 127 const UnicodeString type_unknown = "UNKNOWN"; 128 Token(const UnicodeString & _type,const UnicodeString & _s,TokenRole _role,const string & _lang_code)129 Token::Token( const UnicodeString& _type, 130 const UnicodeString& _s, 131 TokenRole _role, const string& _lang_code ): 132 type(_type), us(_s), role(_role), lang_code(_lang_code) { 133 } 134 135 texttostring()136 std::string Token::texttostring() { return TiCC::UnicodeToUTF8(us); } typetostring()137 std::string Token::typetostring() { return TiCC::UnicodeToUTF8(type); } 138 operator <<(std::ostream & os,const Token & t)139 ostream& operator<< (std::ostream& os, const Token& t ){ 140 os << t.type << " : " << t.role << ":" << t.us << " (" << t.lang_code << ")"; 141 return os; 142 } 143 toUString(const TokenRole & tok)144 UnicodeString toUString( const TokenRole& tok ){ 145 UnicodeString result; 146 if ( tok & NOSPACE){ 147 result += "NOSPACE "; 148 } 149 if ( tok & BEGINOFSENTENCE) { 150 result += "BEGINOFSENTENCE "; 151 } 152 if ( tok & ENDOFSENTENCE) { 153 result += "ENDOFSENTENCE "; 154 } 155 if ( tok & NEWPARAGRAPH) { 156 result += "NEWPARAGRAPH "; 157 } 158 if ( tok & BEGINQUOTE) { 159 result += "BEGINQUOTE "; 160 } 161 if ( tok & ENDQUOTE) { 162 result += "ENDQUOTE "; 163 } 164 return result; 165 } 166 operator <<(ostream & os,const TokenRole & tok)167 
  /// stream a TokenRole as its readable flag names
  ostream& operator<<( ostream& os, const TokenRole& tok ){
    os << toUString( tok );
    return os;
  }

  /// default constructor: sets all options to their defaults, creates the
  /// error-log stream and (when compiled in) the TextCat language guesser.
  TokenizerClass::TokenizerClass():
    linenum(0),
    inputEncoding( "UTF-8" ),
    eosmark("<utt>"),
    tokDebug(0),
    verbose(false),
    detectQuotes(false),
    doFilter(true),
    doPunctFilter(false),
    doWordCorrection(true),
    splitOnly( false ),
    detectPar(true),
    paragraphsignal(true),
    paragraphsignal_next(false),
    doDetectLang(false),
    text_redundancy("minimal"),
    sentenceperlineoutput(false),
    sentenceperlineinput(false),
    lowercase(false),
    uppercase(false),
    xmlout(false),
    xmlin(false),
    passthru(false),
    ignore_tag_hints(false),
    ucto_processor(0),
    already_tokenized(false),
    inputclass("current"),
    outputclass("current"),
    text_cat( 0 )
  {
    theErrLog = new TiCC::LogStream(cerr, "ucto" );
    theErrLog->setstamp( StampMessage );
#ifdef HAVE_TEXTCAT
    // language guessing is configured from the installed textcat.cfg
    string textcat_cfg = string(SYSCONF_PATH) + "/ucto/textcat.cfg";
    text_cat = new TextCat( textcat_cfg, theErrLog );
    //	text_cat->set_debug( true );
    LOG << " textcat configured from: " << textcat_cfg << endl;
    // ifstream is( textcat_cfg );
    // string line;
    // while ( getline( is, line ) ){
    //   LOG << line << endl;
    //   vector<string> v = TiCC::split( line );
    //   if ( v.size()==2 && v[1] == "nld" ){
    //     LOG << "voor nederlands: " << endl;
    //     ifstream is2( v[0] );
    //     string line2;
    //     while ( getline( is2, line2 ) ){
    //       LOG << line2 << endl;
    //       break;
    //     }
    //     LOG << " done with nederlands" << endl;
    //   }
    // }
#else
    // without TEXTCAT, text_cat stays 0; callers must check before use
    LOG << "NO TEXTCAT SUPPORT!"
<< endl; 227 #endif 228 } 229 ~TokenizerClass()230 TokenizerClass::~TokenizerClass(){ 231 Setting *d = 0; 232 for ( const auto& s : settings ){ 233 if ( s.first == "default" ){ 234 // the 'default' may also return as a real 'language' 235 // avoid deleting it twice 236 d = s.second; 237 delete d; 238 } 239 if ( s.second != d ){ 240 delete s.second; 241 } 242 243 } 244 delete theErrLog; 245 delete text_cat; 246 } 247 reset(const string & lang)248 bool TokenizerClass::reset( const string& lang ){ 249 ucto_processor = 0; 250 already_tokenized = false; 251 tokens.clear(); 252 if ( settings.find("lang") != settings.end() ){ 253 settings[lang]->quotes.clearStack(); 254 } 255 return true; 256 } 257 setNormSet(const std::string & values)258 bool TokenizerClass::setNormSet( const std::string& values ){ 259 vector<string> parts = TiCC::split_at( values, "," ); 260 for ( const auto& val : parts ){ 261 norm_set.insert( TiCC::UnicodeFromUTF8( val ) ); 262 } 263 return true; 264 } 265 setErrorLog(TiCC::LogStream * os)266 void TokenizerClass::setErrorLog( TiCC::LogStream *os ) { 267 if ( theErrLog != os ){ 268 text_cat->set_debug_stream( os ); 269 delete theErrLog; 270 } 271 theErrLog = os; 272 } 273 setInputEncoding(const std::string & enc)274 string TokenizerClass::setInputEncoding( const std::string& enc ){ 275 string old = inputEncoding; 276 inputEncoding = enc; 277 return old; 278 } 279 setTextRedundancy(const std::string & tr)280 string TokenizerClass::setTextRedundancy( const std::string& tr ){ 281 if ( tr == "none" || tr == "minimal" || tr == "full" ){ 282 string s = text_redundancy; 283 text_redundancy = tr; 284 return s; 285 } 286 else { 287 throw runtime_error( "illegal value '" + tr + "' for textredundancy. " 288 "expected 'full', 'minimal' or 'none'." 
);
    }
  }

  /// switch debugging in the TextCat language guesser on/off
  /// \throws logic_error when no TextCat object is available
  bool TokenizerClass::set_tc_debug( bool b ){
    if ( !text_cat ){
      throw logic_error( "attempt to set debug on uninitialized TextClass object" );
    }
    else {
      return text_cat->set_debug( b );
    }
  }

  /// clean up artifacts of UTF-16 input read line-by-line
  /*!
    \param input_line one raw input line
    \param encoding the assumed input encoding
    \return the line with a trailing CR removed and stray 0-bytes stripped
  */
  string fixup_UTF16( const string& input_line, const string& encoding ){
    string line = input_line;
    // some hackery to handle exotic input. UTF-16 but also CR at end.
    string::size_type pos = line.rfind( '\r' );
    if ( pos != string::npos ){
      line.erase( pos );
    }
    if ( line.size() > 0 && line[0] == 0 ){
      // when processing UTF16LE, '0' bytes show up at pos 0
      // we discard them, not for UTF16BE!
      // this works on Linux with GCC (atm)
      if ( encoding != "UTF16BE" ){
	line.erase(0,1);
      }
    }
    if ( line.size() > 0 && encoding == "UTF16BE" &&
	 line.back() == 0 ){
      // when processing UTF16BE, '0' bytes show up at the end
      // we discard them.
      // this works on Linux with GCC (atm)
      line.erase(line.size()-1);
    }
    return line;
  }

  /// get or create the 'ucto' processor in the FoLiA provenance data
  /*!
    \param doc the FoLiA Document to work on
    \param parent an optional parent processor to attach to
    \return the (cached) ucto processor; sets already_tokenized when the
    document was tokenized by ucto before
  */
  folia::processor *TokenizerClass::init_provenance( folia::Document *doc,
						     folia::processor *parent ) const {
    if ( ucto_processor ){
      // already created
      if ( tokDebug > 0 ){
	LOG << "use already created processor: " << ucto_processor->id() << endl;
      }
      return ucto_processor;
    }
    vector<folia::processor *> procs = doc->get_processors_by_name( "ucto" );
    if ( !procs.empty() ){
      if ( procs.size() > 1 ){
	LOG << "ucto is very confused about '" << doc->filename() << "'\n"
	    << "Multiple 'ucto' processors have already been run?" << endl;
	exit( EXIT_FAILURE );
      }
      // ucto has been used once before, we can't do it completely over again!
      LOG << "Difficult to tokenize '" << doc->filename()
	  << "' again, already processed by ucto before!"
<< endl;
      LOG << " The document will be copied as-is to the output file" << endl;
      already_tokenized = true;
      return procs[0];
    }
    else {
      // first ucto run on this document: register a fresh processor
      folia::KWargs args;
      args["name"] = "ucto";
      args["generate_id"] = "auto()";
      args["version"] = PACKAGE_VERSION;
      args["command"] = _command;
      args["begindatetime"] = "now()";
      if ( parent ){
	ucto_processor = doc->add_processor( args, parent );
      }
      else {
	args["generator"] = "yes";
	ucto_processor = doc->add_processor( args );
	ucto_processor->get_system_defaults();
      }
      if ( tokDebug > 0 ){
	LOG << "created a new processor: " << ucto_processor->id() << endl;
      }
      return ucto_processor;
    }
  }

  /// register provenance for passthru mode: declare token annotation in the
  /// pseudo set "passthru"
  folia::processor *TokenizerClass::add_provenance_passthru( folia::Document *doc,
							     folia::processor *parent ) const {
    folia::processor *proc = init_provenance( doc, parent );
    if ( proc ){
      folia::KWargs args;
      args["processor"] = proc->id();
      doc->declare( folia::AnnotationType::TOKEN, "passthru", args );
    }
    return proc;
  }

  /// register the 'uctodata' datasource processor below the ucto processor
  /*!
    \return the datasource processor, or the plain ucto processor when this
    is a re-run, or 0 when no processor could be created
  */
  folia::processor *TokenizerClass::add_provenance_data( folia::Document *doc,
							 folia::processor* parent ) const {
    folia::processor *proc = init_provenance( doc, parent );
    if ( proc ){
      if ( !ucto_re_run() ){
	// NOTE(review): "ucto.1.1" looks like the conventional id of the
	// uctodata sub-processor — confirm against the provenance layout
	string id = "ucto.1.1";
	folia::processor *data_proc = doc->get_processor( id );
	if ( !data_proc ){
	  folia::KWargs args;
	  args["name"] = "uctodata";
	  args["generate_id"] = "auto()";
	  args["type"] = "datasource";
	  args["version"] = data_version;
	  data_proc = doc->add_processor( args, proc );
	}
	return data_proc;
      }
      else {
	return proc;
      }
    }
    else {
      return 0;
    }
  }
const408 folia::processor *TokenizerClass::add_provenance_structure( folia::Document *doc, 409 const folia::AnnotationType type, 410 folia::processor *parent ) const { 411 folia::processor *proc = init_provenance( doc, parent ); 412 if ( proc && !ucto_re_run() ){ 413 if ( !doc->declared( type ) ){ 414 // we can declare it 415 folia::KWargs args; 416 args["processor"] = proc->id(); 417 doc->declare( type, "None", args ); 418 if ( tokDebug > 3 ){ 419 LOG << "added " << folia::toString(type) << "-annotation for: '" 420 << proc->id() << endl; 421 } 422 } 423 else { 424 string proc_id = doc->default_processor(type); 425 if ( !proc_id.empty() ){ 426 proc = doc->get_processor(proc_id); 427 if ( tokDebug ){ 428 LOG << "REUSE " << folia::toString(type) << "-annotation for: '" 429 << proc->id() << "' with set=" << doc->default_set(type) << endl; 430 } 431 } 432 else { 433 proc = 0; 434 if ( tokDebug ){ 435 LOG << "REUSE " << folia::toString(type) << "-annotation" 436 << " with set=" << doc->default_set(type) << endl; 437 } 438 } 439 } 440 } 441 return proc; 442 } 443 add_provenance_structure(folia::Document * doc,folia::processor * parent) const444 folia::processor *TokenizerClass::add_provenance_structure( folia::Document *doc, 445 folia::processor *parent ) const { 446 folia::processor *res = 0; 447 add_provenance_structure( doc, 448 folia::AnnotationType::PARAGRAPH, parent ); 449 add_provenance_structure( doc, 450 folia::AnnotationType::SENTENCE, 451 parent ); 452 res = add_provenance_structure( doc, 453 folia::AnnotationType::QUOTE, 454 parent ); 455 return res; 456 } 457 add_provenance_setting(folia::Document * doc,folia::processor * parent) const458 folia::processor *TokenizerClass::add_provenance_setting( folia::Document *doc, 459 folia::processor *parent ) const { 460 folia::processor *proc = init_provenance( doc, parent ); 461 if ( proc && !ucto_re_run() ){ 462 folia::processor *data_proc = add_provenance_data( doc, parent ); 463 if ( doc->metadata_type() == 
"native" ){ 464 doc->set_metadata( "language", default_language ); 465 } 466 for ( const auto& s : settings ){ 467 if ( tokDebug > 3 ){ 468 LOG << "language: " << s.first << endl; 469 } 470 if ( s.first == "default" ){ 471 continue; 472 } 473 folia::KWargs args; 474 args["name"] = s.second->set_file; 475 args["generate_id"] = "next()"; 476 args["type"] = "datasource"; 477 args["version"] = s.second->version; 478 doc->add_processor( args, data_proc ); 479 args.clear(); 480 args["processor"] = proc->id(); 481 string alias = "tokconfig-" + s.first; 482 string ucto_set = UCTO_SET_PREFIX + alias + ".foliaset.ttl"; 483 args["alias"] = alias; 484 if ( doc->declared( folia::AnnotationType::TOKEN, alias ) ){ 485 // we assume that an old-style declaration is present 486 doc->un_declare( folia::AnnotationType::TOKEN, alias ); 487 } 488 doc->declare( folia::AnnotationType::TOKEN, ucto_set, args ); 489 if ( tokDebug > 3 ){ 490 LOG << "added processor and token-annotation for: '" 491 << alias << "'" << endl; 492 } 493 } 494 return data_proc; 495 } 496 else { 497 return 0; 498 } 499 } 500 start_document(const string & id) const501 folia::Document *TokenizerClass::start_document( const string& id ) const { 502 folia::Document *doc = new folia::Document( "xml:id='" + id + "'" ); 503 doc->addStyle( "text/xsl", "folia.xsl" ); 504 if ( tokDebug > 3 ){ 505 LOG << "start document!!!" 
<< endl;
    }
    if ( passthru ){
      add_provenance_passthru( doc );
    }
    else {
      add_provenance_setting( doc );
    }
    folia::KWargs args;
    args["xml:id"] = doc->id() + ".text";
    doc->create_root<folia::Text>( args );
    return doc;
  }

  /// tokenize one line of input into the internal token buffer
  /*!
    \param input_line the line to tokenize
    \param bos in/out: are we at the beginning of a sentence? (passthru only)
    \param lang the language to use; when empty, optionally guessed by TextCat
  */
  void TokenizerClass::tokenize_one_line( const UnicodeString& input_line,
					  bool& bos,
					  const string& lang ){
    if ( passthru ){
      passthruLine( input_line, bos );
    }
    else {
      string language = lang;
      if ( language.empty() ){
	if ( tokDebug > 3 ){
	  LOG << "should we guess the language? "
	      << (text_cat && doDetectLang) << endl;
	}
	if ( text_cat && doDetectLang ){
	  // guess on a lowercased copy with the eos marker stripped
	  UnicodeString temp = input_line;
	  temp.findAndReplace( eosmark, "" );
	  temp.toLower();
	  if ( tokDebug > 3 ){
	    LOG << "use textCat to guess language from: "
		<< temp << endl;
	  }
	  language = text_cat->get_language( TiCC::UnicodeToUTF8(temp) );
	  if ( settings.find( language ) != settings.end() ){
	    if ( tokDebug > 3 ){
	      LOG << "found a supported language: " << language << endl;
	    }
	  }
	  else {
	    if ( tokDebug > 3 ){
	      LOG << "found an unsupported language: " << language << endl;
	    }
	    // fall back to the default setting
	    language = "default";
	  }
	}
      }
      internal_tokenize_line( input_line, language );
    }
  }

  /// read from \e IN until at least one full sentence is available and
  /// return it; returns an empty vector at end of input
  vector<Token> TokenizerClass::tokenizeOneSentence( istream& IN ){
    if (tokDebug > 0) {
      LOG << "[tokenizeOneSentence()] before countSent " << endl;
    }
    int numS = countSentences(); //count full sentences in token buffer
    if ( numS > 0 ) { // still some sentences in the buffer
      if (tokDebug > 0) {
	LOG << "[tokenizeOneSentence] " << numS
	    << " sentence(s) in buffer, processing..." << endl;
      }
      return popSentence( );
    }
    if (tokDebug > 0) {
      LOG << "[tokenizeOneSentence] NO sentences in buffer, searching.."
<< endl;
    }
    bool done = false;
    bool bos = true;
    inputEncoding = checkBOM( IN );
    string line;
    do {
      done = !getline( IN, line );
      UnicodeString input_line;
      if ( !done ){
	++linenum;
	if (tokDebug > 0) {
	  LOG << "[tokenize] Read input line " << linenum
	      << "-: '" << TiCC::format_nonascii( line ) << "'" << endl;
	}
	// strip CR / stray UTF-16 zero bytes before decoding
	string tmp_line = fixup_UTF16( line, inputEncoding );
	if ( tokDebug > 0
	     && tmp_line != line ){
	  LOG << "After fixup, input_line= '"
	      << TiCC::format_nonascii( tmp_line ) << "'" << endl;
	}
	input_line = convert( tmp_line, inputEncoding );
	if ( sentenceperlineinput ){
	  // every input line is a sentence: force an eos marker
	  input_line += " " + eosmark;
	}
      }
      if (tokDebug > 0) {
	LOG << "[tokenizeOneSentence] before next countSentences " << endl;
      }
      if ( done || input_line.isEmpty() ){
	//Signal the tokenizer that a paragraph is detected
	paragraphsignal = true;
	numS = countSentences(true); //count full sentences in token buffer,
	// setting explicit END_OF_SENTENCE
      }
      else {
	tokenize_one_line( input_line, bos );
	numS = countSentences(); //count full sentences in token buffer
      }
      if ( numS > 0 ) {
	// 1 or more sentences in the buffer.
	// extract the first 1
	if (tokDebug > 0) {
	  LOG << "[tokenizeOneSentence] " << numS << " sentence(s) in buffer, processing first one..." << endl;
	}
	return popSentence();
      }
      else {
	if (tokDebug > 0) {
	  LOG << "[tokenizeOneSentence] No sentence yet, reading on..." << endl;
	}
      }
    } while (!done);
    vector<Token> result;
    return result;
  }

  /// set the text of \e root in class \e outputclass from its children
  /// (no-op when text is already present, or for Linebreak nodes)
  void appendText( folia::FoliaElement *root,
		   const string& outputclass ){
    // set the textcontent of root to that of it's children
    if ( !root ){
      throw logic_error( "appendText() on empty root" );
    }
    if ( root->hastext( outputclass ) ){
      // there is already text, bail out.
636 return; 637 } 638 if ( root->isSubClass( folia::Linebreak_t ) ){ 639 // exception 640 return; 641 } 642 UnicodeString utxt = root->text( outputclass ); 643 // so get Untokenized text from the children, and set it 644 root->settext( TiCC::UnicodeToUTF8(utxt), outputclass ); 645 } 646 removeText(folia::FoliaElement * root,const string & outputclass)647 void removeText( folia::FoliaElement *root, 648 const string& outputclass ){ 649 if ( !root ){ 650 throw logic_error( "removeText() on empty root" ); 651 } 652 // remove the textcontent in outputclass of root 653 root->clear_textcontent( outputclass ); 654 } 655 tokenize(istream & IN)656 folia::Document *TokenizerClass::tokenize( istream& IN ) { 657 inputEncoding = checkBOM( IN ); 658 folia::Document *doc = start_document( docid ); 659 folia::FoliaElement *root = doc->doc()->index(0); 660 int parCount = 0; 661 vector<Token> buffer; 662 do { 663 if ( tokDebug > 0 ){ 664 LOG << "[tokenize] looping on stream" << endl; 665 } 666 vector<Token> v = tokenizeOneSentence( IN ); 667 if ( !v.empty() ){ 668 if ( tokDebug > 1 ){ 669 LOG << "[tokenize] sentence=" << v << endl; 670 } 671 root = append_to_folia( root, v, parCount ); 672 } 673 } 674 while ( IN ); 675 if ( tokDebug > 0 ){ 676 LOG << "[tokenize] end of stream reached" << endl; 677 } 678 if (!buffer.empty()){ 679 if ( tokDebug > 1 ){ 680 LOG << "[tokenize] remainder=" << buffer << endl; 681 } 682 append_to_folia( root, buffer, parCount); 683 } 684 // make sure to set the text on the last root created 685 if ( text_redundancy == "full" ){ 686 appendText( root, outputclass ); 687 } 688 else if ( text_redundancy == "none" ){ 689 removeText( root, outputclass ); 690 } 691 return doc; 692 } 693 tokenize(const string & ifile,const string & ofile)694 void TokenizerClass::tokenize( const string& ifile, const string& ofile ){ 695 ostream *OUT = NULL; 696 if ( ofile.empty() ) 697 OUT = &cout; 698 else { 699 OUT = new ofstream( ofile ); 700 } 701 702 istream *IN = NULL; 703 if ( 
xmlin ){ 704 folia::Document *doc = tokenize_folia( ifile ); 705 *OUT << *doc; 706 OUT->flush(); 707 delete doc; 708 } 709 else { 710 if ( ifile.empty() ) 711 IN = &cin; 712 else { 713 IN = new ifstream( ifile ); 714 if ( !IN || !IN->good() ){ 715 cerr << "ucto: problems opening inputfile " << ifile << endl; 716 cerr << "ucto: Courageously refusing to start..." << endl; 717 throw runtime_error( "unable to find or read file: '" + ifile + "'" ); 718 } 719 } 720 this->tokenize( *IN, *OUT ); 721 } 722 if ( IN != &cin ) delete IN; 723 if ( OUT != &cout ) delete OUT; 724 } 725 tokenize(istream & IN,ostream & OUT)726 void TokenizerClass::tokenize( istream& IN, ostream& OUT) { 727 if (xmlout) { 728 folia::Document *doc = tokenize( IN ); 729 OUT << doc; 730 OUT.flush(); 731 delete doc; 732 } 733 #ifdef DO_READLINE 734 else if ( &IN == &cin && isatty(0) ){ 735 // interactive use on a terminal (quite a hack..) 736 const char *prompt = "ucto> "; 737 string line; 738 int i = 0; 739 while ( true ){ 740 string data; 741 char *input = readline( prompt ); 742 if ( !input ){ 743 break; 744 } 745 line = input; 746 sentenceperlineinput = true; 747 if ( line.empty() ){ 748 free( input ); 749 continue; 750 } 751 else { 752 add_history( input ); 753 free( input ); 754 data += line + " "; 755 } 756 if ( !data.empty() ){ 757 tokenizeLine( data ); 758 // extract sentence from Token vector until done 759 vector<Token> v = popSentence(); 760 while( !v.empty() ){ 761 UnicodeString res = outputTokens( v , (i>0) ); 762 OUT << res; 763 ++i; 764 v = popSentence(); 765 } 766 OUT << endl; 767 } 768 } 769 } 770 #endif 771 else { 772 int i = 0; 773 inputEncoding = checkBOM( IN ); 774 do { 775 if ( tokDebug > 0 ){ 776 LOG << "[tokenize] looping on stream" << endl; 777 } 778 vector<Token> v = tokenizeOneSentence( IN ); 779 while( !v.empty() ){ 780 UnicodeString res = outputTokens( v , (i>0) ); 781 OUT << res; 782 ++i; 783 v = tokenizeOneSentence( IN ); 784 } 785 } while ( IN ); 786 if ( tokDebug > 0 ){ 
	LOG << "[tokenize] end_of_stream" << endl;
      }
      OUT << endl;
    }
  }

  /// attach a LangAnnotation with class \e lang to \e node
  /*!
    If a LangAnnotation set is already declared we silently keep using that
    set; otherwise the ISO_SET is declared first.
  */
  void set_language( folia::FoliaElement* node, const string& lang ){
    // set the language on this @node to @lang
    string lang_set = node->doc()->default_set( folia::AnnotationType::LANG );
    if ( lang_set.empty() ){
      lang_set = ISO_SET;
      folia::KWargs args;
      // NOTE(review): "ucto.1" is assumed to be the main ucto processor id
      args["processor"] = "ucto.1";
      node->doc()->declare( folia::AnnotationType::LANG,
			    ISO_SET,
			    args );
    }
    folia::KWargs args;
    args["class"] = lang;
    args["set"] = lang_set;
    folia::LangAnnotation *la = new folia::LangAnnotation( args, node->doc() );
    node->replace( la );
  }

  /// walk up the tree and return the id of the nearest ancestor (or self)
  /// that has one; "" when none is found
  string get_parent_id( folia::FoliaElement *el ){
    if ( !el ){
      return "";
    }
    else if ( !el->id().empty() ){
      return el->id();
    }
    else {
      return get_parent_id( el->parent() );
    }
  }

  /// append the tokens of one sentence to FoLiA Sentence \e sent as Word
  /// elements, creating nested Quote/Sentence structure as dictated by the
  /// token roles \return the created Words
  vector<folia::Word*> TokenizerClass::append_to_sentence( folia::Sentence *sent,
							   const vector<Token>& toks ) const {
    vector<folia::Word*> result;
    folia::Document *doc = sent->doc();
    string tok_set;
    if ( passthru ){
      tok_set = "passthru";
    }
    else {
      string tc_lc = get_language( toks );
      if ( tc_lc != "default" ){
	tok_set = "tokconfig-" + tc_lc;
	set_language( sent, tc_lc );
      }
      else {
	tok_set = "tokconfig-" + default_language;
      }
    }
    folia::FoliaElement *root = sent;
    if ( tokDebug > 5 ){
      LOG << "add_words\n" << toks << endl;
    }
    for ( size_t i=0; i < toks.size(); ++i ){
      const auto& tok = toks[i];
      if ( tokDebug > 5 ){
	LOG << "add_result\n" << tok << endl;
      }
      if ( tok.role & BEGINQUOTE ){
	// open a Quote element; tokens that follow nest inside it
	if (tokDebug > 5 ) {
	  LOG << "[add_words] Creating quote element" << endl;
	}
	folia::processor *proc = add_provenance_structure( doc,
							   folia::AnnotationType::QUOTE );
	folia::KWargs args;
	string id = get_parent_id(root);
	if ( !id.empty() ){
	  args["generate_id"] = id;
	}
	if ( proc ){
	  args["processor"] = proc->id();
	}
	args["set"] = doc->default_set( folia::AnnotationType::QUOTE );
	folia::FoliaElement *q = new folia::Quote( args, doc );
	root->append( q );
	// might need a new Sentence
	if ( i+1 < toks.size()
	     && toks[i+1].role & BEGINOFSENTENCE ){
	  // the quote immediately starts a new sentence: create it now
	  folia::processor *proc2 = add_provenance_structure( doc,
							      folia::AnnotationType::SENTENCE );
	  folia::KWargs args2;
	  string pid = get_parent_id(root);
	  if ( !pid.empty() ){
	    args2["generate_id"] = pid;
	  }
	  if ( proc2 ){
	    args2["processor"] = proc2->id();
	  }
	  args2["set"] = doc->default_set( folia::AnnotationType::SENTENCE );
	  folia::Sentence *ns = new folia::Sentence( args2, doc );
	  q->append( ns );
	  root = ns;
	}
	else {
	  root = q;
	}
      }
      else if ( (tok.role & BEGINOFSENTENCE)
		&& root != sent
		&& root->element_id() == folia::Sentence_t ){
	// Ok, another Sentence in a quote
	if ( i > 0 && !(toks[i-1].role & BEGINQUOTE) ){
	  // close the current one, and start a new one.
	  // except when it is implicit created by a QUOTE
	  if ( tokDebug > 5 ){
	    LOG << "[add_words] next embedded sentence" << endl;
	  }
	  // honour text_redundancy on the Sentence
	  if ( text_redundancy == "full" ){
	    appendText( root, outputclass );
	  }
	  else if ( text_redundancy == "none" ){
	    removeText( root, outputclass );
	  }
	  root = root->parent();
	  folia::processor *proc = add_provenance_structure( doc,
							     folia::AnnotationType::SENTENCE );
	  folia::KWargs args;
	  string id = get_parent_id(root);
	  if ( !id.empty() ){
	    args["generate_id"] = id;
	  }
	  if ( proc ){
	    args["processor"] = proc->id();
	  }
	  args["set"] = doc->default_set( folia::AnnotationType::SENTENCE );
	  folia::Sentence *ns = new folia::Sentence( args, doc );
	  root->append( ns );
	  root = ns;
	}
      }
      // create the Word element for this token under the current root
      folia::KWargs args;
      string ids = get_parent_id( root );
      if ( !ids.empty() ){
	args["generate_id"] = ids;
      }
      args["class"] = TiCC::UnicodeToUTF8(tok.type);
      if ( tok.role & NOSPACE ){
	args["space"] = "no";
      }
      if ( outputclass != "current" ){
	args["textclass"] = outputclass;
      }
      args["set"] = tok_set;
#pragma omp critical (foliaupdate)
      {
	UnicodeString ws = tok.us;
	if (lowercase) {
	  ws = ws.toLower();
	}
	else if (uppercase) {
	  ws = ws.toUpper();
	}
	if ( tokDebug > 5 ){
	  LOG << "create Word(" << args << ") = " << ws << endl;
	}
	folia::Word *w;
	try {
	  w = new folia::Word( args, doc );
	}
	catch ( const exception& e ){
	  cerr << "Word(" << args << ") creation failed: " << e.what() << endl;
	  exit(EXIT_FAILURE);
	}
	result.push_back( w );
	w->setutext( ws, outputclass );
	if ( tokDebug > 5 ){
	  LOG << "add_result, created a word: " << w << "(" << ws << ")" << endl;
	}
	root->append( w );
      }
      if ( tok.role & ENDQUOTE ){
	if ( i > 0
	     && toks[i-1].role & ENDOFSENTENCE ){
	  // end of quote implies with
	  // embedded Sentence
	  if ( tokDebug > 5 ){
	    LOG << "[add_words] End of quote" << endl;
	  }
	  // honour text_redundancy on the Sentence
	  if ( text_redundancy == "full" ){
	    appendText( root->parent(), outputclass );
	  }
	  else if ( text_redundancy == "none" ){
	    removeText( root->parent(), outputclass );
	  }
	  root = root->parent()->parent(); // so close Sentence too
	}
	else {
	  root = root->parent();
	}
      }
    }
    // finally honour text_redundancy on the Sentence itself
    if ( text_redundancy == "full" ){
      appendText( sent, outputclass );
    }
    else if ( text_redundancy == "none" ){
      removeText( sent, outputclass );
    }
    return result;
  }

  /// append one tokenized sentence to the FoLiA tree under construction
  /*!
    \param root the current insertion point (Text or Paragraph)
    \param tv the tokens of one sentence
    \param p_count in/out: running paragraph counter for generated ids
    \return the (possibly new) insertion point for the next sentence
  */
  folia::FoliaElement *TokenizerClass::append_to_folia( folia::FoliaElement *root,
							const vector<Token>& tv,
							int& p_count ) const {
    if ( !root || !root->doc() ){
      throw logic_error( "missing root" );
    }
    if ( tokDebug > 5 ){
      LOG << "append_to_folia, root = " << root << endl;
      LOG << "tokens=\n" << tv << endl;
    }
    if ( (tv[0].role & NEWPARAGRAPH) ) {
      // the sentence starts a new Paragraph
      if ( tokDebug > 5 ){
	LOG << "append_to_folia, NEW paragraph " << endl;
      }
      folia::processor *proc = add_provenance_structure( root->doc(),
							 folia::AnnotationType::PARAGRAPH );
      folia::KWargs args;
      if ( proc ){
	args["processor"] = proc->id();
      }
      args["set"] = root->doc()->default_set( folia::AnnotationType::PARAGRAPH );
      args["xml:id"] = root->doc()->id() + ".p." + TiCC::toString(++p_count);
      folia::Paragraph *p = new folia::Paragraph( args, root->doc() );
      if ( root->element_id() == folia::Text_t ){
	if ( tokDebug > 5 ){
	  LOG << "append_to_folia, add paragraph to Text" << endl;
	}
	root->append( p );
      }
      else {
	// root is a paragraph, which is done now.
	// honour text_redundancy before closing the finished paragraph
	if ( text_redundancy == "full" ){
	  root->settext( root->str(outputclass), outputclass);
	}
	if ( tokDebug > 5 ){
	  LOG << "append_to_folia, add paragraph to parent of " << root << endl;
	}
	root = root->parent();
	root->append( p );
      }
      root = p;
    }
    // create a Sentence under the current root and fill it with the tokens
    folia::processor *proc = add_provenance_structure( root->doc(),
						       folia::AnnotationType::SENTENCE );
    folia::KWargs args;
    if ( proc ){
      args["processor"] = proc->id();
    }
    args["set"] = root->doc()->default_set( folia::AnnotationType::SENTENCE );
    args["generate_id"] = root->id();
    folia::Sentence *s = new folia::Sentence( args, root->doc() );
    root->append( s );
    if ( tokDebug > 5 ){
      LOG << "append_to_folia, created Sentence" << s << endl;
    }
    append_to_sentence( s, tv );
    return root;
  }

  UnicodeString handle_token_tag( const folia::FoliaElement *d,
				  const folia::TextPolicy& tp ){
    /// a handler that is passed on to libfolia to handle special tag="token"
    /// nodes
    /*!
      \param d The FoliaElement that libfolia will handle us
      \param tp The TextPolicy at hand. This function has been registered in
      \em tp
      \return a UnicodeString which we will mark specially so that we know
      that this string is to be handled as a separate token

      This function will be called by libfolia's text() functions on
      encountering a tag="token" attribute in a TextContent.
1068 It has to be registered in \em tp 1069 */ 1070 UnicodeString tmp_result = text( d, tp ); 1071 tmp_result = u'\u200D' + tmp_result; 1072 tmp_result += u'\u200D'; 1073 return tmp_result; 1074 } 1075 correct_element(folia::FoliaElement * orig,const vector<Token> & toks,const string & tok_set) const1076 void TokenizerClass::correct_element( folia::FoliaElement *orig, 1077 const vector<Token>& toks, 1078 const string& tok_set ) const { 1079 vector<folia::FoliaElement*> sV; 1080 vector<folia::FoliaElement*> cV; 1081 vector<folia::FoliaElement*> oV; 1082 vector<folia::FoliaElement*> nV; 1083 // Original element 1084 oV.push_back( orig ); 1085 // Add the edits 1086 for ( const auto& tok : toks ){ 1087 // New elements 1088 folia::KWargs args; 1089 args["xml:id"] = orig->generateId( "tokenized" ); 1090 args["class"] = TiCC::UnicodeToUTF8(tok.type); 1091 if ( tok.role & NOSPACE ){ 1092 args["space"] = "no"; 1093 } 1094 if ( outputclass != "current" ){ 1095 args["textclass"] = outputclass; 1096 } 1097 args["set"] = tok_set; 1098 #pragma omp critical (foliaupdate) 1099 { 1100 UnicodeString ws = tok.us; 1101 if (lowercase) { 1102 ws = ws.toLower(); 1103 } 1104 else if (uppercase) { 1105 ws = ws.toUpper(); 1106 } 1107 if ( tokDebug > 5 ){ 1108 LOG << "create Word(" << args << ") = " << ws << endl; 1109 } 1110 folia::FoliaElement *new_elt; 1111 try { 1112 new_elt = folia::AbstractElement::createElement( orig->element_id(), 1113 orig->doc() ); 1114 new_elt->setAttributes( args ); 1115 } 1116 catch ( const exception& e ){ 1117 cerr << "Word(" << args << ") creation failed: " << e.what() << endl; 1118 exit(EXIT_FAILURE); 1119 } 1120 new_elt->setutext( ws, outputclass ); 1121 if ( tokDebug > 5 ){ 1122 LOG << "add_result, created: " << new_elt << "(" << ws << ")" << endl; 1123 } 1124 nV.push_back( new_elt ); 1125 } 1126 } 1127 folia::KWargs no_args; 1128 no_args["processor"] = ucto_processor->id(); 1129 no_args["set"] = tok_set; 1130 folia::Correction *c = orig->parent()->correct( 
oV, cV, nV, sV, no_args ); 1131 if ( tokDebug > 2 ){ 1132 LOG << "created: " << c->xmlstring() << endl; 1133 } 1134 else if ( tokDebug > 0 ){ 1135 LOG << "created: " << c << endl; 1136 } 1137 } 1138 correct_elements(folia::FoliaElement * e,const vector<folia::FoliaElement * > & wv)1139 vector<Token> TokenizerClass::correct_elements( folia::FoliaElement *e, 1140 const vector<folia::FoliaElement*>& wv ) { 1141 vector<Token> result; 1142 // correct only when the sentence is in the desired language 1143 string s_la; 1144 if ( e->has_annotation<folia::LangAnnotation>() ){ 1145 s_la = e->annotation<folia::LangAnnotation>()->cls(); 1146 } 1147 if ( !s_la.empty() && settings.find(s_la) == settings.end() ){ 1148 // the Sentence already has a language code, and it 1149 // is NOT what we search for. 1150 // just ignore it 1151 if ( tokDebug > 0 ){ 1152 LOG << "skip FoLiA element " << e->id() << " with unsupported language " 1153 << s_la << endl; 1154 } 1155 return result; 1156 } 1157 string tok_set; 1158 if ( !s_la.empty() ){ 1159 tok_set = "tokconfig-" + s_la; 1160 } 1161 else { 1162 tok_set = "tokconfig-" + default_language; 1163 } 1164 folia::KWargs args; 1165 args["processor"] = ucto_processor->id(); 1166 e->doc()->declare( folia::AnnotationType::CORRECTION, tok_set, args ); 1167 for ( auto w : wv ){ 1168 string text = w->str( text_policy ); 1169 if ( tokDebug > 0 ){ 1170 LOG << "correct_elements() text='" << text << "'" << endl; 1171 } 1172 tokenizeLine( text ); 1173 vector<Token> sent = popSentence(); 1174 while ( sent.size() > 0 ){ 1175 sent.front().role &= ~BEGINOFSENTENCE; 1176 sent.back().role &= ~ENDOFSENTENCE; 1177 result.insert( result.end(), sent.begin(), sent.end() ); 1178 correct_element( w, sent, tok_set ); 1179 sent = popSentence(); 1180 } 1181 } 1182 result.front().role |= BEGINOFSENTENCE; 1183 result.back().role |= ENDOFSENTENCE; 1184 return result; 1185 } 1186 handle_one_sentence(folia::Sentence * s,int & sentence_done)1187 void 
TokenizerClass::handle_one_sentence( folia::Sentence *s, 1188 int& sentence_done ){ 1189 // check feasibility 1190 if ( tokDebug > 1 ){ 1191 LOG << "handle_one_sentence: " << s << endl; 1192 } 1193 if ( inputclass != outputclass && outputclass == "current" ){ 1194 if ( s->hastext( outputclass ) ){ 1195 throw uLogicError( "cannot set text with class='current' on node " 1196 + s->id() + 1197 " because it already has text in that class." ); 1198 } 1199 } 1200 vector<folia::Word *> wv = s->words( inputclass ); 1201 if ( wv.empty() ){ 1202 wv = s->words(); 1203 } 1204 if ( !wv.empty() ){ 1205 // there are already words. 1206 if ( doWordCorrection ){ 1207 // we are allowed to correct those 1208 vector<folia::FoliaElement*> ev(wv.begin(),wv.end()); 1209 if ( !correct_elements( s, ev ).empty() ){ 1210 ++sentence_done; 1211 } 1212 } 1213 } 1214 else { 1215 string s_la; 1216 if ( s->has_annotation<folia::LangAnnotation>() ){ 1217 s_la = s->annotation<folia::LangAnnotation>()->cls(); 1218 } 1219 if ( !s_la.empty() && settings.find(s_la) == settings.end() ){ 1220 // the Sentence already has a language code, and it 1221 // is NOT what we search for. 
1222 // just ignore it 1223 if ( tokDebug > 0 ){ 1224 LOG << "skip sentence " << s->id() << " with unsupported language " 1225 << s_la << endl; 1226 } 1227 return; 1228 } 1229 string text = s->str( text_policy ); 1230 if ( tokDebug > 0 ){ 1231 LOG << "handle_one_sentence() from string: '" << text << "'" << endl; 1232 } 1233 tokenizeLine( text ); 1234 vector<Token> sent = popSentence(); 1235 while ( sent.size() > 0 ){ 1236 append_to_sentence( s, sent ); 1237 ++sentence_done; 1238 sent = popSentence(); 1239 } 1240 } 1241 if ( text_redundancy == "full" ){ 1242 appendText( s, outputclass ); 1243 } 1244 else if ( text_redundancy == "none" ){ 1245 removeText( s, outputclass ); 1246 } 1247 } 1248 handle_one_paragraph(folia::Paragraph * p,int & sentence_done)1249 void TokenizerClass::handle_one_paragraph( folia::Paragraph *p, 1250 int& sentence_done ){ 1251 // a Paragraph may contain both Word and Sentence nodes 1252 // Sentences will be handled 1253 vector<folia::Sentence*> sv = p->select<folia::Sentence>(false); 1254 if ( sv.empty() ){ 1255 // No Sentence, so just text or Words 1256 vector<folia::Word*> wv = p->select<folia::Word>(false); 1257 if ( !wv.empty() ){ 1258 vector<folia::FoliaElement*> ev( wv.begin(), wv.end() ); 1259 // Words found 1260 if ( doWordCorrection ){ 1261 if ( correct_elements( p, ev ).empty() ){ 1262 ++sentence_done; 1263 } 1264 } 1265 // otherwise skip 1266 } 1267 else { 1268 // No Words too, handle text, if any 1269 string text = p->str( text_policy ); 1270 if ( tokDebug > 0 ){ 1271 LOG << "handle_one_paragraph:" << text << endl; 1272 } 1273 tokenizeLine( text ); 1274 vector<Token> toks = popSentence(); 1275 folia::processor *proc = 0; 1276 while ( !toks.empty() ){ 1277 if ( proc == 0 ){ 1278 proc = add_provenance_structure( p->doc(), 1279 folia::AnnotationType::SENTENCE ); 1280 } 1281 string p_id = p->id(); 1282 folia::KWargs args; 1283 if ( proc ){ 1284 args["processor"] = proc->id(); 1285 } 1286 args["set"] = 
p->doc()->default_set(folia::AnnotationType::SENTENCE); 1287 if ( !p_id.empty() ){ 1288 args["generate_id"] = p_id; 1289 } 1290 folia::Sentence *s = new folia::Sentence( args, p->doc() ); 1291 p->append( s ); 1292 append_to_sentence( s, toks ); 1293 ++sentence_done; 1294 toks = popSentence(); 1295 } 1296 } 1297 } 1298 else { 1299 if ( tokDebug > 1 ){ 1300 LOG << "found some Sentences " << sv << endl; 1301 } 1302 // For now wu just IGNORE loose words (backward compatability) 1303 for ( const auto& s : sv ){ 1304 handle_one_sentence( s, sentence_done ); 1305 } 1306 } 1307 if ( text_redundancy == "full" ){ 1308 appendText( p, outputclass ); 1309 } 1310 else if ( text_redundancy == "none" ){ 1311 removeText( p, outputclass ); 1312 } 1313 } 1314 handle_one_text_parent(folia::FoliaElement * e,int & sentence_done)1315 void TokenizerClass::handle_one_text_parent( folia::FoliaElement *e, 1316 int& sentence_done ){ 1317 /// 1318 /// input is a FoLiA element @e containing text, direct or deeper 1319 /// this can be a Word, Sentence, Paragraph or some other element 1320 /// In the latter case, we construct a Sentence from the text, and 1321 /// a Paragraph if more then one Sentence is found 1322 /// 1323 if ( inputclass != outputclass && outputclass == "current" ){ 1324 if ( e->hastext( outputclass ) ){ 1325 throw uLogicError( "cannot set text with class='current' on node " 1326 + e->id() + 1327 " because it already has text in that class." ); 1328 } 1329 } 1330 if ( e->xmltag() == "w" ){ 1331 // SKIP! already tokenized into words! 
    }
    else if ( e->xmltag() == "s" ){
      // OK a text in a sentence
      if ( tokDebug > 2 ){
	LOG << "found text in a sentence " << e << endl;
      }
      // NOTE(review): sentence_done is pre-incremented here and then also
      // passed by reference, so handle_one_sentence() may bump it further;
      // also the dynamic_cast result is not checked — confirm both are
      // intentional
      handle_one_sentence( dynamic_cast<folia::Sentence*>(e),
			   ++sentence_done );
    }
    else if ( e->xmltag() == "p" ){
      // OK a longer text in some paragraph, maybe more sentences
      if ( tokDebug > 2 ){
	LOG << "found text in a paragraph " << e << endl;
      }
      handle_one_paragraph( dynamic_cast<folia::Paragraph*>(e),
			    sentence_done );
    }
    else {
      // Some text outside word, paragraphs or sentences (yet)
      // maybe <div> or <note> or such
      // there may be embedded Paragraph, Word and Sentence nodes
      // if so, Paragraphs and Sentences should be handled separately
      vector<folia::Sentence*> sv = e->select<folia::Sentence>(false);
      vector<folia::Paragraph*> pv = e->select<folia::Paragraph>(false);
      if ( pv.empty() && sv.empty() ){
	// just words or text
	string text = e->str( text_policy );
	if ( tokDebug > 1 ){
	  LOG << "tok-" << e->xmltag() << ":" << text << endl;
	}
	// tokenize the text and drain the buffer sentence by sentence
	tokenizeLine( text );
	vector<vector<Token>> sents;
	vector<Token> toks = popSentence();
	while ( toks.size() > 0 ){
	  sents.push_back( toks );
	  toks = popSentence();
	}
	if ( sents.size() == 0 ){
	  // can happen in very rare cases (strange spaces in the input)
	  // SKIP!
	}
	else if ( sents.size() > 1 ){
	  // multiple sentences. We need an extra Paragraph.
	  // But first check if this is allowed!
	  folia::FoliaElement *rt;
	  if ( e->acceptable(folia::Paragraph_t) ){
	    // wrap the sentences in a freshly created Paragraph
	    folia::KWargs args;
	    string e_id = e->id();
	    if ( !e_id.empty() ){
	      args["generate_id"] = e_id;
	    }
	    folia::processor *proc = add_provenance_structure( e->doc(),
							       folia::AnnotationType::PARAGRAPH );
	    if ( proc ){
	      args["processor"] = proc->id();
	    }
	    args["set"] = e->doc()->default_set( folia::AnnotationType::PARAGRAPH );
	    folia::Paragraph *p = new folia::Paragraph( args, e->doc() );
	    e->append( p );
	    rt = p;
	  }
	  else {
	    // a Paragraph is not allowed here: attach directly to e
	    rt = e;
	  }
	  for ( const auto& sent : sents ){
	    folia::KWargs args;
	    string p_id = rt->id();
	    if ( !p_id.empty() ){
	      args["generate_id"] = p_id;
	    }
	    folia::processor *proc = add_provenance_structure( e->doc(),
							       folia::AnnotationType::SENTENCE );
	    if ( proc ){
	      args["processor"] = proc->id();
	    }
	    args["set"] = e->doc()->default_set( folia::AnnotationType::SENTENCE );
	    folia::Sentence *s = new folia::Sentence( args, e->doc() );
	    append_to_sentence( s, sent );
	    ++sentence_done;
	    if (tokDebug > 0){
	      LOG << "created a new sentence: " << s << endl;
	    }
	    rt->append( s );
	  }
	}
	else {
	  // 1 sentence, connect directly.
	  folia::KWargs args;
	  string e_id = e->id();
	  if ( e_id.empty() ){
	    // parent has no id: generate one for the new Sentence ourselves
	    e_id = e->generateId( e->xmltag() );
	    args["xml:id"] = e_id + ".s.1";
	  }
	  else {
	    args["generate_id"] = e_id;
	  }
	  folia::processor *proc = add_provenance_structure( e->doc(),
							     folia::AnnotationType::SENTENCE );
	  if ( proc ){
	    args["processor"] = proc->id();
	  }
	  args["set"] = e->doc()->default_set( folia::AnnotationType::SENTENCE );
	  folia::Sentence *s = new folia::Sentence( args, e->doc() );
	  append_to_sentence( s, sents[0] );
	  ++sentence_done;
	  if (tokDebug > 0){
	    LOG << "created a new sentence: " << s << endl;
	  }
	  e->append( s );
	}
      }
      else if ( !pv.empty() ){
	if ( tokDebug > 1 ){
	  LOG << "found some Paragraphs " << pv << endl;
	}
	// For now we only handle the Paragraphs, ignore sentences and words
	// IS this even valid???
	for ( const auto& p : pv ){
	  handle_one_paragraph( p, sentence_done );
	}
      }
      else {
	if ( tokDebug > 1 ){
	  LOG << "found some Sentences " << sv << endl;
	}
	// For now we just IGNORE the loose words (backward compatibility)
	for ( const auto& s : sv ){
	  handle_one_sentence( s, sentence_done );
	}
      }
    }
    // honour text_redundancy on the handled node itself
    if ( text_redundancy == "full" ){
      appendText( e, outputclass );
    }
    else if ( text_redundancy == "none" ){
      removeText( e, outputclass );
    }
  }

  /// tokenize a FoLiA document from a file
  /*!
    \param infile_name the FoLiA file to tokenize
    \return the tokenized Document (caller owns it), or 0 when the
    document contains no text in the desired inputclass
  */
  folia::Document *TokenizerClass::tokenize_folia( const string& infile_name ){
    if ( inputclass == outputclass
	 && !doWordCorrection ){
      LOG << "ucto: --filter=NO is automatically set. inputclass equals outputclass!"
	  << endl;
      setFiltering(false);
    }
    text_policy.set_class( inputclass );
    if ( !ignore_tag_hints ){
      // make libfolia report tag="token" text through our handler
      text_policy.add_handler("token", &handle_token_tag );
    }
    folia::TextEngine proc( infile_name );
    if ( passthru ){
      add_provenance_passthru( proc.doc() );
    }
    else {
      add_provenance_setting( proc.doc() );
    }
    if ( tokDebug > 8){
      proc.set_dbg_stream( theErrLog );
      proc.set_debug( true );
    }
    //    proc.set_debug( true );
    proc.setup( inputclass, true );
    int sentence_done = 0;
    folia::FoliaElement *p = 0;
    folia::FoliaElement *parent = 0;
    // walk all text-bearing parents the engine finds
    while ( (p = proc.next_text_parent() ) ){
      //      LOG << "next text parent: " << p << endl;
      if ( !parent ){
	// remember the common parent for the final redundancy pass
	parent = p->parent();
	//	LOG << "my parent: " << parent << endl;
      }
      if ( already_tokenized ){
	++sentence_done;
      }
      else {
	handle_one_text_parent( p, sentence_done );
      }
      if ( tokDebug > 0 ){
	LOG << "done with sentence " << sentence_done << endl;
      }
      if ( proc.next() ){
	if ( tokDebug > 1 ){
	  LOG << "looping for more ..." << endl;
	}
      }
    }
    // NOTE(review): when the loop never ran, parent is still 0 here —
    // confirm appendText/removeText tolerate a null argument
    if ( text_redundancy == "full" ){
      appendText( parent, outputclass );
    }
    else if ( text_redundancy == "none" ){
      removeText( parent, outputclass );
    }
    if ( sentence_done == 0 ){
      LOG << "document contains no text in the desired inputclass: "
	  << inputclass << endl;
      LOG << "NO result!"
	  << endl;
      return 0;
    }
    return proc.doc(true); // take the doc over from the Engine
  }

  /// tokenize a FoLiA file and save the result
  /*!
    \param infile_name the FoLiA file to tokenize
    \param outfile_name where to save the tokenized document
  */
  void TokenizerClass::tokenize_folia( const string& infile_name,
				       const string& outfile_name ){
    if ( tokDebug > 0 ){
      LOG << "[tokenize_folia] (" << infile_name << ","
	  << outfile_name << ")" << endl;
    }
    folia::Document *doc = tokenize_folia( infile_name );
    if ( doc ){
      doc->save( outfile_name, false );
      if ( tokDebug > 0 ){
	LOG << "resulting FoLiA doc saved in " << outfile_name << endl;
      }
    }
    else {
      if ( tokDebug > 0 ){
	LOG << "NO FoLiA doc created! " << endl;
      }
    }
  }

  UnicodeString TokenizerClass::outputTokens( const vector<Token>& tokens,
					      const bool continued ) const {
    /*!
      \param tokens A list of Token's to display
      \param continued Set to true when outputTokens is invoked multiple
      times and it is not the first invocation

      this makes paragraph boundaries work over multiple calls
      \return A UnicodeString representing tokenized lines, including token
      information, when verbose mode is on.
    */
    short quotelevel = 0;
    UnicodeString result;
    for ( const auto& token : tokens ) {
      UnicodeString outline;
      if (tokDebug >= 5){
	LOG << "outputTokens: token=" << token << endl;
      }
      if ( detectPar
	   && (token.role & NEWPARAGRAPH)
	   && !verbose
	   && continued ) {
	//output paragraph separator
	if ( sentenceperlineoutput ) {
	  outline += "\n";
	}
	else {
	  outline += "\n\n";
	}
      }
      UnicodeString s = token.us;
      if (lowercase) {
	s = s.toLower();
      }
      else if ( uppercase ) {
	s = s.toUpper();
      }
      outline += s;
      // track quote nesting to decide where sentence breaks may be emitted
      if ( token.role & NEWPARAGRAPH ) {
	quotelevel = 0;
      }
      if ( token.role & BEGINQUOTE ) {
	++quotelevel;
      }
      if ( verbose ) {
	// verbose mode: one token per line with type and role info
	outline += "\t" + token.type + "\t" + toUString(token.role) + "\n";
      }
      if ( token.role & ENDQUOTE ) {
	--quotelevel;
      }

      if ( token.role & ENDOFSENTENCE ) {
	if ( verbose ) {
	  if ( !(token.role & NOSPACE ) ){
	    outline += "\n";
	  }
	}
	else {
	  if ( quotelevel == 0 ) {
	    if ( sentenceperlineoutput ) {
	      outline += "\n";
	    }
	    else {
	      outline += " " + eosmark + " ";
	    }
	    if ( splitOnly ){
	      outline += "\n";
	    }
	  }
	  else { //inside quotation
	    if ( splitOnly
		 && !(token.role & NOSPACE ) ){
	      outline += " ";
	    }
	  }
	}
      }
      // separator handling for every token except the very last one
      if ( ( &token != &(*tokens.rbegin()) )
	   && !verbose ) {
	if ( !( (token.role & ENDOFSENTENCE)
		&& sentenceperlineoutput
		&& !splitOnly ) ){
	  if ( !(token.role & ENDOFSENTENCE) ){
	    if ( splitOnly
		 && (token.role & NOSPACE) ){
	      // NOSPACE honoured in splitOnly mode: emit nothing
	    }
	    else {
	      outline += " ";
	    }
	  }
	}
	else if ( (quotelevel > 0)
		  && sentenceperlineoutput ) {
	  //FBK: ADD SPACE WITHIN QUOTE CONTEXT IN ANY CASE
	  outline += " ";
	}
      }
      if (tokDebug >= 5){
	LOG << "outputTokens: outline=" << outline << endl;
      }
      result += outline;
    }
    return result;
  }

  int TokenizerClass::countSentences( bool forceentirebuffer ) {
    //Return the number of *completed* sentences in the token buffer

    //Performs extra sanity checks at the same time! Making sure
    //BEGINOFSENTENCE and ENDOFSENTENCE always pair up, and that TEMPENDOFSENTENCE roles
    //are converted to proper ENDOFSENTENCE markers

    short quotelevel = 0;
    int count = 0;
    const int size = tokens.size();
    int begin = 0;
    int i = 0;
    for ( auto& token : tokens ) {
      if (tokDebug >= 5){
	LOG << "[countSentences] buffer#" <<i
	    << " word=[" << token.us
	    << "] role=" << token.role
	    << ", quotelevel="<< quotelevel << endl;
      }
      if (token.role & NEWPARAGRAPH) quotelevel = 0;
      if (token.role & BEGINQUOTE) quotelevel++;
      if (token.role & ENDQUOTE) quotelevel--;
      if ( forceentirebuffer
	   && (token.role & TEMPENDOFSENTENCE)
	   && (quotelevel == 0)) {
	//we thought we were in a quote, but we're not... No end quote was found and an end is forced now.
	//Change TEMPENDOFSENTENCE to ENDOFSENTENCE and make sure sentences match up sanely
	token.role &= ~TEMPENDOFSENTENCE;
	token.role |= ENDOFSENTENCE;
      }
      tokens[begin].role |= BEGINOFSENTENCE; //sanity check
      if ( (token.role & ENDOFSENTENCE)
	   && (quotelevel == 0) ) {
	// a completed sentence; the next token starts a new one
	begin = i + 1;
	count++;
	if (tokDebug >= 5){
	  LOG << "[countSentences] SENTENCE #" << count << " found" << endl;
	}
      }
      if ( forceentirebuffer
	   && ( i == size - 1)
	   && !(token.role & ENDOFSENTENCE) ) {
	//last token of buffer
	count++;
	token.role |= ENDOFSENTENCE;
	if (tokDebug >= 5){
	  LOG << "[countSentences] SENTENCE #" << count << " *FORCIBLY* ended" << endl;
	}
      }
      ++i;
    }
    if (tokDebug >= 5){
      LOG << "[countSentences] end of loop: returns " << count << endl;
    }
    return count;
  }

  /// extract and remove the first completed sentence from the token buffer
  /*!
    \return the Tokens of the first completed sentence; empty when no
    complete sentence is available
  */
  vector<Token> TokenizerClass::popSentence( ) {
    vector<Token> outToks;
    const int size = tokens.size();
    if ( size != 0 ){
      short quotelevel = 0;
      size_t begin = 0;
      for ( int i = 0; i < size; ++i ) {
	if (tokens[i].role & NEWPARAGRAPH) {
	  quotelevel = 0;
	}
	else if (tokens[i].role & ENDQUOTE) {
	  --quotelevel;
	}
	if ( (tokens[i].role & BEGINOFSENTENCE)
	     && (quotelevel == 0)) {
	  begin = i;
	}
	//FBK: QUOTELEVEL GOES UP BEFORE begin IS UPDATED... RESULTS IN DUPLICATE OUTPUT
	if (tokens[i].role & BEGINQUOTE) {
	  ++quotelevel;
	}

	if ((tokens[i].role & ENDOFSENTENCE) && (quotelevel == 0)) {
	  size_t end = i;
	  if (tokDebug >= 1){
	    LOG << "[tokenize] extracted sentence, begin=" << begin
		<< ",end="<< end << endl;
	  }
	  for ( size_t index=begin; index <= end; ++index ){
	    outToks.push_back( tokens[index] );
	  }
	  // remove the extracted sentence from the buffer
	  tokens.erase( tokens.begin(), tokens.begin()+end+1 );
	  if ( !passthru ){
	    string lang = get_language( outToks );
	    if ( !settings[lang]->quotes.emptyStack() ) {
	      // keep pending quote positions in sync with the erase above
	      settings[lang]->quotes.flushStack( end+1 );
	    }
	  }
	  // we are done...
	  return outToks;
	}
      }
    }
    return outToks;
  }

  /// render a list of Tokens as one UnicodeString (always non-verbose)
  UnicodeString TokenizerClass::getString( const vector<Token>& v ){
    if ( !v.empty() ){
      //This only makes sense in non-verbose mode, force verbose=false
      const bool tv = verbose;
      verbose = false;
      UnicodeString res = outputTokens( v );
      verbose = tv;
      return res;
    }
    return "";
  }

  /// UTF-8 convenience wrapper around getString()
  string TokenizerClass::getUTF8String( const vector<Token>& v ){
    UnicodeString result = getString( v );
    return TiCC::UnicodeToUTF8( result );
  }

  /// drain the whole token buffer into rendered sentences
  vector<UnicodeString> TokenizerClass::getSentences() {
    vector<UnicodeString> sentences;
    if (tokDebug > 0) {
      LOG << "[getSentences()] before countSent " << endl;
    }
    int numS = countSentences(true); // force buffer to end with END_OF_SENTENCE
    if (tokDebug > 0) {
      LOG << "[getSentences] found " << numS << " sentence(s)" << endl;
    }
    for (int i = 0; i < numS; i++) {
      vector<Token> v = popSentence( );
      UnicodeString tmp = getString( v );
      sentences.push_back( tmp );
    }
    return sentences;
  }

  /// UTF-8 convenience wrapper around getSentences()
  vector<string> TokenizerClass::getUTF8Sentences() {
    vector<UnicodeString> uv = getSentences();
    vector<string> result;
    for ( const auto& us : uv ){
      result.push_back( TiCC::UnicodeToUTF8(us) );
    }
    return result;
  }

  // FBK: return true if character is a quote.
  bool TokenizerClass::u_isquote( UChar32 c, const Quoting& quotes ) const {
    bool quote = false;
    if ( u_hasBinaryProperty( c, UCHAR_QUOTATION_MARK )
	 || c == '`'
	 || c == U'´' ) {
      // M$ users use the spacing grave and acute accents often as a
      // quote (apostroph) but it DOESN`T have the UCHAR_QUOTATION_MARK property
      // so trick that
      quote = true;
    }
    else {
      // also accept anything the language's quote table knows,
      // either as an opening or as a closing quote
      UnicodeString opening = quotes.lookupOpen( c );
      if (!opening.isEmpty()) {
	quote = true;
      }
      else {
	UnicodeString closing = quotes.lookupClose( c );
	if (!closing.isEmpty()) {
	  quote = true;
	}
      }
    }
    return quote;
  }

  //FBK: USED TO CHECK IF CHARACTER AFTER QUOTE IS AN BOS.
  //MOSTLY THE SAME AS ABOVE, EXCEPT WITHOUT CHECK FOR PUNCTUATION
  //BECAUSE: '"Hoera!", zei de man' MUST NOT BE SPLIT ON ','..
is_BOS(UChar32 c)1834 bool is_BOS( UChar32 c ){ 1835 bool is_bos = false; 1836 UBlockCode s = ublock_getCode(c); 1837 //test for languages that distinguish case 1838 if ( (s == UBLOCK_BASIC_LATIN) || (s == UBLOCK_GREEK) 1839 || (s == UBLOCK_CYRILLIC) || (s == UBLOCK_GEORGIAN) 1840 || (s == UBLOCK_ARMENIAN) || (s == UBLOCK_DESERET)) { 1841 if ( u_isupper(c) || u_istitle(c) ) { 1842 //next 'word' starts with more punctuation or with uppercase 1843 is_bos = true; 1844 } 1845 } 1846 return is_bos; 1847 } 1848 resolveQuote(int endindex,const UnicodeString & open,Quoting & quotes)1849 bool TokenizerClass::resolveQuote( int endindex, 1850 const UnicodeString& open, 1851 Quoting& quotes ) { 1852 //resolve a quote 1853 int stackindex = -1; 1854 int beginindex = quotes.lookup( open, stackindex ); 1855 1856 if (beginindex >= 0) { 1857 if (tokDebug >= 2) { 1858 LOG << "[resolveQuote] Quote found, begin="<< beginindex << ", end="<< endindex << endl; 1859 } 1860 1861 if (beginindex > endindex) { 1862 throw uRangeError( "Begin index for quote is higher than end index!" ); 1863 } 1864 1865 //We have a quote! 
1866 1867 //resolve sentences within quote, all sentences must be full sentences: 1868 int beginsentence = beginindex + 1; 1869 int expectingend = 0; 1870 int subquote = 0; 1871 int size = tokens.size(); 1872 for (int i = beginsentence; i < endindex; i++) { 1873 if (tokens[i].role & BEGINQUOTE) subquote++; 1874 1875 if (subquote == 0) { 1876 if (tokens[i].role & BEGINOFSENTENCE) expectingend++; 1877 if (tokens[i].role & ENDOFSENTENCE) expectingend--; 1878 1879 if (tokens[i].role & TEMPENDOFSENTENCE) { 1880 tokens[i].role &= ~TEMPENDOFSENTENCE; 1881 tokens[i].role |= ENDOFSENTENCE; 1882 tokens[beginsentence].role |= BEGINOFSENTENCE; 1883 beginsentence = i + 1; 1884 } 1885 // In case of nested quoted sentences, such as: 1886 // MvD: "Nou, Van het Gouden Been ofzo herinner ik mij als kind: 'Waar is mijn gouden been?'" 1887 // the BEGINOFSENTENCE is only set for the inner quoted sentence 'Waar is mijn gouden been'. However, 1888 // We also need one for the outser sentence. 1889 } 1890 else if ( (tokens[i].role & ENDQUOTE) 1891 && (tokens[i].role & ENDOFSENTENCE)) { 1892 tokens[beginsentence].role |= BEGINOFSENTENCE; 1893 beginsentence = i + 1; 1894 } 1895 if (tokens[i].role & ENDQUOTE) subquote--; 1896 } 1897 if ((expectingend == 0) && (subquote == 0)) { 1898 //ok, all good, mark the quote: 1899 tokens[beginindex].role |= BEGINQUOTE; 1900 tokens[endindex].role |= ENDQUOTE; 1901 if ( tokDebug >= 2 ) { 1902 LOG << "marked BEGIN: " << tokens[beginindex] << endl; 1903 LOG << "marked END: " << tokens[endindex] << endl; 1904 } 1905 } 1906 else if ( expectingend == 1 1907 && subquote == 0 1908 && !( tokens[endindex - 1].role & ENDOFSENTENCE) ) { 1909 //missing one endofsentence, we can correct, last token in quote token is endofsentence: 1910 if ( tokDebug >= 2 ) { 1911 LOG << "[resolveQuote] Missing endofsentence in quote, fixing... 
" << expectingend << endl; 1912 } 1913 tokens[endindex - 1].role |= ENDOFSENTENCE; 1914 //mark the quote 1915 tokens[beginindex].role |= BEGINQUOTE; 1916 tokens[endindex].role |= ENDQUOTE; 1917 } 1918 else { 1919 if ( tokDebug >= 2) { 1920 LOG << "[resolveQuote] Quote can not be resolved, unbalanced sentences or subquotes within quote, skipping... (expectingend=" << expectingend << ",subquote=" << subquote << ")" << endl; 1921 } 1922 //something is wrong. Sentences within quote are not balanced, so we won't mark the quote. 1923 } 1924 //remove from stack (ok, granted, stack is a bit of a misnomer here) 1925 quotes.eraseAtPos( stackindex ); 1926 //FBK: ENDQUOTES NEED TO BE MARKED AS ENDOFSENTENCE IF THE PREVIOUS TOKEN 1927 //WAS AN ENDOFSENTENCE. OTHERWISE THE SENTENCES WILL NOT BE SPLIT. 1928 if ( tokens[endindex].role & ENDQUOTE 1929 && tokens[endindex-1].role & ENDOFSENTENCE ) { 1930 //FBK: CHECK FOR EOS AFTER QUOTES 1931 if ((endindex+1 == size) || //FBK: endindex EQUALS TOKEN SIZE, MUST BE EOSMARKERS 1932 ((endindex + 1 < size) && (is_BOS(tokens[endindex+1].us[0])))) { 1933 tokens[endindex].role |= ENDOFSENTENCE; 1934 // FBK: CHECK IF NEXT TOKEN IS A QUOTE AND NEXT TO THE QUOTE A BOS 1935 } 1936 else if ( endindex + 2 < size 1937 && u_isquote( tokens[endindex+1].us[0], quotes ) 1938 && is_BOS( tokens[endindex+2].us[0] ) ) { 1939 tokens[endindex].role |= ENDOFSENTENCE; 1940 // If the current token is an ENDQUOTE and the next token is a quote and also the last token, 1941 // the current token is an EOS. 
1942 } 1943 else if ( endindex + 2 == size 1944 && u_isquote( tokens[endindex+1].us[0], quotes ) ) { 1945 tokens[endindex].role |= ENDOFSENTENCE; 1946 } 1947 } 1948 return true; 1949 } 1950 else { 1951 return false; 1952 } 1953 } 1954 detectEos(size_t i,const UnicodeString & eosmarkers,const Quoting & quotes) const1955 bool TokenizerClass::detectEos( size_t i, 1956 const UnicodeString& eosmarkers, 1957 const Quoting& quotes ) const { 1958 bool is_eos = false; 1959 UChar32 c = tokens[i].us.char32At(0); 1960 if ( c == '.' || eosmarkers.indexOf( c ) >= 0 ){ 1961 if (i + 1 == tokens.size() ) { //No next character? 1962 is_eos = true; //Newline after eosmarker 1963 } 1964 else { 1965 c = tokens[i+1].us.char32At(0); 1966 if ( u_isquote( c, quotes ) ){ 1967 // next word is quote 1968 if ( detectQuotes ){ 1969 is_eos = true; 1970 } 1971 else if ( i + 2 < tokens.size() ) { 1972 c = tokens[i+2].us.char32At(0); 1973 if ( u_isupper(c) || u_istitle(c) || u_ispunct(c) ){ 1974 //next 'word' after quote starts with uppercase or is punct 1975 is_eos = true; 1976 } 1977 } 1978 } 1979 else if ( tokens[i].us.length() > 1 ){ 1980 // PUNCTUATION multi... 
1981 if ( u_isupper(c) || u_istitle(c) ) 1982 is_eos = true; 1983 } 1984 else 1985 is_eos = true; 1986 } 1987 } 1988 return is_eos; 1989 } 1990 detectQuoteBounds(const int i,Quoting & quotes)1991 void TokenizerClass::detectQuoteBounds( const int i, 1992 Quoting& quotes ) { 1993 UChar32 c = tokens[i].us.char32At(0); 1994 //Detect Quotation marks 1995 if ((c == '"') || ( UnicodeString(c) == """) ) { 1996 if (tokDebug > 1 ){ 1997 LOG << "[detectQuoteBounds] Standard double-quote (ambiguous) found @i="<< i << endl; 1998 } 1999 if (!resolveQuote(i,c,quotes)) { 2000 if (tokDebug > 1 ) { 2001 LOG << "[detectQuoteBounds] Doesn't resolve, so assuming beginquote, pushing to stack for resolution later" << endl; 2002 } 2003 quotes.push( i, c ); 2004 } 2005 } 2006 else if ( c == '\'' ) { 2007 if (tokDebug > 1 ){ 2008 LOG << "[detectQuoteBounds] Standard single-quote (ambiguous) found @i="<< i << endl; 2009 } 2010 if (!resolveQuote(i,c,quotes)) { 2011 if (tokDebug > 1 ) { 2012 LOG << "[detectQuoteBounds] Doesn't resolve, so assuming beginquote, pushing to stack for resolution later" << endl; 2013 } 2014 quotes.push( i, c ); 2015 } 2016 } 2017 else { 2018 UnicodeString close = quotes.lookupOpen( c ); 2019 if ( !close.isEmpty() ){ // we have a opening quote 2020 if ( tokDebug > 1 ) { 2021 LOG << "[detectQuoteBounds] Opening quote found @i="<< i << ", pushing to stack for resolution later..." << endl; 2022 } 2023 quotes.push( i, c ); // remember it 2024 } 2025 else { 2026 UnicodeString open = quotes.lookupClose( c ); 2027 if ( !open.isEmpty() ) { // we have a closing quote 2028 if (tokDebug > 1 ) { 2029 LOG << "[detectQuoteBounds] Closing quote found @i="<< i << ", attempting to resolve..." 
<< endl; 2030 } 2031 if ( !resolveQuote( i, open, quotes )) { 2032 // resolve the matching opening 2033 if (tokDebug > 1 ) { 2034 LOG << "[detectQuoteBounds] Unable to resolve" << endl; 2035 } 2036 } 2037 } 2038 } 2039 } 2040 } 2041 isClosing(const Token & tok)2042 bool isClosing( const Token& tok ){ 2043 if ( tok.us.length() == 1 && 2044 ( tok.us[0] == ')' || tok.us[0] == '}' 2045 || tok.us[0] == ']' || tok.us[0] == '>' ) ) 2046 return true; 2047 return false; 2048 } 2049 detectSentenceBounds(const int offset,const string & lang)2050 void TokenizerClass::detectSentenceBounds( const int offset, 2051 const string& lang ){ 2052 //find sentences 2053 string method; 2054 if ( detectQuotes ){ 2055 method = "[detectSentenceBounds-(quoted)]"; 2056 } 2057 else { 2058 method = "[detectSentenceBounds]"; 2059 } 2060 const int size = tokens.size(); 2061 for (int i = offset; i < size; i++) { 2062 if (tokDebug > 1 ){ 2063 LOG << method << " i="<< i << " word=[" << tokens[i].us 2064 << "] type=" << tokens[i].type 2065 << ", role=" << tokens[i].role << endl; 2066 } 2067 if ( tokens[i].type.startsWith("PUNCTUATION") ){ 2068 if ((tokDebug > 1 )){ 2069 LOG << method << " PUNCTUATION FOUND @i=" << i << endl; 2070 } 2071 // we have some kind of punctuation. Does it mark an eos? 2072 bool is_eos = detectEos( i, 2073 settings[lang]->eosmarkers, 2074 settings[lang]->quotes ); 2075 if (is_eos) { 2076 // end of sentence found/ so wrap up 2077 if ( detectQuotes 2078 && !settings[lang]->quotes.emptyStack() ) { 2079 // we have some quotes! 2080 if ( tokDebug > 1 ){ 2081 LOG << method << " Unbalances quotes: Preliminary EOS FOUND @i=" 2082 << i << endl; 2083 } 2084 // we set a temporary EOS marker, 2085 // to be resolved later when full quote is found. 
            tokens[i].role |= TEMPENDOFSENTENCE;
            // If previous token is also TEMPENDOFSENTENCE,
            // it stops being so in favour of this one
            if ( i > 0 ){
              tokens[i-1].role &= ~TEMPENDOFSENTENCE;
            }
          }
          else {
            // No quotes pending: assign a definitive EOS
            if ( tokDebug > 1 ){
              LOG << method << " EOS FOUND @i=" << i << endl;
            }
            tokens[i].role |= ENDOFSENTENCE;
            // if this is the end of the sentence,
            // the next token is the beginning of a new one
            if ( (i + 1) < size ){
              tokens[i+1].role |= BEGINOFSENTENCE;
            }
            // if previous token is EOS and not BOS, it will stop being EOS,
            // as this one will take its place
            if ( i > 0
                 && ( tokens[i-1].role & ENDOFSENTENCE )
                 && !( tokens[i-1].role & BEGINOFSENTENCE ) ) {
              tokens[i-1].role &= ~ENDOFSENTENCE;
              tokens[i].role &= ~BEGINOFSENTENCE;
            }
          }
        }
        else if ( isClosing(tokens[i] ) ) {
          // we have a closing symbol (bracket-like)
          if ( tokDebug > 1 ){
            LOG << method << " Close FOUND @i=" << i << endl;
          }
          // if previous token is EOS and not BOS, it will stop being EOS,
          // as this one will take its place
          if ( i > 0
               && ( tokens[i-1].role & ENDOFSENTENCE )
               && !( tokens[i-1].role & BEGINOFSENTENCE) ) {
            tokens[i-1].role &= ~ENDOFSENTENCE;
            tokens[i].role &= ~BEGINOFSENTENCE;
          }
        }
        if ( detectQuotes ){
          // administrate opening/closing quotes for this punctuation token
          detectQuoteBounds( i, settings[lang]->quotes );
        }
      }
    }
    for (int i = size-1; i > offset; --i ) {
      // at the end of the buffer there may be some PUNCTUATION which
      // has spurious ENDOFSENTENCE and BEGINOFSENTENCE annotation
      // fix this up to avoid sentences containing only punctuation
      // also we don't want a BEGINQUOTE to be an ENDOFSENTENCE
      if ( tokDebug > 2 ){
        LOG << method << " fixup-end i="<< i << " word=["
            << tokens[i].us
            << "] type=" << tokens[i].type
            << ", role=" << tokens[i].role << endl;
      }
      if ( tokens[i].type.startsWith("PUNCTUATION") ) {
        tokens[i].role &= ~BEGINOFSENTENCE;
        if ( !detectQuotes ||
             (tokens[i].role & BEGINQUOTE) ){
          if ( i != size-1 ){
            tokens[i].role &= ~ENDOFSENTENCE;
          }
        }
      }
      else
        // first non-punctuation token from the rear: stop fixing up
        break;
    }
  }

  /// Tokenize 'input' in passthru mode: split on whitespace only, assign a
  /// coarse type (word/number/punctuation/unknown) per whitespace-separated
  /// chunk and append the resulting Tokens to the buffer.
  /// @param input the line to split
  /// @param bos IN/OUT: true when the next token starts a sentence;
  ///        set to true again after an explicit eosmark is consumed
  void TokenizerClass::passthruLine( const UnicodeString& input, bool& bos ) {
    if (tokDebug) {
      LOG << "[passthruLine] input: line=[" << input << "]" << endl;
    }
    bool alpha = false, num = false, punct = false;
    UnicodeString word;
    StringCharacterIterator sit(input);
    while ( sit.hasNext() ){
      UChar32 c = sit.current32();
      if ( c == u'\u200D' ){
        // a ZERO-WIDTH-JOINER. just ignore
        sit.next32();
        continue;
      }
      if ( u_isspace(c) ) {
        if ( word.isEmpty() ){
          // a leading space. Don't waste time on it. SKIP
          sit.next32();
          continue;
        }
        // so a trailing space. handle the found word.
        if (tokDebug){
          LOG << "[passthruLine] word=[" << word << "]" << endl;
        }
        if ( word == eosmark ) {
          // explicit end-of-sentence marker: mark previous token as EOS
          word = "";
          if (!tokens.empty())
            tokens.back().role |= ENDOFSENTENCE;
          bos = true;
        }
        else {
          // classify the chunk from the character classes seen so far;
          // mixed content falls through to 'unknown'
          UnicodeString type;
          if (alpha && !num && !punct) {
            type = type_word;
          }
          else if (num && !alpha && !punct) {
            type = type_number;
          }
          else if (punct && !alpha && !num) {
            type = type_punctuation;
          }
          else {
            type = type_unknown;
          }
          if ( doPunctFilter
               && ( type == type_punctuation || type == type_currency ||
                    type == type_emoticon || type == type_picto ) ) {
            if (tokDebug >= 2 ){
              LOG << " [passThruLine] skipped PUNCTUATION ["
                  << input << "]" << endl;
            }
            if ( !tokens.empty() ){
              tokens.back().role &= ~NOSPACE;
            }
          }
          else {
            if ( norm_set.find( type ) != norm_set.end() ){
              // this type must be normalized to a {{TYPE}} placeholder
              word = "{{" + type + "}}";
            }
            if (bos) {
              tokens.push_back( Token( type, word , BEGINOFSENTENCE ) );
              bos = false;
            }
            else {
              tokens.push_back( Token( type, word ) );
            }
          }
          // reset the per-word state
          alpha = false;
          num = false;
          punct = false;
          word = "";
        }
      }
      else {
        // accumulate character-class evidence for the current chunk
        if ( u_isalpha(c)) {
          alpha = true;
        }
        else if (u_ispunct(c)) {
          punct = true;
        }
        else if (u_isdigit(c)) {
          num = true;
        }
        word += c;
      }
      sit.next32();
    }
    // handle a trailing word without a following space
    if (word != "") {
      if ( word == eosmark ) {
        word = "";
        if (!tokens.empty())
          tokens.back().role |= ENDOFSENTENCE;
      }
      else {
        UnicodeString type;
        if (alpha && !num && !punct) {
          type = type_word;
        }
        else if (num && !alpha && !punct) {
          type = type_number;
        }
        else if (punct && !alpha && !num) {
          type = type_punctuation;
        }
        else {
          type = type_unknown;
        }
        if ( doPunctFilter
             && ( type == type_punctuation || type == type_currency ||
                  type == type_emoticon || type == type_picto ) ) {
          if (tokDebug >= 2 ){
            LOG << " [passThruLine] skipped PUNCTUATION ["
                << input << "]" << endl;
          }
          if ( !tokens.empty() ){
            tokens.back().role &= ~NOSPACE;
          }
        }
        else {
          if ( norm_set.find( type ) != norm_set.end() ){
            word = "{{" + type + "}}";
          }
          if (bos) {
            tokens.push_back( Token( type, word , BEGINOFSENTENCE ) );
            bos = false;
          }
          else {
            tokens.push_back( Token( type, word ) );
          }
        }
      }
    }
    if ( sentenceperlineinput && tokens.size() > 0 ) {
      // every input line is one sentence: force BOS/EOS on the extremes
      tokens[0].role |= BEGINOFSENTENCE;
      tokens.back().role |= ENDOFSENTENCE;
    }
  }

  /// Peek at the start of stream 'in' for a Unicode BOM signature.
  /// @return the detected encoding name, or the configured inputEncoding
  ///         when no BOM is found (or when reading from cin, which is
  ///         not seekable).
  /// Side effect: the stream is repositioned just past the BOM.
  string TokenizerClass::checkBOM( istream& in ){
    string result = inputEncoding;
    if ( &in == &cin ){
      // cin cannot be rewound; skip detection
      return result;
    }
    streampos pos = in.tellg();
    string s;
    in >> s;
    UErrorCode err = U_ZERO_ERROR;
    int32_t bomLength = 0;
    const char *encoding = ucnv_detectUnicodeSignature( s.c_str(),
                                                        s.length(),
                                                        &bomLength,
                                                        &err);
    if ( bomLength ){
      if ( tokDebug ){
        LOG << "Autodetected encoding: " << encoding << endl;
      }
      result = encoding;
      if ( result == "UTF16BE"
           || result == "UTF-16BE" ){
        // normalize the two spellings ICU may return
        result = "UTF16BE";
      }
    }
    // reposition the stream right after the BOM (or back to the start)
    in.seekg( pos + (streampos)bomLength );
    return result;
  }

  // string wrapper: convert from inputEncoding, then tokenize
  void TokenizerClass::tokenizeLine( const string& s,
                                     const string& lang ){
    UnicodeString us = convert( s, inputEncoding );
    tokenizeLine( us, lang );
  }

  // UnicodeString wrapper: tokenize one line and close off the sentence
  void TokenizerClass::tokenizeLine( const UnicodeString& us,
                                     const string& lang ){
    bool bos = true;
    tokenize_one_line( us, bos, lang );
    if (tokDebug > 0) {
      LOG << "[tokenizeLine()] before countSent " << endl;
    }
    countSentences(true); // force the ENDOFSENTENCE
  }

  /// Is 'c' in the Unicode EMOTICONS block?
  bool u_isemo( UChar32 c ){
    UBlockCode s = ublock_getCode(c);
    return s == UBLOCK_EMOTICONS;
  }

  /// Is 'c' in the Unicode MISCELLANEOUS SYMBOLS AND PICTOGRAPHS block?
  bool u_ispicto( UChar32 c ){
    UBlockCode s = ublock_getCode(c);
    return s == UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS ;
  }

  /// Is 'c' a currency symbol (Unicode category Sc)?
  bool u_iscurrency( UChar32 c ){
    return u_charType( c ) == U_CURRENCY_SYMBOL;
  }

  /// Is 'c' any Unicode symbol (categories Sc, Sm, Sk or So)?
  bool u_issymbol( UChar32 c ){
    return u_charType( c ) == U_CURRENCY_SYMBOL
      || u_charType( c ) == U_MATH_SYMBOL
      || u_charType( c ) == U_MODIFIER_SYMBOL
      || u_charType( c ) == U_OTHER_SYMBOL;
  }

  /// Map a single character to one of the predefined token-type strings.
  /// NOTE: the test order matters; e.g. currency wins over punctuation,
  /// emoticon/picto win over alpha.
  const UnicodeString& detect_type( UChar32 c ){
    if ( u_isspace(c)) {
      return type_space;
    }
    else if ( u_iscurrency(c)) {
      return type_currency;
    }
    else if ( u_ispunct(c)) {
      return type_punctuation;
    }
    else if ( u_isemo( c ) ) {
      return type_emoticon;
    }
    else if ( u_ispicto( c ) ) {
      return type_picto;
    }
    else if ( u_isalpha(c)) {
      return type_word;
    }
    else if ( u_isdigit(c)) {
      return type_number;
    }
    else if ( u_issymbol(c)) {
      return type_symbol;
    }
    else {
      return type_unknown;
    }
  }

  /// Render an ICU character-category code (the int8_t returned by
  /// u_charType()) as its symbolic UCharCategory name, for debug output.
  std::string toString( int8_t c ){
    switch ( c ){
    case 0:
      return "U_UNASSIGNED";
    case 1:
      return "U_UPPERCASE_LETTER";
    case 2:
      return "U_LOWERCASE_LETTER";
    case 3:
      return "U_TITLECASE_LETTER";
    case 4:
      return "U_MODIFIER_LETTER";
    case 5:
      return "U_OTHER_LETTER";
    case 6:
      return "U_NON_SPACING_MARK";
    case 7:
      return "U_ENCLOSING_MARK";
    case 8:
      return "U_COMBINING_SPACING_MARK";
    case 9:
      return "U_DECIMAL_DIGIT_NUMBER";
    case 10:
      return "U_LETTER_NUMBER";
    case 11:
      return "U_OTHER_NUMBER";
    case 12:
      return "U_SPACE_SEPARATOR";
    case 13:
      return "U_LINE_SEPARATOR";
    case 14:
      return "U_PARAGRAPH_SEPARATOR";
    case 15:
      return "U_CONTROL_CHAR";
    case 16:
      return "U_FORMAT_CHAR";
    case 17:
      return "U_PRIVATE_USE_CHAR";
    case 18:
      return "U_SURROGATE";
    case 19:
      return "U_DASH_PUNCTUATION";
    case 20:
      return "U_START_PUNCTUATION";
    case 21:
      return "U_END_PUNCTUATION";
    case 22:
      return "U_CONNECTOR_PUNCTUATION";
    case 23:
      return "U_OTHER_PUNCTUATION";
    case 24:
      return "U_MATH_SYMBOL";
    case 25:
      return "U_CURRENCY_SYMBOL";
    case 26:
      return "U_MODIFIER_SYMBOL";
    case 27:
      return "U_OTHER_SYMBOL";
    case 28:
      return "U_INITIAL_PUNCTUATION";
    case 29:
      return "U_FINAL_PUNCTUATION";
    default:
      return "OMG NO CLUE WHAT KIND OF SYMBOL THIS IS: "
        + TiCC::toString( int(c) );
    }
  }
  /// Tokenize one (normalized, filtered) line of input using the rules of
  /// language '_lang', appending the new Tokens to the buffer and running
  /// sentence-boundary detection on them.
  /// @param originput the raw input line
  /// @param _lang language code; empty or unknown codes fall back to "default"
  /// @return the number of tokens added to the buffer (0 on invalid input
  ///         or on a ridiculously long token)
  int TokenizerClass::internal_tokenize_line( const UnicodeString& originput,
                                              const string& _lang ){
    string lang = _lang;
    if ( lang.empty() ){
      lang = "default";
    }
    else {
      auto const it = settings.find( lang );
      if ( it == settings.end() ){
        LOG << "tokenizeLine: no settings found for language=" + lang << endl
            << "using the default language instead:" << default_language << endl;
        lang = "default";
      }
    }
    if (tokDebug){
      LOG << "[tokenizeLine] input: line=["
          << originput << "] (language= " << lang << ")" << endl;
    }
    UnicodeString input = normalizer.normalize( originput );
    if ( doFilter ){
      input = settings[lang]->filter.filter( input );
    }
    if ( input.isBogus() ){ //only tokenize valid input
      LOG << "ERROR: Invalid UTF-8 in line:" << linenum << endl
          << " '" << input << "'" << endl;
      return 0;
    }
    int32_t len = input.countChar32();
    if (tokDebug){
      LOG << "[tokenizeLine] filtered input: line=["
          << input << "] (" << len
          << " unicode characters)" << endl;
    }
    const int begintokencount = tokens.size();
    if (tokDebug) {
      LOG << "[tokenizeLine] Tokens still in buffer: " << begintokencount << endl;
    }

    bool tokenizeword = false;
    bool reset = false;
    //iterate over all characters
    UnicodeString word;
    StringCharacterIterator sit(input);
    long int i = 0;
    long int tok_size = 0;
    while ( sit.hasNext() ){
      UChar32 c = sit.current32();
      bool joiner = false;
      if ( c == u'\u200D' ){
        // a ZERO-WIDTH-JOINER acts like a word-separator without a space
        joiner = true;
      }
      if ( tokDebug > 8 ){
        UnicodeString s = c;
        int8_t charT = u_charType( c );
        LOG << "examine character: " << s << " type= "
            << toString( charT ) << endl;
      }
      if (reset) { //reset values for new word
        reset = false;
        tok_size = 0;
        if ( !joiner && !u_isspace(c) ){
          word = c;
        }
        else {
          word = "";
        }
        tokenizeword = false;
      }
      else if ( !joiner && !u_isspace(c) ){
        word += c;
      }
      if ( joiner && sit.hasNext() ){
        // a joiner followed by a space does not join anything
        UChar32 peek = sit.next32();
        if ( u_isspace(peek) ){
          joiner = false;
        }
        sit.previous32();
      }
      if ( u_isspace(c) || joiner || i == len-1 ){
        // word boundary (space, joiner, or end of line): flush 'word'
        if (tokDebug){
          LOG << "[tokenizeLine] space detected, word=[" << word << "]" << endl;
        }
        if ( i == len-1 ) {
          // the final character may still force rule-based tokenization
          if ( joiner
               || u_ispunct(c)
               || u_isdigit(c)
               || u_isquote( c, settings[lang]->quotes )
               || u_isemo(c) ){
            tokenizeword = true;
          }
        }
        if ( c == '\n' && word.isEmpty() ){
          if (tokDebug){
            LOG << "[tokenizeLine] NEW PARAGRAPH upcoming " << endl;
          }
          // signal that the next word starts a new Paragraph. (if its there)
          paragraphsignal_next = true;
        }
        int expliciteosfound = -1;
        if ( word.length() >= eosmark.length() ) {
          expliciteosfound = word.lastIndexOf(eosmark);

          if (expliciteosfound != -1) { // word contains eosmark
            if ( tokDebug >= 2){
              LOG << "[tokenizeLine] Found explicit EOS marker @"<<expliciteosfound << endl;
            }
            int eospos = tokens.size()-1;
            if (expliciteosfound > 0) {
              // tokenize the prefix before the marker first
              UnicodeString realword;
              word.extract(0,expliciteosfound,realword);
              if (tokDebug >= 2) {
                LOG << "[tokenizeLine] Prefix before EOS: "
                    << realword << endl;
              }
              tokenizeWord( realword, false, lang );
              eospos++;
            }
            if ( expliciteosfound + eosmark.length() < word.length() ){
              // and then the postfix after the marker
              UnicodeString realword;
              word.extract( expliciteosfound+eosmark.length(),
                            word.length() - expliciteosfound - eosmark.length(),
                            realword );
              if (tokDebug >= 2){
                LOG << "[tokenizeLine] postfix after EOS: "
                    << realword << endl;
              }
              tokenizeWord( realword, true, lang );
            }
            if ( !tokens.empty() && eospos >= 0 ) {
              if (tokDebug >= 2){
                LOG << "[tokenizeLine] Assigned EOS" << endl;
              }
              tokens[eospos].role |= ENDOFSENTENCE;
            }
          }
        }
        if ( word.length() > 0
             && expliciteosfound == -1 ) {
          if (tokDebug >= 2){
            LOG << "[tokenizeLine] Further tokenization necessary for: ["
                << word << "]" << endl;
          }
          if ( tokenizeword ) {
            // run the word through the full rule set
            tokenizeWord( word, !joiner, lang );
          }
          else {
            // plain word: skip the rules, assign type WORD directly
            tokenizeWord( word, !joiner, lang, type_word );
          }
        }
        //reset values for new word
        reset = true;
      }
      else if ( u_ispunct(c)
                || u_isdigit(c)
                || u_isquote( c, settings[lang]->quotes )
                || u_isemo(c) ){
        if (tokDebug){
          LOG << "[tokenizeLine] punctuation or digit detected, word=["
              << word << "]" << endl;
        }
        //there is punctuation or digits in this word, mark to run through tokenizer
        tokenizeword = true;
      }
      sit.next32();
      ++i;
      ++tok_size;
      if ( tok_size > 2500 ){
        // guard against pathological input
        LOG << "Ridiculously long word/token (over 2500 characters) detected "
            << "in line: " << linenum << ". Skipped ..." << endl;
        LOG << "The line starts with " << UnicodeString( word, 0, 75 )
            << "..." << endl;
        return 0;
      }
    }
    int numNewTokens = tokens.size() - begintokencount;
    if (tokDebug >= 10){
      LOG << "tokens.size() = " << tokens.size() << endl;
      LOG << "begintokencount = " << begintokencount << endl;
      LOG << "numnew = " << numNewTokens << endl;
    }
    if ( numNewTokens > 0 ){
      if (paragraphsignal) {
        tokens[begintokencount].role |= NEWPARAGRAPH | BEGINOFSENTENCE;
        paragraphsignal = false;
      }
      //find sentence boundaries
      if (sentenceperlineinput) {
        // force it to be a sentence
        tokens[begintokencount].role |= BEGINOFSENTENCE;
        tokens.back().role |= ENDOFSENTENCE;
      }
      detectSentenceBounds( begintokencount );
    }
    return numNewTokens;
  }

  /// Tokenize a single whitespace-free 'word': match it against the rules
  /// for 'lang' and append the resulting Token(s) to the buffer, recursing
  /// on pre-/post-context and on rule matches.
  /// @param input the word to tokenize
  /// @param space false when NO space follows this word in the input
  /// @param lang the language whose rule set is used
  /// @param assigned_type when non-empty, the type already decided by the
  ///        caller; its presence puts this call in "recurse" mode
  void TokenizerClass::tokenizeWord( const UnicodeString& input,
                                     bool space,
                                     const string& lang,
                                     const UnicodeString& assigned_type ) {
    bool recurse = !assigned_type.isEmpty();

    int32_t inpLen = input.countChar32();
    if ( tokDebug > 2 ){
      if ( recurse ){
        LOG << " [tokenizeWord] Recurse Input: (" << inpLen << ") "
            << "word=[" << input << "], type=" << assigned_type
            << " Space=" << (space?"TRUE":"FALSE") << endl;
      }
      else {
        LOG << " [tokenizeWord] Input: (" << inpLen << ") "
            << "word=[" << input << "]"
            << " Space=" << (space?"TRUE":"FALSE") << endl; }
    }
    if ( input == eosmark ) {
      if (tokDebug >= 2){
        LOG << " [tokenizeWord] Found explicit EOS marker" << endl;
      }
      if (!tokens.empty()) {
        if (tokDebug >= 2){
          LOG << " [tokenizeWord] Assigned EOS" << endl;
        }
        tokens.back().role |= ENDOFSENTENCE;
      }
      else {
        LOG << "[WARNING] Found explicit EOS marker by itself, this will have no effect!"
            << endl;
      }
      return;
    }

    if ( inpLen == 1) {
      //single character, no need to process all rules, do some simpler (faster) detection
      UChar32 c = input.char32At(0);
      UnicodeString type = detect_type( c );
      if ( type == type_space ){
        return;
      }
      if ( doPunctFilter
           && ( type == type_punctuation || type == type_currency ||
                type == type_emoticon || type == type_picto ) ) {
        if (tokDebug >= 2 ){
          LOG << " [tokenizeWord] skipped PUNCTUATION ["
              << input << "]" << endl;
        }
        if ( !tokens.empty() ){
          tokens.back().role &= ~NOSPACE;
        }
      }
      else {
        UnicodeString word = input;
        if ( norm_set.find( type ) != norm_set.end() ){
          // normalize this type to a {{TYPE}} placeholder
          word = "{{" + type + "}}";
        }
        TokenRole role = (space ? NOROLE : NOSPACE);
        if ( paragraphsignal_next ){
          role |= NEWPARAGRAPH;
          paragraphsignal_next = false;
        }
        Token T( type, word, role, lang );
        tokens.push_back( T );
        if (tokDebug >= 2){
          LOG << " [tokenizeWord] added token " << T << endl;
        }
      }
    }
    else {
      bool a_rule_matched = false;
      for ( const auto& rule : settings[lang]->rules ) {
        if ( tokDebug >= 4){
          LOG << "\tTESTING " << rule->id << endl;
        }
        UnicodeString type = rule->id;
        //Find first matching rule
        UnicodeString pre, post;
        vector<UnicodeString> matches;
        if ( rule->matchAll( input, pre, post, matches ) ){
          a_rule_matched = true;
          if ( tokDebug >= 4 ){
            LOG << "\tMATCH: " << type << endl;
            LOG << "\tpre= '" << pre << "'" << endl;
            LOG << "\tpost= '" << post << "'" << endl;
            int cnt = 0;
            for ( const auto& m : matches ){
              LOG << "\tmatch[" << ++cnt << "]=" << m << endl;
            }
          }
          if ( recurse
               && ( type == type_word
                    || ( pre.isEmpty()
                         && post.isEmpty() ) ) ){
            // so only do this recurse step when:
            // OR we have a WORD
            // OR we have an exact match of the rule (no pre or post)
            if ( assigned_type != type_word ){
              // don't change the type when:
              // it was already non-WORD
              if ( tokDebug >= 4 ){
                LOG << "\trecurse, match didn't do anything new for " << input << endl;
              }
              TokenRole role = (space ? NOROLE : NOSPACE);
              if ( paragraphsignal_next ){
                role |= NEWPARAGRAPH;
                paragraphsignal_next = false;
              }
              tokens.push_back( Token( assigned_type, input, role, lang ) );
              return;
            }
            else {
              if ( tokDebug >= 4 ){
                LOG << "\trecurse, match changes the type:"
                    << assigned_type << " to " << type << endl;
              }
              TokenRole role = (space ? NOROLE : NOSPACE);
              if ( paragraphsignal_next ){
                role |= NEWPARAGRAPH;
                paragraphsignal_next = false;
              }
              tokens.push_back( Token( type, input, role, lang ) );
              return;
            }
          }
          if ( pre.length() > 0 ){
            if ( tokDebug >= 4 ){
              LOG << "\tTOKEN pre-context (" << pre.length()
                  << "): [" << pre << "]" << endl;
            }
            tokenizeWord( pre, false, lang ); //pre-context, no space after
          }
          if ( matches.size() > 0 ){
            int max = matches.size();
            if ( tokDebug >= 4 ){
              LOG << "\tTOKEN match #=" << matches.size() << endl;
            }
            for ( int m=0; m < max; ++m ){
              if ( tokDebug >= 4 ){
                LOG << "\tTOKEN match[" << m << "] = " << matches[m]
                    << " Space=" << (space?"TRUE":"FALSE") << endl;
              }
              if ( doPunctFilter
                   && (&rule->id)->startsWith("PUNCTUATION") ){
                if (tokDebug >= 2 ){
                  LOG << " [tokenizeWord] skipped PUNCTUATION ["
                      << matches[m] << "]" << endl;
                }
                if ( !tokens.empty() ){
                  tokens.back().role &= ~NOSPACE;
                }
              }
              else {
                // only the LAST match before empty post-context keeps the
                // caller's space
                bool internal_space = space;
                if ( post.length() > 0 ) {
                  internal_space = false;
                }
                else if ( m < max-1 ){
                  internal_space = false;
                }
                UnicodeString word = matches[m];
                if ( norm_set.find( type ) != norm_set.end() ){
                  word = "{{" + type + "}}";
                  TokenRole role = (internal_space ? NOROLE : NOSPACE);
                  if ( paragraphsignal_next ){
                    role |= NEWPARAGRAPH;
                    paragraphsignal_next = false;
                  }
                  tokens.push_back( Token( type, word, role, lang ) );
                }
                else {
                  if ( recurse ){
                    TokenRole role = (internal_space ? NOROLE : NOSPACE);
                    if ( paragraphsignal_next ){
                      role |= NEWPARAGRAPH;
                      paragraphsignal_next = false;
                    }
                    tokens.push_back( Token( type, word, role, lang ) );
                  }
                  else {
                    // re-tokenize the match with its type pre-assigned
                    tokenizeWord( word, internal_space, lang, type );
                  }
                }
              }
            }
          }
          else if ( tokDebug >=4 ){
            // should never come here?
            LOG << "\tPANIC there's no match" << endl;
          }
          if ( post.length() > 0 ){
            if ( tokDebug >= 4 ){
              LOG << "\tTOKEN post-context (" << post.length()
                  << "): [" << post << "]" << endl;
            }
            tokenizeWord( post, space, lang );
          }
          break;
        }
      }
      if ( !a_rule_matched ){
        // no rule matched
        if ( tokDebug >=4 ){
          LOG << "\tthere's no match at all" << endl;
        }
        TokenRole role = (space ? NOROLE : NOSPACE);
        if ( paragraphsignal_next ){
          role |= NEWPARAGRAPH;
          paragraphsignal_next = false;
        }
        tokens.push_back( Token( assigned_type, input, role, lang ) );
      }
    }
  }

  /// The version of the uctodata package this library was built against.
  string TokenizerClass::get_data_version() const {
    return UCTODATA_VERSION;
  }

  /// Initialize the tokenizer from a single settings file.
  /// @param fname the settings-file name (e.g. "tokconfig-nld")
  /// @param tname optional extra rules file, passed to Setting::read()
  /// @return true on success
  bool TokenizerClass::init( const string& fname, const string& tname ){
    if ( tokDebug ){
      LOG << "Initiating tokenizer..." << endl;
    }
    data_version = get_data_version();
    Setting *set = new Setting();
    if ( !set->read( fname, tname, tokDebug, theErrLog ) ){
      LOG << "Cannot read Tokenizer settingsfile " << fname << endl;
      LOG << "Unsupported language? (Did you install the uctodata package?)"
          << endl;
      return false;
    }
    else {
      settings["default"] = set;
      default_language = "default";
      // derive the language code from a "tokconfig-<lang>" filename
      auto pos = fname.find("tokconfig-");
      if ( pos != string::npos ){
        default_language = fname.substr(pos+10);
        settings[default_language] = set;
      }
      else if ( xmlout ){
        // FoLiA XML output needs a real language code
        LOG << " unable to determine a language. cannot proceed" << endl;
        return false;
      }
    }
    if ( tokDebug ){
      LOG << "effective rules: " << endl;
      for ( size_t i=0; i < set->rules.size(); ++i ){
        LOG << "rule " << i << " " << *(set->rules[i]) << endl;
      }
      LOG << "EOS markers: " << set->eosmarkers << endl;
      LOG << "Quotations: " << set->quotes << endl;
      try {
        LOG << "Filter: " << set->filter << endl;
      }
      catch (...){
        // filter may be uninitialized; ignore for debug output only
      }
    }
    return true;
  }

  /// Initialize the tokenizer for a list of languages; the FIRST language
  /// that reads successfully becomes the "default" setting.
  /// @param languages language codes; each maps to "tokconfig-<lang>"
  /// @param tname optional extra rules file, only passed for the first one
  /// @return true when at least one language was initialized
  bool TokenizerClass::init( const vector<string>& languages,
                             const string& tname ){
    if ( tokDebug > 0 ){
      LOG << "Initiating tokenizer from language list..." << endl;
    }
    data_version = get_data_version();
    Setting *default_set = 0;
    for ( const auto& lang : languages ){
      if ( tokDebug > 0 ){
        LOG << "init language=" << lang << endl;
      }
      string fname = "tokconfig-" + lang;
      Setting *set = new Setting();
      string add;
      if ( default_set == 0 ){
        add = tname;
      }
      if ( !set->read( fname, add, tokDebug, theErrLog ) ){
        LOG << "problem reading datafile for language: " << lang << endl;
        LOG << "Unsupported language (Did you install the uctodata package?)"
            << endl;
      }
      else {
        if ( default_set == 0 ){
          default_set = set;
          settings["default"] = set;
          default_language = lang;
        }
        settings[lang] = set;
      }
    }
    if ( settings.empty() ){
      cerr << "ucto: No useful settingsfile(s) could be found (initiating from language list: " << languages << ")" << endl;
      return false;
    }
    return true;
  }

  /// Determine the single language assigned to a token vector.
  string get_language( const vector<Token>& tv ){
    // examine the assigned languages of ALL tokens.
2957 // they should all be the same 2958 // assign that value 2959 string result = "default"; 2960 for ( const auto& t : tv ){ 2961 if ( !t.lang_code.empty() && t.lang_code != "default" ){ 2962 if ( result == "default" ){ 2963 result = t.lang_code; 2964 } 2965 if ( result != t.lang_code ){ 2966 throw logic_error( "ucto: conflicting language(s) assigned" ); 2967 } 2968 } 2969 } 2970 return result; 2971 } 2972 get_setting_info(const std::string & language,std::string & set_file,std::string & version) const2973 bool TokenizerClass::get_setting_info( const std::string& language, 2974 std::string& set_file, 2975 std::string& version ) const { 2976 set_file.clear(); 2977 version.clear(); 2978 auto const& it = settings.find( language ); 2979 if ( it == settings.end() ){ 2980 return false; 2981 } 2982 else { 2983 set_file = it->second->set_file; 2984 version = it->second->version; 2985 return true; 2986 } 2987 } 2988 2989 } //namespace Tokenizer 2990