/*
  Copyright (c) 2006 - 2021
  CLST - Radboud University
  ILK  - Tilburg University

  This file is part of Ucto

  Ucto is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 3 of the License, or
  (at your option) any later version.

  Ucto is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.

  For questions and suggestions, see:
      https://github.com/LanguageMachines/ucto/issues
  or send mail to:
      lamasoftware (at ) science.ru.nl

*/

#include "ucto/tokenize.h"

#include <cassert>
#include <set>
#include <unistd.h>
#include <iostream>
#include <fstream>
#include <vector>
#include "config.h"
#include "unicode/schriter.h"
#include "unicode/ucnv.h"
#include "ticcutils/StringOps.h"
#include "ticcutils/PrettyPrint.h"
#include "ticcutils/Unicode.h"
#include "ticcutils/Timer.h"
#include "ucto/my_textcat.h"

#define DO_READLINE
#ifdef HAVE_LIBREADLINE
#  if defined(HAVE_READLINE_READLINE_H)
#    include <readline/readline.h>
#  elif defined(HAVE_READLINE_H)
#    include <readline.h>
#  else
#    undef DO_READLINE
#  endif /* !defined(HAVE_READLINE_H) */
#else
#  undef DO_READLINE
#endif /* HAVE_LIBREADLINE */

#ifdef HAVE_READLINE_HISTORY
#  if defined(HAVE_READLINE_HISTORY_H)
#    include <readline/history.h>
#  elif defined(HAVE_HISTORY_H)
#    include <history.h>
#  endif /* defined(HAVE_READLINE_HISTORY_H) */
#endif /* HAVE_READLINE_HISTORY */

using namespace std;

#define LOG *TiCC::Log(theErrLog)

namespace Tokenizer {

  using namespace icu;
  using TiCC::operator<<;

  const string ISO_SET = "http://raw.github.com/proycon/folia/master/setdefinitions/iso639_3.foliaset.ttl";

  const string UCTO_SET_PREFIX = "https://raw.githubusercontent.com/LanguageMachines/uctodata/master/setdefinitions/";

  const std::string Version() { return VERSION; }
  const std::string VersionName() { return PACKAGE_STRING; }

  class uRangeError: public std::out_of_range {
  public:
    explicit uRangeError( const string& s ): out_of_range( "ucto: out of range:" + s ){};
  };

  class uLogicError: public std::logic_error {
  public:
    explicit uLogicError( const string& s ): logic_error( "ucto: logic error:" + s ){};
  };

  class uCodingError: public std::runtime_error {
  public:
    explicit uCodingError( const string& s ): runtime_error( "ucto: coding problem:" + s ){};
  };

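  /// decode a line from 'inputEncoding' into a UnicodeString
  /// throws uCodingError when the conversion fails or yields a bogus result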
  UnicodeString convert( const string& line,
                         const string& inputEncoding ){
    UnicodeString result;
    if ( !line.empty() ){
      try {
        result = UnicodeString( line.c_str(),
                                line.length(),
                                inputEncoding.c_str() );
      }
      catch ( const exception& e ) {
        throw uCodingError( "Unexpected character found in input. "
                            + string(e.what()) + " Make sure input is valid: "
                            + inputEncoding );
      }
      if ( result.isBogus() ){
        throw uCodingError( "string decoding failed: (invalid inputEncoding '"
                            + inputEncoding + "' ?)" );
      }
    }
    return result;
  }

  const UnicodeString type_space = "SPACE";
  const UnicodeString type_currency = "CURRENCY";
  const UnicodeString type_emoticon = "EMOTICON";
  const UnicodeString type_picto = "PICTOGRAM";
  const UnicodeString type_word = "WORD";
  const UnicodeString type_symbol = "SYMBOL";
  const UnicodeString type_punctuation = "PUNCTUATION";
  const UnicodeString type_number = "NUMBER";
  const UnicodeString type_unknown = "UNKNOWN";

  Token::Token( const UnicodeString& _type,
                const UnicodeString& _s,
                TokenRole _role, const string& _lang_code ):
    type(_type), us(_s), role(_role), lang_code(_lang_code) {
  }


  std::string Token::texttostring() { return TiCC::UnicodeToUTF8(us); }
  std::string Token::typetostring() { return TiCC::UnicodeToUTF8(type); }

  ostream& operator<<( std::ostream& os, const Token& t ){
    os << t.type << " : " << t.role << ":" << t.us << " (" << t.lang_code << ")";
    return os;
  }

  UnicodeString toUString( const TokenRole& tok ){
    UnicodeString result;
    if ( tok & NOSPACE ){
      result += "NOSPACE ";
    }
    if ( tok & BEGINOFSENTENCE ) {
      result += "BEGINOFSENTENCE ";
    }
    if ( tok & ENDOFSENTENCE ) {
      result += "ENDOFSENTENCE ";
    }
    if ( tok & NEWPARAGRAPH ) {
      result += "NEWPARAGRAPH ";
    }
    if ( tok & BEGINQUOTE ) {
      result += "BEGINQUOTE ";
    }
    if ( tok & ENDQUOTE ) {
      result += "ENDQUOTE ";
    }
    return result;
  }

  ostream& operator<<( ostream& os, const TokenRole& tok ){
    os << toUString( tok );
    return os;
  }

  TokenizerClass::TokenizerClass():
    linenum(0),
    inputEncoding( "UTF-8" ),
    eosmark("<utt>"),
    tokDebug(0),
    verbose(false),
    detectQuotes(false),
    doFilter(true),
    doPunctFilter(false),
    doWordCorrection(true),
    splitOnly( false ),
    detectPar(true),
    paragraphsignal(true),
    paragraphsignal_next(false),
    doDetectLang(false),
    text_redundancy("minimal"),
    sentenceperlineoutput(false),
    sentenceperlineinput(false),
    lowercase(false),
    uppercase(false),
    xmlout(false),
    xmlin(false),
    passthru(false),
    ignore_tag_hints(false),
    ucto_processor(0),
    already_tokenized(false),
    inputclass("current"),
    outputclass("current"),
    text_cat( 0 )
  {
    theErrLog = new TiCC::LogStream(cerr, "ucto" );
    theErrLog->setstamp( StampMessage );
#ifdef HAVE_TEXTCAT
    string textcat_cfg = string(SYSCONF_PATH) + "/ucto/textcat.cfg";
    text_cat = new TextCat( textcat_cfg, theErrLog );
    //    text_cat->set_debug( true );
    LOG << " textcat configured from: " << textcat_cfg << endl;
    // ifstream is( textcat_cfg );
    // string line;
    // while ( getline( is, line ) ){
    //   LOG << line << endl;
    //   vector<string> v = TiCC::split( line );
    //   if ( v.size()==2 && v[1] == "nld" ){
    //     LOG << "for Dutch: " << endl;
    //     ifstream is2( v[0] );
    //     string line2;
    //     while ( getline( is2, line2 ) ){
    //       LOG << line2 << endl;
    //       break;
    //     }
    //     LOG << "   done with Dutch" << endl;
    //   }
    // }
#else
    LOG << "NO TEXTCAT SUPPORT!" << endl;
#endif
  }

  TokenizerClass::~TokenizerClass(){
    // the 'default' entry may alias a real 'language' entry,
    // so collect the unique pointers first, to avoid deleting one twice
    set<Setting*> unique_settings;
    for ( const auto& s : settings ){
      unique_settings.insert( s.second );
    }
    for ( const auto& s : unique_settings ){
      delete s;
    }
    delete theErrLog;
    delete text_cat;
  }

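  /// clear the token buffer and per-document state, so the tokenizer can be
  /// reused on new input. The quote stack of the given language is cleared too.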
  bool TokenizerClass::reset( const string& lang ){
    ucto_processor = 0;
    already_tokenized = false;
    tokens.clear();
    if ( settings.find(lang) != settings.end() ){
      settings[lang]->quotes.clearStack();
    }
    return true;
  }

  bool TokenizerClass::setNormSet( const std::string& values ){
    vector<string> parts = TiCC::split_at( values, "," );
    for ( const auto& val : parts ){
      norm_set.insert( TiCC::UnicodeFromUTF8( val ) );
    }
    return true;
  }

  void TokenizerClass::setErrorLog( TiCC::LogStream *os ) {
    if ( theErrLog != os ){
      if ( text_cat ){
        text_cat->set_debug_stream( os );
      }
      delete theErrLog;
    }
    theErrLog = os;
  }

  string TokenizerClass::setInputEncoding( const std::string& enc ){
    string old = inputEncoding;
    inputEncoding = enc;
    return old;
  }

  string TokenizerClass::setTextRedundancy( const std::string& tr ){
    if ( tr == "none" || tr == "minimal" || tr == "full" ){
      string s = text_redundancy;
      text_redundancy = tr;
      return s;
    }
    else {
      throw runtime_error( "illegal value '" + tr + "' for textredundancy. "
                           "expected 'full', 'minimal' or 'none'." );
    }
  }

  bool TokenizerClass::set_tc_debug( bool b ){
    if ( !text_cat ){
      throw logic_error( "attempt to set debug on uninitialized TextCat object" );
    }
    else {
      return text_cat->set_debug( b );
    }
  }

  string fixup_UTF16( const string& input_line, const string& encoding ){
    string line = input_line;
    // some hackery to handle exotic input: UTF-16, but also a CR at the end
    string::size_type pos = line.rfind( '\r' );
    if ( pos != string::npos ){
      line.erase( pos );
    }
    if ( line.size() > 0 && line[0] == 0 ){
      // when processing UTF16LE, '0' bytes show up at pos 0
      // we discard them, but not for UTF16BE!
      // this works on Linux with GCC (atm)
      if ( encoding != "UTF16BE" ){
        line.erase(0,1);
      }
    }
    if ( line.size() > 0 && encoding == "UTF16BE"
         && line.back() == 0 ){
      // when processing UTF16BE, '0' bytes show up at the end
      // we discard them.
      // this works on Linux with GCC (atm)
      line.erase(line.size()-1);
    }
    return line;
  }

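  /// get or create the 'ucto' processor in the provenance data of 'doc'.
  /// When the document has been tokenized by ucto before, we flag that in
  /// 'already_tokenized' and return the existing processor.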
  folia::processor *TokenizerClass::init_provenance( folia::Document *doc,
                                                     folia::processor *parent ) const {
    if ( ucto_processor ){
      // already created
      if ( tokDebug > 0 ){
        LOG << "use already created processor: " << ucto_processor->id() << endl;
      }
      return ucto_processor;
    }
    vector<folia::processor *> procs = doc->get_processors_by_name( "ucto" );
    if ( !procs.empty() ){
      if ( procs.size() > 1 ){
        LOG << "ucto is very confused about '" << doc->filename() << "'\n"
            << "Multiple 'ucto' processors have already been run?" << endl;
        exit( EXIT_FAILURE );
      }
      // ucto has been used on this document before; we can't do it
      // completely over again!
      LOG << "Difficult to tokenize '" << doc->filename()
          << "' again, already processed by ucto before!" << endl;
      LOG << " The document will be copied as-is to the output file" << endl;
      already_tokenized = true;
      return procs[0];
    }
    else {
      folia::KWargs args;
      args["name"] = "ucto";
      args["generate_id"] = "auto()";
      args["version"] = PACKAGE_VERSION;
      args["command"] = _command;
      args["begindatetime"] = "now()";
      if ( parent ){
        ucto_processor = doc->add_processor( args, parent );
      }
      else {
        args["generator"] = "yes";
        ucto_processor = doc->add_processor( args );
        ucto_processor->get_system_defaults();
      }
      if ( tokDebug > 0 ){
        LOG << "created a new processor: " << ucto_processor->id() << endl;
      }
      return ucto_processor;
    }
  }

  folia::processor *TokenizerClass::add_provenance_passthru( folia::Document *doc,
                                                             folia::processor *parent ) const {
    folia::processor *proc = init_provenance( doc, parent );
    if ( proc ){
      folia::KWargs args;
      args["processor"] = proc->id();
      doc->declare( folia::AnnotationType::TOKEN, "passthru", args );
    }
    return proc;
  }

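  /// add a 'uctodata' datasource processor under the main ucto processor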
  folia::processor *TokenizerClass::add_provenance_data( folia::Document *doc,
                                                         folia::processor* parent ) const {
    folia::processor *proc = init_provenance( doc, parent );
    if ( proc ){
      if ( !ucto_re_run() ){
        string id = "ucto.1.1";
        folia::processor *data_proc = doc->get_processor( id );
        if ( !data_proc ){
          folia::KWargs args;
          args["name"] = "uctodata";
          args["generate_id"] = "auto()";
          args["type"] = "datasource";
          args["version"] = data_version;
          data_proc = doc->add_processor( args, proc );
        }
        return data_proc;
      }
      else {
        return proc;
      }
    }
    else {
      return 0;
    }
  }

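  /// declare structure annotation of the given 'type' in 'doc', or reuse
  /// an already present declaration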
  folia::processor *TokenizerClass::add_provenance_structure( folia::Document *doc,
                                                              const folia::AnnotationType type,
                                                              folia::processor *parent ) const {
    folia::processor *proc = init_provenance( doc, parent );
    if ( proc && !ucto_re_run() ){
      if ( !doc->declared( type ) ){
        // we can declare it
        folia::KWargs args;
        args["processor"] = proc->id();
        doc->declare( type, "None", args );
        if ( tokDebug > 3 ){
          LOG << "added " << folia::toString(type) << "-annotation for: '"
              << proc->id() << "'" << endl;
        }
      }
      else {
        string proc_id = doc->default_processor(type);
        if ( !proc_id.empty() ){
          proc = doc->get_processor(proc_id);
          if ( tokDebug ){
            LOG << "REUSE  " << folia::toString(type) << "-annotation for: '"
                << proc->id() << "' with set=" << doc->default_set(type) << endl;
          }
        }
        else {
          proc = 0;
          if ( tokDebug ){
            LOG << "REUSE  " << folia::toString(type) << "-annotation"
                << " with set=" << doc->default_set(type) << endl;
          }
        }
      }
    }
    return proc;
  }

  folia::processor *TokenizerClass::add_provenance_structure( folia::Document *doc,
                                                              folia::processor *parent ) const {
    folia::processor *res = 0;
    add_provenance_structure( doc,
                              folia::AnnotationType::PARAGRAPH, parent );
    add_provenance_structure( doc,
                              folia::AnnotationType::SENTENCE, parent );
    res = add_provenance_structure( doc,
                                    folia::AnnotationType::QUOTE, parent );
    return res;
  }

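  /// add provenance information and token-annotation declarations for
  /// every loaded language setting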
  folia::processor *TokenizerClass::add_provenance_setting( folia::Document *doc,
                                                            folia::processor *parent ) const {
    folia::processor *proc = init_provenance( doc, parent );
    if ( proc && !ucto_re_run() ){
      folia::processor *data_proc = add_provenance_data( doc, parent );
      if ( doc->metadata_type() == "native" ){
        doc->set_metadata( "language", default_language );
      }
      for ( const auto& s : settings ){
        if ( tokDebug > 3 ){
          LOG << "language: " << s.first << endl;
        }
        if ( s.first == "default" ){
          continue;
        }
        folia::KWargs args;
        args["name"] = s.second->set_file;
        args["generate_id"] = "next()";
        args["type"] = "datasource";
        args["version"] = s.second->version;
        doc->add_processor( args, data_proc );
        args.clear();
        args["processor"] = proc->id();
        string alias = "tokconfig-" + s.first;
        string ucto_set = UCTO_SET_PREFIX + alias + ".foliaset.ttl";
        args["alias"] = alias;
        if ( doc->declared( folia::AnnotationType::TOKEN, alias ) ){
          // we assume that an old-style declaration is present
          doc->un_declare( folia::AnnotationType::TOKEN, alias );
        }
        doc->declare( folia::AnnotationType::TOKEN, ucto_set, args );
        if ( tokDebug > 3 ){
          LOG << "added processor and token-annotation for: '"
              << alias << "'" << endl;
        }
      }
      return data_proc;
    }
    else {
      return 0;
    }
  }

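  /// create a new FoLiA document with the given id, including provenance
  /// information and an empty Text node as root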
  folia::Document *TokenizerClass::start_document( const string& id ) const {
    folia::Document *doc = new folia::Document( "xml:id='" + id + "'" );
    doc->addStyle( "text/xsl", "folia.xsl" );
    if ( tokDebug > 3 ){
      LOG << "start document!!!" << endl;
    }
    if ( passthru ){
      add_provenance_passthru( doc );
    }
    else {
      add_provenance_setting( doc );
    }
    folia::KWargs args;
    args["xml:id"] = doc->id() + ".text";
    doc->create_root<folia::Text>( args );
    return doc;
  }

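  /// tokenize one line of input, optionally detecting its language first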
  void TokenizerClass::tokenize_one_line( const UnicodeString& input_line,
                                          bool& bos,
                                          const string& lang ){
    if ( passthru ){
      passthruLine( input_line, bos );
    }
    else {
      string language = lang;
      if ( language.empty() ){
        if ( tokDebug > 3 ){
          LOG << "should we guess the language? "
              << (text_cat && doDetectLang) << endl;
        }
        if ( text_cat && doDetectLang ){
          UnicodeString temp = input_line;
          temp.findAndReplace( eosmark, "" );
          temp.toLower();
          if ( tokDebug > 3 ){
            LOG << "use textCat to guess language from: "
                << temp << endl;
          }
          language = text_cat->get_language( TiCC::UnicodeToUTF8(temp) );
          if ( settings.find( language ) != settings.end() ){
            if ( tokDebug > 3 ){
              LOG << "found a supported language: " << language << endl;
            }
          }
          else {
            if ( tokDebug > 3 ){
              LOG << "found an unsupported language: " << language << endl;
            }
            language = "default";
          }
        }
      }
      internal_tokenize_line( input_line, language );
    }
  }

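  /// return the next complete sentence from the token buffer, reading more
  /// lines from 'IN' when needed. Returns an empty vector when the input
  /// is exhausted.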
  vector<Token> TokenizerClass::tokenizeOneSentence( istream& IN ){
    if ( tokDebug > 0 ) {
      LOG << "[tokenizeOneSentence()] before countSent " << endl;
    }
    int numS = countSentences(); //count full sentences in token buffer
    if ( numS > 0 ) { // still some sentences in the buffer
      if ( tokDebug > 0 ) {
        LOG << "[tokenizeOneSentence] " << numS
            << " sentence(s) in buffer, processing..." << endl;
      }
      return popSentence();
    }
    if ( tokDebug > 0 ) {
      LOG << "[tokenizeOneSentence] NO sentences in buffer, searching.." << endl;
    }
    bool done = false;
    bool bos = true;
    inputEncoding = checkBOM( IN );
    string line;
    do {
      done = !getline( IN, line );
      UnicodeString input_line;
      if ( !done ){
        ++linenum;
        if ( tokDebug > 0 ) {
          LOG << "[tokenize] Read input line " << linenum
              << "-: '" << TiCC::format_nonascii( line ) << "'" << endl;
        }
        string tmp_line = fixup_UTF16( line, inputEncoding );
        if ( tokDebug > 0
             && tmp_line != line ){
          LOG << "After fixup, input_line= '"
              << TiCC::format_nonascii( tmp_line ) << "'" << endl;
        }
        input_line = convert( tmp_line, inputEncoding );
        if ( sentenceperlineinput ){
          input_line += " " + eosmark;
        }
      }
      if ( tokDebug > 0 ) {
        LOG << "[tokenizeOneSentence] before next countSentences " << endl;
      }
      if ( done || input_line.isEmpty() ){
        //Signal the tokenizer that a paragraph is detected
        paragraphsignal = true;
        numS = countSentences(true); //count full sentences in token buffer,
        // setting an explicit END_OF_SENTENCE
      }
      else {
        tokenize_one_line( input_line, bos );
        numS = countSentences(); //count full sentences in token buffer
      }
      if ( numS > 0 ) {
        // 1 or more sentences in the buffer.
        // extract the first one
        if ( tokDebug > 0 ) {
          LOG << "[tokenizeOneSentence] " << numS
              << " sentence(s) in buffer, processing first one..." << endl;
        }
        return popSentence();
      }
      else {
        if ( tokDebug > 0 ) {
          LOG << "[tokenizeOneSentence] No sentence yet, reading on..." << endl;
        }
      }
    } while (!done);
    vector<Token> result;
    return result;
  }

  void appendText( folia::FoliaElement *root,
                   const string& outputclass ){
    // set the textcontent of root to that of its children
    if ( !root ){
      throw logic_error( "appendText() on empty root" );
    }
    if ( root->hastext( outputclass ) ){
      // there is already text, bail out.
      return;
    }
    if ( root->isSubClass( folia::Linebreak_t ) ){
      // exception
      return;
    }
    UnicodeString utxt = root->text( outputclass );
    // so get the untokenized text from the children, and set it
    root->settext( TiCC::UnicodeToUTF8(utxt), outputclass );
  }

  void removeText( folia::FoliaElement *root,
                   const string& outputclass ){
    if ( !root ){
      throw logic_error( "removeText() on empty root" );
    }
    // remove the textcontent in outputclass of root
    root->clear_textcontent( outputclass );
  }

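  /// tokenize a complete (non-FoLiA) input stream into a new FoLiA document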
  folia::Document *TokenizerClass::tokenize( istream& IN ) {
    inputEncoding = checkBOM( IN );
    folia::Document *doc = start_document( docid );
    folia::FoliaElement *root = doc->doc()->index(0);
    int parCount = 0;
    vector<Token> buffer;
    do {
      if ( tokDebug > 0 ){
        LOG << "[tokenize] looping on stream" << endl;
      }
      vector<Token> v = tokenizeOneSentence( IN );
      if ( !v.empty() ){
        if ( tokDebug > 1 ){
          LOG << "[tokenize] sentence=" << v << endl;
        }
        root = append_to_folia( root, v, parCount );
      }
    }
    while ( IN );
    if ( tokDebug > 0 ){
      LOG << "[tokenize] end of stream reached" << endl;
    }
    if ( !buffer.empty() ){
      if ( tokDebug > 1 ){
        LOG << "[tokenize] remainder=" << buffer << endl;
      }
      append_to_folia( root, buffer, parCount );
    }
    // make sure to set the text on the last root created
    if ( text_redundancy == "full" ){
      appendText( root, outputclass );
    }
    else if ( text_redundancy == "none" ){
      removeText( root, outputclass );
    }
    return doc;
  }

  void TokenizerClass::tokenize( const string& ifile, const string& ofile ){
    ostream *OUT = NULL;
    if ( ofile.empty() ){
      OUT = &cout;
    }
    else {
      OUT = new ofstream( ofile );
    }

    istream *IN = NULL;
    if ( xmlin ){
      folia::Document *doc = tokenize_folia( ifile );
      if ( doc ){
        *OUT << *doc;
        OUT->flush();
        delete doc;
      }
    }
    else {
      if ( ifile.empty() ){
        IN = &cin;
      }
      else {
        IN = new ifstream( ifile );
        if ( !IN || !IN->good() ){
          cerr << "ucto: problems opening inputfile " << ifile << endl;
          cerr << "ucto: Courageously refusing to start..." << endl;
          throw runtime_error( "unable to find or read file: '" + ifile + "'" );
        }
      }
      this->tokenize( *IN, *OUT );
    }
    if ( IN != &cin ) delete IN;
    if ( OUT != &cout ) delete OUT;
  }

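  /// tokenize stream 'IN' to stream 'OUT', handling FoLiA output,
  /// interactive (readline) input and plain streams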
  void TokenizerClass::tokenize( istream& IN, ostream& OUT ) {
    if ( xmlout ) {
      folia::Document *doc = tokenize( IN );
      OUT << *doc;
      OUT.flush();
      delete doc;
    }
#ifdef DO_READLINE
    else if ( &IN == &cin && isatty(0) ){
      // interactive use on a terminal (quite a hack..)
      const char *prompt = "ucto> ";
      string line;
      int i = 0;
      while ( true ){
        string data;
        char *input = readline( prompt );
        if ( !input ){
          break;
        }
        line = input;
        sentenceperlineinput = true;
        if ( line.empty() ){
          free( input );
          continue;
        }
        else {
          add_history( input );
          free( input );
          data += line + " ";
        }
        if ( !data.empty() ){
          tokenizeLine( data );
          // extract sentences from the Token vector until done
          vector<Token> v = popSentence();
          while ( !v.empty() ){
            UnicodeString res = outputTokens( v, (i>0) );
            OUT << res;
            ++i;
            v = popSentence();
          }
          OUT << endl;
        }
      }
    }
#endif
    else {
      int i = 0;
      inputEncoding = checkBOM( IN );
      do {
        if ( tokDebug > 0 ){
          LOG << "[tokenize] looping on stream" << endl;
        }
        vector<Token> v = tokenizeOneSentence( IN );
        while ( !v.empty() ){
          UnicodeString res = outputTokens( v, (i>0) );
          OUT << res;
          ++i;
          v = tokenizeOneSentence( IN );
        }
      } while ( IN );
      if ( tokDebug > 0 ){
        LOG << "[tokenize] end_of_stream" << endl;
      }
      OUT << endl;
    }
  }

  void set_language( folia::FoliaElement* node, const string& lang ){
    // set the language of this @node to @lang
    // If a LangAnnotation with a set is already present, we silently
    // keep using that set.
    // Otherwise we add the ISO_SET
    string lang_set = node->doc()->default_set( folia::AnnotationType::LANG );
    if ( lang_set.empty() ){
      lang_set = ISO_SET;
      folia::KWargs args;
      args["processor"] = "ucto.1";
      node->doc()->declare( folia::AnnotationType::LANG,
                            ISO_SET,
                            args );
    }
    folia::KWargs args;
    args["class"] = lang;
    args["set"] = lang_set;
    folia::LangAnnotation *la = new folia::LangAnnotation( args, node->doc() );
    node->replace( la );
  }

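  /// return the xml:id of 'el', or that of its nearest ancestor that has one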
  string get_parent_id( folia::FoliaElement *el ){
    if ( !el ){
      return "";
    }
    else if ( !el->id().empty() ){
      return el->id();
    }
    else {
      return get_parent_id( el->parent() );
    }
  }

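  /// append the tokens in 'toks' as folia::Word nodes to the Sentence 'sent',
  /// creating embedded Quote and Sentence nodes as dictated by the token
  /// roles. Returns the newly created Words.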
  vector<folia::Word*> TokenizerClass::append_to_sentence( folia::Sentence *sent,
                                                           const vector<Token>& toks ) const {
    vector<folia::Word*> result;
    folia::Document *doc = sent->doc();
    string tok_set;
    if ( passthru ){
      tok_set = "passthru";
    }
    else {
      string tc_lc = get_language( toks );
      if ( tc_lc != "default" ){
        tok_set = "tokconfig-" + tc_lc;
        set_language( sent, tc_lc );
      }
      else {
        tok_set = "tokconfig-" + default_language;
      }
    }
    folia::FoliaElement *root = sent;
    if ( tokDebug > 5 ){
      LOG << "add_words\n" << toks << endl;
    }
    for ( size_t i=0; i < toks.size(); ++i ){
      const auto& tok = toks[i];
      if ( tokDebug > 5 ){
        LOG << "add_result\n" << tok << endl;
      }
      if ( tok.role & BEGINQUOTE ){
        if ( tokDebug > 5 ) {
          LOG << "[add_words] Creating quote element" << endl;
        }
        folia::processor *proc = add_provenance_structure( doc,
                                                           folia::AnnotationType::QUOTE );
        folia::KWargs args;
        string id = get_parent_id(root);
        if ( !id.empty() ){
          args["generate_id"] = id;
        }
        if ( proc ){
          args["processor"] = proc->id();
        }
        args["set"] = doc->default_set( folia::AnnotationType::QUOTE );
        folia::FoliaElement *q = new folia::Quote( args, doc );
        root->append( q );
        // might need a new Sentence
        if ( i+1 < toks.size()
             && toks[i+1].role & BEGINOFSENTENCE ){
          folia::processor *proc2 = add_provenance_structure( doc,
                                                              folia::AnnotationType::SENTENCE );
          folia::KWargs args2;
          string pid = get_parent_id(root);
          if ( !pid.empty() ){
            args2["generate_id"] = pid;
          }
          if ( proc2 ){
            args2["processor"] = proc2->id();
          }
          args2["set"] = doc->default_set( folia::AnnotationType::SENTENCE );
          folia::Sentence *ns = new folia::Sentence( args2, doc );
          q->append( ns );
          root = ns;
        }
        else {
          root = q;
        }
      }
      else if ( (tok.role & BEGINOFSENTENCE)
                && root != sent
                && root->element_id() == folia::Sentence_t ){
        // Ok, another Sentence in a quote
        if ( i > 0 && !(toks[i-1].role & BEGINQUOTE) ){
          // close the current one, and start a new one,
          // except when it was implicitly created by a QUOTE
          if ( tokDebug > 5 ){
            LOG << "[add_words] next embedded sentence" << endl;
          }
          // honour text_redundancy on the Sentence
          if ( text_redundancy == "full" ){
            appendText( root, outputclass );
          }
          else if ( text_redundancy == "none" ){
            removeText( root, outputclass );
          }
          root = root->parent();
          folia::processor *proc = add_provenance_structure( doc,
                                                             folia::AnnotationType::SENTENCE );
          folia::KWargs args;
          string id = get_parent_id(root);
          if ( !id.empty() ){
            args["generate_id"] = id;
          }
          if ( proc ){
            args["processor"] = proc->id();
          }
          args["set"] = doc->default_set( folia::AnnotationType::SENTENCE );
          folia::Sentence *ns = new folia::Sentence( args, doc );
          root->append( ns );
          root = ns;
        }
      }
      folia::KWargs args;
      string ids = get_parent_id( root );
      if ( !ids.empty() ){
        args["generate_id"] = ids;
      }
      args["class"] = TiCC::UnicodeToUTF8(tok.type);
      if ( tok.role & NOSPACE ){
        args["space"] = "no";
      }
      if ( outputclass != "current" ){
        args["textclass"] = outputclass;
      }
      args["set"] = tok_set;
#pragma omp critical (foliaupdate)
      {
        UnicodeString ws = tok.us;
        if ( lowercase ) {
          ws = ws.toLower();
        }
        else if ( uppercase ) {
          ws = ws.toUpper();
        }
        if ( tokDebug > 5 ){
          LOG << "create Word(" << args << ") = " << ws << endl;
        }
        folia::Word *w;
        try {
          w = new folia::Word( args, doc );
        }
        catch ( const exception& e ){
          cerr << "Word(" << args << ") creation failed: " << e.what() << endl;
          exit(EXIT_FAILURE);
        }
        result.push_back( w );
        w->setutext( ws, outputclass );
        if ( tokDebug > 5 ){
          LOG << "add_result, created a word: " << w << "(" << ws << ")" << endl;
        }
        root->append( w );
      }
      if ( tok.role & ENDQUOTE ){
        if ( i > 0
             && toks[i-1].role & ENDOFSENTENCE ){
          // end of quote implies an embedded Sentence ended too
          if ( tokDebug > 5 ){
            LOG << "[add_words] End of quote" << endl;
          }
          // honour text_redundancy on the Sentence
          if ( text_redundancy == "full" ){
            appendText( root->parent(), outputclass );
          }
          else if ( text_redundancy == "none" ){
            removeText( root->parent(), outputclass );
          }
          root = root->parent()->parent(); // so close the Sentence too
        }
        else {
          root = root->parent();
        }
      }
    }
    if ( text_redundancy == "full" ){
      appendText( sent, outputclass );
    }
    else if ( text_redundancy == "none" ){
      removeText( sent, outputclass );
    }
    return result;
  }

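  /// append one tokenized sentence to the FoLiA tree under 'root', starting
  /// a new Paragraph when the first token signals one. Returns the node to
  /// use as root for the next sentence.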
  folia::FoliaElement *TokenizerClass::append_to_folia( folia::FoliaElement *root,
                                                        const vector<Token>& tv,
                                                        int& p_count ) const {
    if ( !root || !root->doc() ){
      throw logic_error( "missing root" );
    }
    if ( tokDebug > 5 ){
      LOG << "append_to_folia, root = " << root << endl;
      LOG << "tokens=\n" << tv << endl;
    }
    if ( (tv[0].role & NEWPARAGRAPH) ) {
      if ( tokDebug > 5 ){
        LOG << "append_to_folia, NEW paragraph " << endl;
      }
      folia::processor *proc = add_provenance_structure( root->doc(),
                                                         folia::AnnotationType::PARAGRAPH );
      folia::KWargs args;
      if ( proc ){
        args["processor"] = proc->id();
      }
      args["set"] = root->doc()->default_set( folia::AnnotationType::PARAGRAPH );
      args["xml:id"] = root->doc()->id() + ".p." + TiCC::toString(++p_count);
      folia::Paragraph *p = new folia::Paragraph( args, root->doc() );
      if ( root->element_id() == folia::Text_t ){
        if ( tokDebug > 5 ){
          LOG << "append_to_folia, add paragraph to Text" << endl;
        }
        root->append( p );
      }
      else {
        // root is a paragraph, which is done now.
        if ( text_redundancy == "full" ){
          root->settext( root->str(outputclass), outputclass );
        }
        if ( tokDebug > 5 ){
          LOG << "append_to_folia, add paragraph to parent of " << root << endl;
        }
        root = root->parent();
        root->append( p );
      }
      root = p;
    }
    folia::processor *proc = add_provenance_structure( root->doc(),
                                                       folia::AnnotationType::SENTENCE );
    folia::KWargs args;
    if ( proc ){
      args["processor"] = proc->id();
    }
    args["set"] = root->doc()->default_set( folia::AnnotationType::SENTENCE );
    args["generate_id"] = root->id();
    folia::Sentence *s = new folia::Sentence( args, root->doc() );
    root->append( s );
    if ( tokDebug > 5 ){
      LOG << "append_to_folia, created Sentence" << s << endl;
    }
    append_to_sentence( s, tv );
    return root;
  }

  UnicodeString handle_token_tag( const folia::FoliaElement *d,
                                  const folia::TextPolicy& tp ){
    /// a handler that is passed on to libfolia to handle special tag="token"
    /// nodes
    /*!
      \param d The FoliaElement that libfolia hands us
      \param tp The TextPolicy at hand. This function has been registered in
      \em tp
      \return a UnicodeString which we mark specially, so that we know
      that this string is to be handled as a separate token

      This function will be called by libfolia's text() functions on
      encountering a tag="token" attribute in a TextContent.
      It has to be registered in \em tp
     */
    UnicodeString tmp_result = text( d, tp );
    tmp_result = u'\u200D' + tmp_result; // wrap in ZERO WIDTH JOINER markers
    tmp_result += u'\u200D';
    return tmp_result;
  }

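  /// replace element 'orig' by the tokenization result in 'toks', recording
  /// the replacement as a folia::Correction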
  void TokenizerClass::correct_element( folia::FoliaElement *orig,
                                        const vector<Token>& toks,
                                        const string& tok_set ) const {
    vector<folia::FoliaElement*> sV; // suggestions
    vector<folia::FoliaElement*> cV; // current
    vector<folia::FoliaElement*> oV; // original
    vector<folia::FoliaElement*> nV; // new
    // Original element
    oV.push_back( orig );
    // Add the edits
    for ( const auto& tok : toks ){
      // New elements
      folia::KWargs args;
      args["xml:id"] = orig->generateId( "tokenized" );
      args["class"] = TiCC::UnicodeToUTF8(tok.type);
      if ( tok.role & NOSPACE ){
        args["space"] = "no";
      }
      if ( outputclass != "current" ){
        args["textclass"] = outputclass;
      }
      args["set"] = tok_set;
#pragma omp critical (foliaupdate)
      {
        UnicodeString ws = tok.us;
        if ( lowercase ) {
          ws = ws.toLower();
        }
        else if ( uppercase ) {
          ws = ws.toUpper();
        }
        if ( tokDebug > 5 ){
          LOG << "create Word(" << args << ") = " << ws << endl;
        }
        folia::FoliaElement *new_elt;
        try {
          new_elt = folia::AbstractElement::createElement( orig->element_id(),
                                                           orig->doc() );
          new_elt->setAttributes( args );
        }
        catch ( const exception& e ){
          cerr << "Word(" << args << ") creation failed: " << e.what() << endl;
          exit(EXIT_FAILURE);
        }
        new_elt->setutext( ws, outputclass );
        if ( tokDebug > 5 ){
          LOG << "add_result, created: " << new_elt << "(" << ws << ")" << endl;
        }
        nV.push_back( new_elt );
      }
    }
    folia::KWargs no_args;
    no_args["processor"] = ucto_processor->id();
    no_args["set"] = tok_set;
    folia::Correction *c = orig->parent()->correct( oV, cV, nV, sV, no_args );
    if ( tokDebug > 2 ){
      LOG << "created: " << c->xmlstring() << endl;
    }
    else if ( tokDebug > 0 ){
      LOG << "created: " << c << endl;
    }
  }

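  /// re-tokenize the given elements and record the results as Corrections.
  /// Returns all new tokens, or an empty vector when the language of 'e'
  /// is not supported.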
  vector<Token> TokenizerClass::correct_elements( folia::FoliaElement *e,
                                                  const vector<folia::FoliaElement*>& wv ) {
    vector<Token> result;
    // correct only when the sentence is in the desired language
    string s_la;
    if ( e->has_annotation<folia::LangAnnotation>() ){
      s_la = e->annotation<folia::LangAnnotation>()->cls();
    }
    if ( !s_la.empty() && settings.find(s_la) == settings.end() ){
      // the Sentence already has a language code, and it
      // is NOT what we search for.
      // just ignore it
      if ( tokDebug > 0 ){
        LOG << "skip FoLiA element " << e->id() << " with unsupported language "
            << s_la << endl;
      }
      return result;
    }
    string tok_set;
    if ( !s_la.empty() ){
      tok_set = "tokconfig-" + s_la;
    }
    else {
      tok_set = "tokconfig-" + default_language;
    }
    folia::KWargs args;
    args["processor"] = ucto_processor->id();
    e->doc()->declare( folia::AnnotationType::CORRECTION, tok_set, args );
    for ( auto w : wv ){
      string text = w->str( text_policy );
      if ( tokDebug > 0 ){
        LOG << "correct_elements() text='" << text << "'" << endl;
      }
      tokenizeLine( text );
      vector<Token> sent = popSentence();
      while ( sent.size() > 0 ){
        sent.front().role &= ~BEGINOFSENTENCE;
        sent.back().role &= ~ENDOFSENTENCE;
        result.insert( result.end(), sent.begin(), sent.end() );
        correct_element( w, sent, tok_set );
        sent = popSentence();
      }
    }
    if ( !result.empty() ){
      result.front().role |= BEGINOFSENTENCE;
      result.back().role |= ENDOFSENTENCE;
    }
    return result;
  }

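  /// tokenize the text of one FoLiA Sentence. Existing Words are corrected
  /// (when word correction is enabled), otherwise new Words are added from
  /// the untokenized text.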
  void TokenizerClass::handle_one_sentence( folia::Sentence *s,
                                            int& sentence_done ){
    // check feasibility
    if ( tokDebug > 1 ){
      LOG << "handle_one_sentence: " << s << endl;
    }
    if ( inputclass != outputclass && outputclass == "current" ){
      if ( s->hastext( outputclass ) ){
        throw uLogicError( "cannot set text with class='current' on node "
                           + s->id() +
                           " because it already has text in that class." );
      }
    }
    vector<folia::Word *> wv = s->words( inputclass );
    if ( wv.empty() ){
      wv = s->words();
    }
    if ( !wv.empty() ){
      // there are already words.
      if ( doWordCorrection ){
        // we are allowed to correct those
        vector<folia::FoliaElement*> ev(wv.begin(),wv.end());
        if ( !correct_elements( s, ev ).empty() ){
          ++sentence_done;
        }
      }
    }
    else {
      string s_la;
      if ( s->has_annotation<folia::LangAnnotation>() ){
        s_la = s->annotation<folia::LangAnnotation>()->cls();
      }
      if ( !s_la.empty() && settings.find(s_la) == settings.end() ){
        // the Sentence already has a language code, and it
        // is NOT what we search for.
        // just ignore it
        if ( tokDebug > 0 ){
          LOG << "skip sentence " << s->id() << " with unsupported language "
              << s_la << endl;
        }
        return;
      }
      string text = s->str( text_policy );
      if ( tokDebug > 0 ){
        LOG << "handle_one_sentence() from string: '" << text << "'" << endl;
      }
      tokenizeLine( text );
      vector<Token> sent = popSentence();
      while ( sent.size() > 0 ){
        append_to_sentence( s, sent );
        ++sentence_done;
        sent = popSentence();
      }
    }
    if ( text_redundancy == "full" ){
      appendText( s, outputclass );
    }
    else if ( text_redundancy == "none" ){
      removeText( s, outputclass );
    }
  }

  void TokenizerClass::handle_one_paragraph( folia::Paragraph *p,
                                             int& sentence_done ){
    // a Paragraph may contain both Word and Sentence nodes
    // Sentences will be handled
    vector<folia::Sentence*> sv = p->select<folia::Sentence>(false);
    if ( sv.empty() ){
      // No Sentence, so just text or Words
      vector<folia::Word*> wv = p->select<folia::Word>(false);
      if ( !wv.empty() ){
        vector<folia::FoliaElement*> ev( wv.begin(), wv.end() );
        // Words found
        if ( doWordCorrection ){
          if ( !correct_elements( p, ev ).empty() ){
            ++sentence_done;
          }
        }
        // otherwise skip
      }
      else {
        // No Words either, handle text, if any
        string text = p->str( text_policy );
        if ( tokDebug > 0 ){
          LOG << "handle_one_paragraph:" << text << endl;
        }
        tokenizeLine( text );
        vector<Token> toks = popSentence();
        folia::processor *proc = 0;
        while ( !toks.empty() ){
          if ( proc == 0 ){
            proc = add_provenance_structure( p->doc(),
                                             folia::AnnotationType::SENTENCE );
          }
          string p_id = p->id();
          folia::KWargs args;
          if ( proc ){
            args["processor"] = proc->id();
          }
          args["set"] = p->doc()->default_set(folia::AnnotationType::SENTENCE);
          if ( !p_id.empty() ){
            args["generate_id"] = p_id;
          }
          folia::Sentence *s = new folia::Sentence( args, p->doc() );
          p->append( s );
          append_to_sentence( s, toks );
          ++sentence_done;
          toks = popSentence();
        }
      }
    }
    else {
      if ( tokDebug > 1 ){
        LOG << "found some Sentences " << sv << endl;
      }
      // For now we just IGNORE loose words (backward compatibility)
      for ( const auto& s : sv ){
        handle_one_sentence( s, sentence_done );
      }
    }
    if ( text_redundancy == "full" ){
      appendText( p, outputclass );
    }
    else if ( text_redundancy == "none" ){
      removeText( p, outputclass );
    }
  }

  void TokenizerClass::handle_one_text_parent( folia::FoliaElement *e,
                                               int& sentence_done ){
    ///
    /// the input is a FoLiA element @e containing text, directly or deeper
    /// this can be a Word, Sentence, Paragraph or some other element
    /// In the latter case, we construct a Sentence from the text, and
    /// a Paragraph if more than one Sentence is found
    ///
    if ( inputclass != outputclass && outputclass == "current" ){
      if ( e->hastext( outputclass ) ){
        throw uLogicError( "cannot set text with class='current' on node "
                           + e->id() +
                           " because it already has text in that class." );
      }
    }
    if ( e->xmltag() == "w" ){
      // SKIP! already tokenized into words!
    }
    else if ( e->xmltag() == "s" ){
      // OK, a text in a sentence
      if ( tokDebug > 2 ){
        LOG << "found text in a sentence " << e << endl;
      }
      handle_one_sentence( dynamic_cast<folia::Sentence*>(e),
                           ++sentence_done );
    }
    else if ( e->xmltag() == "p" ){
      // OK, a longer text in some paragraph, maybe more sentences
      if ( tokDebug > 2 ){
        LOG << "found text in a paragraph " << e << endl;
      }
      handle_one_paragraph( dynamic_cast<folia::Paragraph*>(e),
                            sentence_done );
    }
    else {
      // Some text outside words, paragraphs or sentences (yet)
      // maybe <div> or <note> or such
      // there may be embedded Paragraph, Word and Sentence nodes
      // if so, Paragraphs and Sentences should be handled separately
      vector<folia::Sentence*> sv = e->select<folia::Sentence>(false);
      vector<folia::Paragraph*> pv = e->select<folia::Paragraph>(false);
      if ( pv.empty() && sv.empty() ){
        // just words or text
        string text = e->str( text_policy );
        if ( tokDebug > 1 ){
          LOG << "tok-" << e->xmltag() << ":" << text << endl;
        }
        tokenizeLine( text );
        vector<vector<Token>> sents;
        vector<Token> toks = popSentence();
        while ( toks.size() > 0 ){
          sents.push_back( toks );
          toks = popSentence();
        }
        if ( sents.size() == 0 ){
          // can happen in very rare cases (strange spaces in the input)
          // SKIP!
        }
        else if ( sents.size() > 1 ){
          // multiple sentences. We need an extra Paragraph.
          // But first check if this is allowed!
          folia::FoliaElement *rt;
          if ( e->acceptable(folia::Paragraph_t) ){
            folia::KWargs args;
            string e_id = e->id();
            if ( !e_id.empty() ){
              args["generate_id"] = e_id;
            }
            folia::processor *proc = add_provenance_structure( e->doc(),
                                                               folia::AnnotationType::PARAGRAPH );
            if ( proc ){
              args["processor"] = proc->id();
            }
            args["set"] = e->doc()->default_set( folia::AnnotationType::PARAGRAPH );
            folia::Paragraph *p = new folia::Paragraph( args, e->doc() );
            e->append( p );
            rt = p;
          }
          else {
            rt = e;
          }
          for ( const auto& sent : sents ){
            folia::KWargs args;
            string p_id = rt->id();
            if ( !p_id.empty() ){
              args["generate_id"] = p_id;
            }
            folia::processor *proc = add_provenance_structure( e->doc(),
                                                               folia::AnnotationType::SENTENCE );
            if ( proc ){
              args["processor"] = proc->id();
            }
            args["set"] = e->doc()->default_set( folia::AnnotationType::SENTENCE );
            folia::Sentence *s = new folia::Sentence( args, e->doc() );
            append_to_sentence( s, sent );
            ++sentence_done;
            if ( tokDebug > 0 ){
              LOG << "created a new sentence: " << s << endl;
            }
            rt->append( s );
          }
        }
        else {
          // 1 sentence, connect directly.
          folia::KWargs args;
          string e_id = e->id();
          if ( e_id.empty() ){
            e_id = e->generateId( e->xmltag() );
            args["xml:id"] = e_id + ".s.1";
          }
          else {
            args["generate_id"] = e_id;
          }
          folia::processor *proc = add_provenance_structure( e->doc(),
                                                             folia::AnnotationType::SENTENCE );
          if ( proc ){
            args["processor"] = proc->id();
          }
          args["set"] = e->doc()->default_set( folia::AnnotationType::SENTENCE );
          folia::Sentence *s = new folia::Sentence( args, e->doc() );
          append_to_sentence( s, sents[0] );
          ++sentence_done;
          if ( tokDebug > 0 ){
            LOG << "created a new sentence: " << s << endl;
          }
          e->append( s );
        }
      }
      else if ( !pv.empty() ){
        if ( tokDebug > 1 ){
          LOG << "found some Paragraphs " << pv << endl;
        }
        // For now we only handle the Paragraphs, ignore sentences and words
        // IS this even valid???
        for ( const auto& p : pv ){
          handle_one_paragraph( p, sentence_done );
        }
      }
      else {
        if ( tokDebug > 1 ){
          LOG << "found some Sentences " << sv << endl;
        }
        // For now we just IGNORE the loose words (backward compatibility)
        for ( const auto& s : sv ){
          handle_one_sentence( s, sentence_done );
        }
      }
    }
    if ( text_redundancy == "full" ){
      appendText( e, outputclass );
    }
    else if ( text_redundancy == "none" ){
      removeText( e, outputclass );
    }
  }

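  /// tokenize a FoLiA document. Returns the modified document, or 0 when it
  /// contains no text in the desired inputclass.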
  folia::Document *TokenizerClass::tokenize_folia( const string& infile_name ){
    if ( inputclass == outputclass
         && !doWordCorrection ){
      LOG << "ucto: --filter=NO is automatically set. inputclass equals outputclass!"
          << endl;
      setFiltering(false);
    }
    text_policy.set_class( inputclass );
    if ( !ignore_tag_hints ){
      text_policy.add_handler( "token", &handle_token_tag );
    }
    folia::TextEngine proc( infile_name );
    if ( passthru ){
      add_provenance_passthru( proc.doc() );
    }
    else {
      add_provenance_setting( proc.doc() );
    }
    if ( tokDebug > 8 ){
      proc.set_dbg_stream( theErrLog );
      proc.set_debug( true );
    }
    //    proc.set_debug( true );
    proc.setup( inputclass, true );
    int sentence_done = 0;
    folia::FoliaElement *p = 0;
    folia::FoliaElement *parent = 0;
    while ( (p = proc.next_text_parent() ) ){
      //      LOG << "next text parent: " << p << endl;
      if ( !parent ){
        parent = p->parent();
        //	LOG << "my parent: " << parent << endl;
      }
      if ( already_tokenized ){
        ++sentence_done;
      }
      else {
        handle_one_text_parent( p, sentence_done );
      }
      if ( tokDebug > 0 ){
        LOG << "done with sentence " << sentence_done << endl;
      }
      if ( proc.next() ){
        if ( tokDebug > 1 ){
          LOG << "looping for more ..." << endl;
        }
      }
    }
    if ( parent ){
      if ( text_redundancy == "full" ){
        appendText( parent, outputclass );
      }
      else if ( text_redundancy == "none" ){
        removeText( parent, outputclass );
      }
    }
    if ( sentence_done == 0 ){
      LOG << "document contains no text in the desired inputclass: "
          << inputclass << endl;
      LOG << "NO result!" << endl;
      return 0;
    }
    return proc.doc(true); // take the doc over from the Engine
  }

  void TokenizerClass::tokenize_folia( const string& infile_name,
                                       const string& outfile_name ){
    if ( tokDebug > 0 ){
      LOG << "[tokenize_folia] (" << infile_name << ","
          << outfile_name << ")" << endl;
    }
    folia::Document *doc = tokenize_folia( infile_name );
    if ( doc ){
      doc->save( outfile_name, false );
      if ( tokDebug > 0 ){
        LOG << "resulting FoLiA doc saved in " << outfile_name << endl;
      }
    }
    else {
      if ( tokDebug > 0 ){
        LOG << "NO FoLiA doc created! " << endl;
      }
    }
  }

outputTokens(const vector<Token> & tokens,const bool continued) const1554   UnicodeString TokenizerClass::outputTokens( const vector<Token>& tokens,
1555 					      const bool continued ) const {
1556     /*!
1557       \param tokens A list of Token's to display
1558       \param continued Set to true when outputTokens is invoked multiple
1559       times and it is not the first invokation
1560 
1561       this makes paragraph boundaries work over multiple calls
1562       \return A UnicodeString representing tokenized lines, including token
1563       information, when verbose mode is on.
1564     */
1565     short quotelevel = 0;
1566     UnicodeString result;
1567     for ( const auto& token : tokens ) {
1568       UnicodeString outline;
1569       if (tokDebug >= 5){
1570 	LOG << "outputTokens: token=" << token << endl;
1571       }
1572       if ( detectPar
1573 	   && (token.role & NEWPARAGRAPH)
1574 	   && !verbose
1575 	   && continued ) {
1576 	//output paragraph separator
1577 	if ( sentenceperlineoutput ) {
1578 	  outline += "\n";
1579 	}
1580 	else {
1581 	  outline += "\n\n";
1582 	}
1583       }
1584       UnicodeString s = token.us;
1585       if (lowercase) {
1586 	s = s.toLower();
1587       }
1588       else if ( uppercase ) {
1589 	s = s.toUpper();
1590       }
1591       outline += s;
1592       if ( token.role & NEWPARAGRAPH ) {
1593 	quotelevel = 0;
1594       }
1595       if ( token.role & BEGINQUOTE ) {
1596 	++quotelevel;
1597       }
1598       if ( verbose ) {
1599 	outline += "\t" + token.type + "\t" + toUString(token.role) + "\n";
1600       }
1601       if ( token.role & ENDQUOTE ) {
1602 	--quotelevel;
1603       }
1604 
1605       if ( token.role & ENDOFSENTENCE ) {
1606 	if ( verbose ) {
1607 	  if ( !(token.role & NOSPACE ) ){
1608 	    outline += "\n";
1609 	  }
1610 	}
1611 	else {
1612 	  if ( quotelevel == 0 ) {
1613 	    if ( sentenceperlineoutput ) {
1614 	      outline += "\n";
1615 	    }
1616 	    else {
1617 	      outline += " " + eosmark + " ";
1618 	    }
1619 	    if ( splitOnly ){
1620 	      outline += "\n";
1621 	    }
1622 	  }
1623 	  else { //inside quotation
1624 	    if ( splitOnly
1625 		 && !(token.role & NOSPACE ) ){
1626 	      outline += " ";
1627 	    }
1628 	  }
1629 	}
1630       }
1631       if ( ( &token != &(*tokens.rbegin()) )
1632 	   && !verbose ) {
1633 	if ( !( (token.role & ENDOFSENTENCE)
1634 		&& sentenceperlineoutput
1635 		&& !splitOnly ) ){
1636 	  if ( !(token.role & ENDOFSENTENCE) ){
1637 	    if ( splitOnly
1638 		 && (token.role & NOSPACE) ){
1639 	    }
1640 	    else {
1641 	      outline += " ";
1642 	    }
1643 	  }
1644 	}
1645 	else if ( (quotelevel > 0)
1646 		  && sentenceperlineoutput ) {
1647 	  //FBK: ADD SPACE WITHIN QUOTE CONTEXT IN ANY CASE
1648 	  outline += " ";
1649 	}
1650       }
1651       if (tokDebug >= 5){
1652 	LOG << "outputTokens: outline=" << outline << endl;
1653       }
1654       result += outline;
1655     }
1656     return result;
1657   }
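
  // In verbose mode each token is emitted on a line of its own as
  // "<text>\t<type>\t<roles>", e.g. (illustrative, with type names as
  // produced by the default data files):
  //
  //   Hallo   WORD          BEGINOFSENTENCE
  //   !       PUNCTUATION   ENDOFSENTENCE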

  int TokenizerClass::countSentences( bool forceentirebuffer ) {
    // Return the number of *completed* sentences in the token buffer.

    // Performs extra sanity checks at the same time, making sure that
    // BEGINOFSENTENCE and ENDOFSENTENCE always pair up, and that
    // TEMPENDOFSENTENCE roles are converted to proper ENDOFSENTENCE markers.

    short quotelevel = 0;
    int count = 0;
    const int size = tokens.size();
    int begin = 0;
    int i = 0;
    for ( auto& token : tokens ) {
      if ( tokDebug >= 5 ){
        LOG << "[countSentences] buffer#" << i
            << " word=[" << token.us
            << "] role=" << token.role
            << ", quotelevel=" << quotelevel << endl;
      }
      if ( token.role & NEWPARAGRAPH ) quotelevel = 0;
      if ( token.role & BEGINQUOTE ) quotelevel++;
      if ( token.role & ENDQUOTE ) quotelevel--;
      if ( forceentirebuffer
           && (token.role & TEMPENDOFSENTENCE)
           && (quotelevel == 0) ) {
        // we thought we were in a quote, but we're not... No end quote was
        // found and an end is forced now.
        // Change TEMPENDOFSENTENCE to ENDOFSENTENCE and make sure the
        // sentences match up sanely.
        token.role &= ~TEMPENDOFSENTENCE;
        token.role |= ENDOFSENTENCE;
      }
      tokens[begin].role |= BEGINOFSENTENCE;  // sanity check
      if ( (token.role & ENDOFSENTENCE)
           && (quotelevel == 0) ) {
        begin = i + 1;
        count++;
        if ( tokDebug >= 5 ){
          LOG << "[countSentences] SENTENCE #" << count << " found" << endl;
        }
      }
      if ( forceentirebuffer
           && ( i == size - 1 )
           && !(token.role & ENDOFSENTENCE) ) {
        // the last token of the buffer
        count++;
        token.role |= ENDOFSENTENCE;
        if ( tokDebug >= 5 ){
          LOG << "[countSentences] SENTENCE #" << count << " *FORCIBLY* ended" << endl;
        }
      }
      ++i;
    }
    if ( tokDebug >= 5 ){
      LOG << "[countSentences] end of loop: returns " << count << endl;
    }
    return count;
  }

  vector<Token> TokenizerClass::popSentence( ) {
    vector<Token> outToks;
    const int size = tokens.size();
    if ( size != 0 ){
      short quotelevel = 0;
      size_t begin = 0;
      for ( int i = 0; i < size; ++i ) {
        if ( tokens[i].role & NEWPARAGRAPH ) {
          quotelevel = 0;
        }
        else if ( tokens[i].role & ENDQUOTE ) {
          --quotelevel;
        }
        if ( (tokens[i].role & BEGINOFSENTENCE)
             && (quotelevel == 0) ) {
          begin = i;
        }
        // FBK: QUOTELEVEL GOES UP BEFORE begin IS UPDATED... RESULTS IN
        // DUPLICATE OUTPUT
        if ( tokens[i].role & BEGINQUOTE ) {
          ++quotelevel;
        }

        if ( (tokens[i].role & ENDOFSENTENCE) && (quotelevel == 0) ) {
          size_t end = i;
          if ( tokDebug >= 1 ){
            LOG << "[tokenize] extracted sentence, begin=" << begin
                << ", end=" << end << endl;
          }
          for ( size_t index = begin; index <= end; ++index ){
            outToks.push_back( tokens[index] );
          }
          tokens.erase( tokens.begin(), tokens.begin()+end+1 );
          if ( !passthru ){
            string lang = get_language( outToks );
            if ( !settings[lang]->quotes.emptyStack() ) {
              settings[lang]->quotes.flushStack( end+1 );
            }
          }
          // we are done...
          return outToks;
        }
      }
    }
    return outToks;
  }
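
  // Typical consumption loop (a sketch; `tok` is a hypothetical tokenizer
  // whose buffer was filled, e.g. by tokenizeLine() calls):
  //
  //   int numS = tok.countSentences( true );
  //   for ( int i = 0; i < numS; ++i ){
  //     std::vector<Tokenizer::Token> sent = tok.popSentence();
  //     std::cout << tok.getUTF8String( sent ) << std::endl;
  //   }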

  UnicodeString TokenizerClass::getString( const vector<Token>& v ){
    if ( !v.empty() ){
      // This only makes sense in non-verbose mode, so force verbose=false
      const bool tv = verbose;
      verbose = false;
      UnicodeString res = outputTokens( v );
      verbose = tv;
      return res;
    }
    return "";
  }

  string TokenizerClass::getUTF8String( const vector<Token>& v ){
    UnicodeString result = getString( v );
    return TiCC::UnicodeToUTF8( result );
  }

  vector<UnicodeString> TokenizerClass::getSentences() {
    vector<UnicodeString> sentences;
    if ( tokDebug > 0 ) {
      LOG << "[getSentences] before countSentences" << endl;
    }
    int numS = countSentences(true); // force the buffer to end with ENDOFSENTENCE
    if ( tokDebug > 0 ) {
      LOG << "[getSentences] found " << numS << " sentence(s)" << endl;
    }
    for ( int i = 0; i < numS; i++ ) {
      vector<Token> v = popSentence( );
      UnicodeString tmp = getString( v );
      sentences.push_back( tmp );
    }
    return sentences;
  }

  vector<string> TokenizerClass::getUTF8Sentences() {
    vector<UnicodeString> uv = getSentences();
    vector<string> result;
    for ( const auto& us : uv ){
      result.push_back( TiCC::UnicodeToUTF8(us) );
    }
    return result;
  }
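
  // The same drain can be written as one call (sketch, same hypothetical
  // `tok` as above):
  //
  //   for ( const auto& s : tok.getUTF8Sentences() ){
  //     std::cout << s << std::endl;
  //   }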

  // FBK: return true if the character is a quote.
  bool TokenizerClass::u_isquote( UChar32 c, const Quoting& quotes ) const {
    bool quote = false;
    if ( u_hasBinaryProperty( c, UCHAR_QUOTATION_MARK )
         || c == '`'
         || c == U'´' ) {
      // M$ users often use the spacing grave and acute accents as a quote
      // (apostrophe), but these DON'T have the UCHAR_QUOTATION_MARK
      // property, so work around that
      quote = true;
    }
    else {
      UnicodeString opening = quotes.lookupOpen( c );
      if ( !opening.isEmpty() ) {
        quote = true;
      }
      else {
        UnicodeString closing = quotes.lookupClose( c );
        if ( !closing.isEmpty() ) {
          quote = true;
        }
      }
    }
    return quote;
  }

  // FBK: USED TO CHECK IF THE CHARACTER AFTER A QUOTE IS A BOS.
  // MOSTLY THE SAME AS ABOVE, EXCEPT WITHOUT THE CHECK FOR PUNCTUATION,
  // BECAUSE: '"Hoera!", zei de man' MUST NOT BE SPLIT ON ','.
  bool is_BOS( UChar32 c ){
    bool is_bos = false;
    UBlockCode s = ublock_getCode(c);
    // test for languages that distinguish case
    if ( (s == UBLOCK_BASIC_LATIN) || (s == UBLOCK_GREEK)
         || (s == UBLOCK_CYRILLIC) || (s == UBLOCK_GEORGIAN)
         || (s == UBLOCK_ARMENIAN) || (s == UBLOCK_DESERET) ) {
      if ( u_isupper(c) || u_istitle(c) ) {
        // the next 'word' starts with an uppercase or titlecase letter
        is_bos = true;
      }
    }
    return is_bos;
  }

  bool TokenizerClass::resolveQuote( int endindex,
                                     const UnicodeString& open,
                                     Quoting& quotes ) {
    // resolve a quote
    int stackindex = -1;
    int beginindex = quotes.lookup( open, stackindex );

    if ( beginindex >= 0 ) {
      if ( tokDebug >= 2 ) {
        LOG << "[resolveQuote] Quote found, begin=" << beginindex << ", end=" << endindex << endl;
      }

      if ( beginindex > endindex ) {
        throw uRangeError( "Begin index for quote is higher than end index!" );
      }

      // We have a quote!

      // resolve sentences within the quote; all sentences must be full sentences:
      int beginsentence = beginindex + 1;
      int expectingend = 0;
      int subquote = 0;
      int size = tokens.size();
      for ( int i = beginsentence; i < endindex; i++ ) {
        if ( tokens[i].role & BEGINQUOTE ) subquote++;

        if ( subquote == 0 ) {
          if ( tokens[i].role & BEGINOFSENTENCE ) expectingend++;
          if ( tokens[i].role & ENDOFSENTENCE ) expectingend--;

          if ( tokens[i].role & TEMPENDOFSENTENCE ) {
            tokens[i].role &= ~TEMPENDOFSENTENCE;
            tokens[i].role |= ENDOFSENTENCE;
            tokens[beginsentence].role |= BEGINOFSENTENCE;
            beginsentence = i + 1;
          }
          // In case of nested quoted sentences, such as:
          //    MvD: "Nou, Van het Gouden Been ofzo herinner ik mij als kind: 'Waar is mijn gouden been?'"
          // the BEGINOFSENTENCE is only set for the inner quoted sentence
          // 'Waar is mijn gouden been?'. However, we also need one for the
          // outer sentence.
        }
        else if ( (tokens[i].role & ENDQUOTE)
                  && (tokens[i].role & ENDOFSENTENCE) ) {
          tokens[beginsentence].role |= BEGINOFSENTENCE;
          beginsentence = i + 1;
        }
        if ( tokens[i].role & ENDQUOTE ) subquote--;
      }
      if ( (expectingend == 0) && (subquote == 0) ) {
        // ok, all good, mark the quote:
        tokens[beginindex].role |= BEGINQUOTE;
        tokens[endindex].role |= ENDQUOTE;
        if ( tokDebug >= 2 ) {
          LOG << "marked BEGIN: " << tokens[beginindex] << endl;
          LOG << "marked   END: " << tokens[endindex] << endl;
        }
      }
      else if ( expectingend == 1
                && subquote == 0
                && !( tokens[endindex - 1].role & ENDOFSENTENCE) ) {
        // missing one ENDOFSENTENCE; we can correct this by making the last
        // token inside the quote an ENDOFSENTENCE:
        if ( tokDebug >= 2 ) {
          LOG << "[resolveQuote] Missing endofsentence in quote, fixing... " << expectingend << endl;
        }
        tokens[endindex - 1].role |= ENDOFSENTENCE;
        // mark the quote
        tokens[beginindex].role |= BEGINQUOTE;
        tokens[endindex].role |= ENDQUOTE;
      }
      else {
        if ( tokDebug >= 2 ) {
          LOG << "[resolveQuote] Quote cannot be resolved, unbalanced sentences or subquotes within quote, skipping... (expectingend=" << expectingend << ",subquote=" << subquote << ")" << endl;
        }
        // something is wrong. The sentences within the quote are not
        // balanced, so we won't mark the quote.
      }
      // remove it from the stack (ok, granted, stack is a bit of a misnomer here)
      quotes.eraseAtPos( stackindex );
      // FBK: ENDQUOTES NEED TO BE MARKED AS ENDOFSENTENCE IF THE PREVIOUS
      // TOKEN WAS AN ENDOFSENTENCE. OTHERWISE THE SENTENCES WILL NOT BE SPLIT.
      if ( tokens[endindex].role & ENDQUOTE
           && tokens[endindex-1].role & ENDOFSENTENCE ) {
        // FBK: CHECK FOR EOS AFTER QUOTES
        if ( (endindex+1 == size) || // FBK: endindex IS THE LAST TOKEN, SO IT MUST BE AN EOS
             ((endindex + 1 < size) && (is_BOS(tokens[endindex+1].us[0]))) ) {
          tokens[endindex].role |= ENDOFSENTENCE;
          // FBK: CHECK IF THE NEXT TOKEN IS A QUOTE AND THE TOKEN AFTER THE QUOTE A BOS
        }
        else if ( endindex + 2 < size
                  && u_isquote( tokens[endindex+1].us[0], quotes )
                  && is_BOS( tokens[endindex+2].us[0] ) ) {
          tokens[endindex].role |= ENDOFSENTENCE;
          // If the current token is an ENDQUOTE and the next token is a
          // quote and also the last token, the current token is an EOS.
        }
        else if ( endindex + 2 == size
                  && u_isquote( tokens[endindex+1].us[0], quotes ) ) {
          tokens[endindex].role |= ENDOFSENTENCE;
        }
      }
      return true;
    }
    else {
      return false;
    }
  }

  bool TokenizerClass::detectEos( size_t i,
                                  const UnicodeString& eosmarkers,
                                  const Quoting& quotes ) const {
    bool is_eos = false;
    UChar32 c = tokens[i].us.char32At(0);
    if ( c == '.' || eosmarkers.indexOf( c ) >= 0 ){
      if ( i + 1 == tokens.size() ) { // No next token?
        is_eos = true; // Newline after eosmarker
      }
      else {
        c = tokens[i+1].us.char32At(0);
        if ( u_isquote( c, quotes ) ){
          // the next word is a quote
          if ( detectQuotes ){
            is_eos = true;
          }
          else if ( i + 2 < tokens.size() ) {
            c = tokens[i+2].us.char32At(0);
            if ( u_isupper(c) || u_istitle(c) || u_ispunct(c) ){
              // the next 'word' after the quote starts with uppercase or is punctuation
              is_eos = true;
            }
          }
        }
        else if ( tokens[i].us.length() > 1 ){
          // multi-character PUNCTUATION...
          if ( u_isupper(c) || u_istitle(c) ){
            is_eos = true;
          }
        }
        else {
          is_eos = true;
        }
      }
    }
    return is_eos;
  }
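
  // Two illustrative cases of the rules above (assuming '!' and '?' are
  // among the configured eosmarkers):
  // - [ "Fijn" ] [ "." ] [ "morgen" ]: the single-character "." is an EOS;
  //   the uppercase test only applies to multi-character punctuation.
  // - [ "Fijn" ] [ "!?" ] [ "morgen" ]: the multi-character "!?" is NOT an
  //   EOS, because the next token does not start with uppercase/titlecase.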

  void TokenizerClass::detectQuoteBounds( const int i,
                                          Quoting& quotes ) {
    UChar32 c = tokens[i].us.char32At(0);
    // Detect quotation marks
    if ( (c == '"') || ( UnicodeString(c) == "&quot;" ) ) {
      if ( tokDebug > 1 ){
        LOG << "[detectQuoteBounds] Standard double-quote (ambiguous) found @i=" << i << endl;
      }
      if ( !resolveQuote( i, c, quotes ) ) {
        if ( tokDebug > 1 ) {
          LOG << "[detectQuoteBounds] Doesn't resolve, so assuming beginquote, pushing to stack for resolution later" << endl;
        }
        quotes.push( i, c );
      }
    }
    else if ( c == '\'' ) {
      if ( tokDebug > 1 ){
        LOG << "[detectQuoteBounds] Standard single-quote (ambiguous) found @i=" << i << endl;
      }
      if ( !resolveQuote( i, c, quotes ) ) {
        if ( tokDebug > 1 ) {
          LOG << "[detectQuoteBounds] Doesn't resolve, so assuming beginquote, pushing to stack for resolution later" << endl;
        }
        quotes.push( i, c );
      }
    }
    else {
      UnicodeString close = quotes.lookupOpen( c );
      if ( !close.isEmpty() ){ // we have an opening quote
        if ( tokDebug > 1 ) {
          LOG << "[detectQuoteBounds] Opening quote found @i=" << i << ", pushing to stack for resolution later..." << endl;
        }
        quotes.push( i, c ); // remember it
      }
      else {
        UnicodeString open = quotes.lookupClose( c );
        if ( !open.isEmpty() ) { // we have a closing quote
          if ( tokDebug > 1 ) {
            LOG << "[detectQuoteBounds] Closing quote found @i=" << i << ", attempting to resolve..." << endl;
          }
          if ( !resolveQuote( i, open, quotes ) ) {
            // resolve the matching opening
            if ( tokDebug > 1 ) {
              LOG << "[detectQuoteBounds] Unable to resolve" << endl;
            }
          }
        }
      }
    }
  }

  bool isClosing( const Token& tok ){
    if ( tok.us.length() == 1 &&
         ( tok.us[0] == ')' || tok.us[0] == '}'
           || tok.us[0] == ']' || tok.us[0] == '>' ) )
      return true;
    return false;
  }

  void TokenizerClass::detectSentenceBounds( const int offset,
                                             const string& lang ){
    // find sentences
    string method;
    if ( detectQuotes ){
      method = "[detectSentenceBounds-(quoted)]";
    }
    else {
      method = "[detectSentenceBounds]";
    }
    const int size = tokens.size();
    for ( int i = offset; i < size; i++ ) {
      if ( tokDebug > 1 ){
        LOG << method << " i=" << i << " word=[" << tokens[i].us
            << "] type=" << tokens[i].type
            << ", role=" << tokens[i].role << endl;
      }
      if ( tokens[i].type.startsWith("PUNCTUATION") ){
        if ( tokDebug > 1 ){
          LOG << method << " PUNCTUATION FOUND @i=" << i << endl;
        }
        // we have some kind of punctuation. Does it mark an EOS?
        bool is_eos = detectEos( i,
                                 settings[lang]->eosmarkers,
                                 settings[lang]->quotes );
        if ( is_eos ) {
          // end of sentence found, so wrap up
          if ( detectQuotes
               && !settings[lang]->quotes.emptyStack() ) {
            // we have some quotes!
            if ( tokDebug > 1 ){
              LOG << method << " Unbalanced quotes: preliminary EOS FOUND @i="
                  << i << endl;
            }
            // we set a temporary EOS marker,
            // to be resolved later when the full quote is found.
            tokens[i].role |= TEMPENDOFSENTENCE;
            // If the previous token is also TEMPENDOFSENTENCE,
            // it stops being so in favour of this one
            if ( i > 0 ){
              tokens[i-1].role &= ~TEMPENDOFSENTENCE;
            }
          }
          else {
            // No quotes
            if ( tokDebug > 1 ){
              LOG << method << " EOS FOUND @i=" << i << endl;
            }
            tokens[i].role |= ENDOFSENTENCE;
            // if this is the end of the sentence,
            // the next token is the beginning of a new one
            if ( (i + 1) < size ){
              tokens[i+1].role |= BEGINOFSENTENCE;
            }
            // if the previous token is EOS and not BOS, it will stop being
            // EOS, as this one will take its place
            if ( i > 0
                 && ( tokens[i-1].role & ENDOFSENTENCE )
                 && !( tokens[i-1].role & BEGINOFSENTENCE ) ) {
              tokens[i-1].role &= ~ENDOFSENTENCE;
              tokens[i].role &= ~BEGINOFSENTENCE;
            }
          }
        }
        else if ( isClosing( tokens[i] ) ) {
          // we have a closing symbol
          if ( tokDebug > 1 ){
            LOG << method << " Close FOUND @i=" << i << endl;
          }
          // if the previous token is EOS and not BOS, it will stop being
          // EOS, as this one will take its place
          if ( i > 0
               && ( tokens[i-1].role & ENDOFSENTENCE )
               && !( tokens[i-1].role & BEGINOFSENTENCE) ) {
            tokens[i-1].role &= ~ENDOFSENTENCE;
            tokens[i].role &= ~BEGINOFSENTENCE;
          }
        }
        if ( detectQuotes ){
          // check the quotes
          detectQuoteBounds( i, settings[lang]->quotes );
        }
      }
    }
    for ( int i = size-1; i > offset; --i ) {
      // at the end of the buffer there may be some PUNCTUATION which
      // carries spurious ENDOFSENTENCE and BEGINOFSENTENCE annotations.
      // Fix this up to avoid sentences containing only punctuation;
      // also, we don't want a BEGINQUOTE to be an ENDOFSENTENCE.
      if ( tokDebug > 2 ){
        LOG << method << " fixup-end i=" << i << " word=["
            << tokens[i].us
            << "] type=" << tokens[i].type
            << ", role=" << tokens[i].role << endl;
      }
      if ( tokens[i].type.startsWith("PUNCTUATION") ) {
        tokens[i].role &= ~BEGINOFSENTENCE;
        if ( !detectQuotes ||
             (tokens[i].role & BEGINQUOTE) ){
          if ( i != size-1 ){
            tokens[i].role &= ~ENDOFSENTENCE;
          }
        }
      }
      else {
        break;
      }
    }
  }

  void TokenizerClass::passthruLine( const UnicodeString& input, bool& bos ) {
    if ( tokDebug ) {
      LOG << "[passthruLine] input: line=[" << input << "]" << endl;
    }
    bool alpha = false, num = false, punct = false;
    UnicodeString word;
    StringCharacterIterator sit(input);
    while ( sit.hasNext() ){
      UChar32 c = sit.current32();
      if ( c == u'\u200D' ){
        // a joiner. just ignore it
        sit.next32();
        continue;
      }
      if ( u_isspace(c) ) {
        if ( word.isEmpty() ){
          // a leading space. Don't waste time on it. SKIP
          sit.next32();
          continue;
        }
        // so a trailing space: handle the word found so far.
        if ( tokDebug ){
          LOG << "[passthruLine] word=[" << word << "]" << endl;
        }
        if ( word == eosmark ) {
          word = "";
          if ( !tokens.empty() )
            tokens.back().role |= ENDOFSENTENCE;
          bos = true;
        }
        else {
          UnicodeString type;
          if ( alpha && !num && !punct ) {
            type = type_word;
          }
          else if ( num && !alpha && !punct ) {
            type = type_number;
          }
          else if ( punct && !alpha && !num ) {
            type = type_punctuation;
          }
          else {
            type = type_unknown;
          }
          if ( doPunctFilter
               && ( type == type_punctuation || type == type_currency ||
                    type == type_emoticon || type == type_picto ) ) {
            if ( tokDebug >= 2 ){
              LOG << "   [passthruLine] skipped PUNCTUATION ["
                  << input << "]" << endl;
            }
            if ( !tokens.empty() ){
              tokens.back().role &= ~NOSPACE;
            }
          }
          else {
            if ( norm_set.find( type ) != norm_set.end() ){
              word = "{{" + type + "}}";
            }
            if ( bos ) {
              tokens.push_back( Token( type, word, BEGINOFSENTENCE ) );
              bos = false;
            }
            else {
              tokens.push_back( Token( type, word ) );
            }
          }
          alpha = false;
          num = false;
          punct = false;
          word = "";
        }
      }
      else {
        if ( u_isalpha(c) ) {
          alpha = true;
        }
        else if ( u_ispunct(c) ) {
          punct = true;
        }
        else if ( u_isdigit(c) ) {
          num = true;
        }
        word += c;
      }
      sit.next32();
    }
    if ( word != "" ) {
      if ( word == eosmark ) {
        word = "";
        if ( !tokens.empty() )
          tokens.back().role |= ENDOFSENTENCE;
      }
      else {
        UnicodeString type;
        if ( alpha && !num && !punct ) {
          type = type_word;
        }
        else if ( num && !alpha && !punct ) {
          type = type_number;
        }
        else if ( punct && !alpha && !num ) {
          type = type_punctuation;
        }
        else {
          type = type_unknown;
        }
        if ( doPunctFilter
             && ( type == type_punctuation || type == type_currency ||
                  type == type_emoticon || type == type_picto ) ) {
          if ( tokDebug >= 2 ){
            LOG << "   [passthruLine] skipped PUNCTUATION ["
                << input << "]" << endl;
          }
          if ( !tokens.empty() ){
            tokens.back().role &= ~NOSPACE;
          }
        }
        else {
          if ( norm_set.find( type ) != norm_set.end() ){
            word = "{{" + type + "}}";
          }
          if ( bos ) {
            tokens.push_back( Token( type, word, BEGINOFSENTENCE ) );
            bos = false;
          }
          else {
            tokens.push_back( Token( type, word ) );
          }
        }
      }
    }
    if ( sentenceperlineinput && tokens.size() > 0 ) {
      tokens[0].role |= BEGINOFSENTENCE;
      tokens.back().role |= ENDOFSENTENCE;
    }
  }
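
  // Passthru sketch: no tokenizer rules are applied, the line is only
  // split on whitespace and each chunk is classified by its character
  // make-up (illustrative):
  //
  //   input : "hallo 42 !"
  //   tokens: [hallo/type_word] [42/type_number] [!/type_punctuation]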

  string TokenizerClass::checkBOM( istream& in ){
    string result = inputEncoding;
    if ( &in == &cin ){
      return result;
    }
    streampos pos = in.tellg();
    string s;
    in >> s;
    UErrorCode err = U_ZERO_ERROR;
    int32_t bomLength = 0;
    const char *encoding = ucnv_detectUnicodeSignature( s.c_str(),
                                                        s.length(),
                                                        &bomLength,
                                                        &err );
    if ( bomLength ){
      if ( tokDebug ){
        LOG << "Autodetected encoding: " << encoding << endl;
      }
      result = encoding;
      if ( result == "UTF16BE"
           || result == "UTF-16BE" ){
        result = "UTF16BE";
      }
    }
    in.seekg( pos + (streampos)bomLength );
    return result;
  }
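
  // Effect sketch: for a stream that starts with the UTF-8 signature
  // EF BB BF, ICU's ucnv_detectUnicodeSignature() reports "UTF-8" with a
  // bomLength of 3, so checkBOM() returns "UTF-8" and leaves the stream
  // positioned just past the BOM.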

  // string wrapper
  void TokenizerClass::tokenizeLine( const string& s,
                                     const string& lang ){
    UnicodeString us = convert( s, inputEncoding );
    tokenizeLine( us, lang );
  }

  // UnicodeString wrapper
  void TokenizerClass::tokenizeLine( const UnicodeString& us,
                                     const string& lang ){
    bool bos = true;
    tokenize_one_line( us, bos, lang );
    if ( tokDebug > 0 ) {
      LOG << "[tokenizeLine] before countSentences" << endl;
    }
    countSentences(true); // force the ENDOFSENTENCE
  }

  bool u_isemo( UChar32 c ){
    UBlockCode s = ublock_getCode(c);
    return s == UBLOCK_EMOTICONS;
  }

  bool u_ispicto( UChar32 c ){
    UBlockCode s = ublock_getCode(c);
    return s == UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS;
  }

  bool u_iscurrency( UChar32 c ){
    return u_charType( c ) == U_CURRENCY_SYMBOL;
  }

  bool u_issymbol( UChar32 c ){
    return u_charType( c ) == U_CURRENCY_SYMBOL
      || u_charType( c ) == U_MATH_SYMBOL
      || u_charType( c ) == U_MODIFIER_SYMBOL
      || u_charType( c ) == U_OTHER_SYMBOL;
  }

  const UnicodeString& detect_type( UChar32 c ){
    if ( u_isspace(c) ) {
      return type_space;
    }
    else if ( u_iscurrency(c) ) {
      return type_currency;
    }
    else if ( u_ispunct(c) ) {
      return type_punctuation;
    }
    else if ( u_isemo( c ) ) {
      return type_emoticon;
    }
    else if ( u_ispicto( c ) ) {
      return type_picto;
    }
    else if ( u_isalpha(c) ) {
      return type_word;
    }
    else if ( u_isdigit(c) ) {
      return type_number;
    }
    else if ( u_issymbol(c) ) {
      return type_symbol;
    }
    else {
      return type_unknown;
    }
  }
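
  // For example (following the checks above, in their order):
  //   detect_type( U'7' ) yields type_number,
  //   detect_type( U'€' ) yields type_currency (currency is tested before
  //   punctuation), and detect_type( U'!' ) yields type_punctuation.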

  std::string toString( int8_t c ){
    switch ( c ){
    case 0:
      return "U_UNASSIGNED";
    case 1:
      return "U_UPPERCASE_LETTER";
    case 2:
      return "U_LOWERCASE_LETTER";
    case 3:
      return "U_TITLECASE_LETTER";
    case 4:
      return "U_MODIFIER_LETTER";
    case 5:
      return "U_OTHER_LETTER";
    case 6:
      return "U_NON_SPACING_MARK";
    case 7:
      return "U_ENCLOSING_MARK";
    case 8:
      return "U_COMBINING_SPACING_MARK";
    case 9:
      return "U_DECIMAL_DIGIT_NUMBER";
    case 10:
      return "U_LETTER_NUMBER";
    case 11:
      return "U_OTHER_NUMBER";
    case 12:
      return "U_SPACE_SEPARATOR";
    case 13:
      return "U_LINE_SEPARATOR";
    case 14:
      return "U_PARAGRAPH_SEPARATOR";
    case 15:
      return "U_CONTROL_CHAR";
    case 16:
      return "U_FORMAT_CHAR";
    case 17:
      return "U_PRIVATE_USE_CHAR";
    case 18:
      return "U_SURROGATE";
    case 19:
      return "U_DASH_PUNCTUATION";
    case 20:
      return "U_START_PUNCTUATION";
    case 21:
      return "U_END_PUNCTUATION";
    case 22:
      return "U_CONNECTOR_PUNCTUATION";
    case 23:
      return "U_OTHER_PUNCTUATION";
    case 24:
      return "U_MATH_SYMBOL";
    case 25:
      return "U_CURRENCY_SYMBOL";
    case 26:
      return "U_MODIFIER_SYMBOL";
    case 27:
      return "U_OTHER_SYMBOL";
    case 28:
      return "U_INITIAL_PUNCTUATION";
    case 29:
      return "U_FINAL_PUNCTUATION";
    default:
      return "OMG NO CLUE WHAT KIND OF SYMBOL THIS IS: "
        + TiCC::toString( int(c) );
    }
  }

  int TokenizerClass::internal_tokenize_line( const UnicodeString& originput,
                                              const string& _lang ){
    string lang = _lang;
    if ( lang.empty() ){
      lang = "default";
    }
    else {
      auto const it = settings.find( lang );
      if ( it == settings.end() ){
        LOG << "tokenizeLine: no settings found for language=" + lang << endl
            << "using the default language instead: " << default_language << endl;
        lang = "default";
      }
    }
    if ( tokDebug ){
      LOG << "[tokenizeLine] input: line=["
          << originput << "] (language=" << lang << ")" << endl;
    }
    UnicodeString input = normalizer.normalize( originput );
    if ( doFilter ){
      input = settings[lang]->filter.filter( input );
    }
    if ( input.isBogus() ){ // only tokenize valid input
      LOG << "ERROR: Invalid UTF-8 in line " << linenum << ":" << endl
          << "   '" << input << "'" << endl;
      return 0;
    }
    int32_t len = input.countChar32();
    if ( tokDebug ){
      LOG << "[tokenizeLine] filtered input: line=["
          << input << "] (" << len
          << " unicode characters)" << endl;
    }
    const int begintokencount = tokens.size();
    if ( tokDebug ) {
      LOG << "[tokenizeLine] Tokens still in buffer: " << begintokencount << endl;
    }

    bool tokenizeword = false;
    bool reset = false;
    // iterate over all characters
    UnicodeString word;
    StringCharacterIterator sit(input);
    long int i = 0;
    long int tok_size = 0;
    while ( sit.hasNext() ){
      UChar32 c = sit.current32();
      bool joiner = false;
      if ( c == u'\u200D' ){
        joiner = true;
      }
      if ( tokDebug > 8 ){
        UnicodeString s = c;
        int8_t charT = u_charType( c );
        LOG << "examine character: " << s << " type= "
            << toString( charT ) << endl;
      }
      if ( reset ) { // reset values for a new word
        reset = false;
        tok_size = 0;
        if ( !joiner && !u_isspace(c) ){
          word = c;
        }
        else {
          word = "";
        }
        tokenizeword = false;
      }
      else if ( !joiner && !u_isspace(c) ){
        word += c;
      }
      if ( joiner && sit.hasNext() ){
        UChar32 peek = sit.next32();
        if ( u_isspace(peek) ){
          joiner = false;
        }
        sit.previous32();
      }
      if ( u_isspace(c) || joiner || i == len-1 ){
        if ( tokDebug ){
          LOG << "[tokenizeLine] space detected, word=[" << word << "]" << endl;
        }
        if ( i == len-1 ) {
          if ( joiner
               || u_ispunct(c)
               || u_isdigit(c)
               || u_isquote( c, settings[lang]->quotes )
               || u_isemo(c) ){
            tokenizeword = true;
          }
        }
        if ( c == '\n' && word.isEmpty() ){
          if ( tokDebug ){
            LOG << "[tokenizeLine] NEW PARAGRAPH upcoming" << endl;
          }
          // signal that the next word starts a new paragraph (if it's there)
          paragraphsignal_next = true;
        }
        int expliciteosfound = -1;
        if ( word.length() >= eosmark.length() ) {
          expliciteosfound = word.lastIndexOf(eosmark);

          if ( expliciteosfound != -1 ) { // the word contains the eosmark
            if ( tokDebug >= 2 ){
              LOG << "[tokenizeLine] Found explicit EOS marker @" << expliciteosfound << endl;
            }
            int eospos = tokens.size()-1;
            if ( expliciteosfound > 0 ) {
              UnicodeString realword;
              word.extract( 0, expliciteosfound, realword );
              if ( tokDebug >= 2 ) {
                LOG << "[tokenizeLine] Prefix before EOS: "
                    << realword << endl;
              }
              tokenizeWord( realword, false, lang );
              eospos++;
            }
            if ( expliciteosfound + eosmark.length() < word.length() ){
              UnicodeString realword;
              word.extract( expliciteosfound+eosmark.length(),
                            word.length() - expliciteosfound - eosmark.length(),
                            realword );
              if ( tokDebug >= 2 ){
                LOG << "[tokenizeLine] postfix after EOS: "
                    << realword << endl;
              }
              tokenizeWord( realword, true, lang );
            }
            if ( !tokens.empty() && eospos >= 0 ) {
              if ( tokDebug >= 2 ){
                LOG << "[tokenizeLine] Assigned EOS" << endl;
              }
              tokens[eospos].role |= ENDOFSENTENCE;
            }
          }
        }
        if ( word.length() > 0
             && expliciteosfound == -1 ) {
          if ( tokDebug >= 2 ){
            LOG << "[tokenizeLine] Further tokenization necessary for: ["
                << word << "]" << endl;
          }
          if ( tokenizeword ) {
            tokenizeWord( word, !joiner, lang );
          }
          else {
            tokenizeWord( word, !joiner, lang, type_word );
          }
        }
        // reset values for a new word
        reset = true;
      }
      else if ( u_ispunct(c)
                || u_isdigit(c)
                || u_isquote( c, settings[lang]->quotes )
                || u_isemo(c) ){
        if ( tokDebug ){
          LOG << "[tokenizeLine] punctuation or digit detected, word=["
              << word << "]" << endl;
        }
        // there is punctuation or there are digits in this word:
        // mark it to be run through the tokenizer rules
        tokenizeword = true;
      }
      sit.next32();
      ++i;
      ++tok_size;
      if ( tok_size > 2500 ){
        LOG << "Ridiculously long word/token (over 2500 characters) detected "
            << "in line: " << linenum << ". Skipped ..." << endl;
        LOG << "The word starts with " << UnicodeString( word, 0, 75 )
            << "..." << endl;
        return 0;
      }
    }
    int numNewTokens = tokens.size() - begintokencount;
    if ( tokDebug >= 10 ){
      LOG << "tokens.size() = " << tokens.size() << endl;
      LOG << "begintokencount = " << begintokencount << endl;
      LOG << "numnew = " << numNewTokens << endl;
    }
    if ( numNewTokens > 0 ){
      if ( paragraphsignal ) {
        tokens[begintokencount].role |= NEWPARAGRAPH | BEGINOFSENTENCE;
        paragraphsignal = false;
      }
      // find sentence boundaries
      if ( sentenceperlineinput ) {
        // force it to be a sentence
        tokens[begintokencount].role |= BEGINOFSENTENCE;
        tokens.back().role |= ENDOFSENTENCE;
      }
      detectSentenceBounds( begintokencount );
    }
    return numNewTokens;
  }

  void TokenizerClass::tokenizeWord( const UnicodeString& input,
                                     bool space,
                                     const string& lang,
                                     const UnicodeString& assigned_type ) {
    bool recurse = !assigned_type.isEmpty();

    int32_t inpLen = input.countChar32();
    if ( tokDebug > 2 ){
      if ( recurse ){
        LOG << "   [tokenizeWord] Recurse Input: (" << inpLen << ") "
            << "word=[" << input << "], type=" << assigned_type
            << " Space=" << (space?"TRUE":"FALSE") << endl;
      }
      else {
        LOG << "   [tokenizeWord] Input: (" << inpLen << ") "
            << "word=[" << input << "]"
            << " Space=" << (space?"TRUE":"FALSE") << endl;
      }
    }
    if ( input == eosmark ) {
      if ( tokDebug >= 2 ){
        LOG << "   [tokenizeWord] Found explicit EOS marker" << endl;
      }
      if ( !tokens.empty() ) {
        if ( tokDebug >= 2 ){
          LOG << "   [tokenizeWord] Assigned EOS" << endl;
        }
        tokens.back().role |= ENDOFSENTENCE;
      }
      else {
        LOG << "[WARNING] Found an explicit EOS marker by itself; this will have no effect!" << endl;
      }
      return;
    }

    if ( inpLen == 1 ) {
      // a single character: no need to process all the rules,
      // do some simpler (faster) detection
      UChar32 c = input.char32At(0);
      UnicodeString type = detect_type( c );
      if ( type == type_space ){
        return;
      }
      if ( doPunctFilter
           && ( type == type_punctuation || type == type_currency ||
                type == type_emoticon || type == type_picto ) ) {
        if ( tokDebug >= 2 ){
          LOG << "   [tokenizeWord] skipped PUNCTUATION ["
              << input << "]" << endl;
        }
        if ( !tokens.empty() ){
          tokens.back().role &= ~NOSPACE;
        }
      }
      else {
        UnicodeString word = input;
        if ( norm_set.find( type ) != norm_set.end() ){
          word = "{{" + type + "}}";
        }
        TokenRole role = (space ? NOROLE : NOSPACE);
        if ( paragraphsignal_next ){
          role |= NEWPARAGRAPH;
          paragraphsignal_next = false;
        }
        Token T( type, word, role, lang );
        tokens.push_back( T );
        if ( tokDebug >= 2 ){
          LOG << "   [tokenizeWord] added token " << T << endl;
        }
      }
    }
    else {
      bool a_rule_matched = false;
      for ( const auto& rule : settings[lang]->rules ) {
        if ( tokDebug >= 4 ){
          LOG << "\tTESTING " << rule->id << endl;
        }
        UnicodeString type = rule->id;
        // find the first matching rule
        UnicodeString pre, post;
        vector<UnicodeString> matches;
        if ( rule->matchAll( input, pre, post, matches ) ){
          a_rule_matched = true;
          if ( tokDebug >= 4 ){
            LOG << "\tMATCH: " << type << endl;
            LOG << "\tpre=  '" << pre << "'" << endl;
            LOG << "\tpost= '" << post << "'" << endl;
            int cnt = 0;
            for ( const auto& m : matches ){
              LOG << "\tmatch[" << ++cnt << "]=" << m << endl;
            }
          }
          if ( recurse
               && ( type == type_word
                    || ( pre.isEmpty()
                         && post.isEmpty() ) ) ){
            // so only do this recurse step when:
            //   EITHER we have a WORD
            //   OR we have an exact match of the rule (no pre or post context)
            if ( assigned_type != type_word ){
              // don't change the type when it was already non-WORD
              if ( tokDebug >= 4 ){
                LOG << "\trecurse, match didn't do anything new for " << input << endl;
              }
              TokenRole role = (space ? NOROLE : NOSPACE);
              if ( paragraphsignal_next ){
                role |= NEWPARAGRAPH;
                paragraphsignal_next = false;
              }
              tokens.push_back( Token( assigned_type, input, role, lang ) );
              return;
            }
            else {
              if ( tokDebug >= 4 ){
                LOG << "\trecurse, match changes the type:"
                    << assigned_type << " to " << type << endl;
              }
              TokenRole role = (space ? NOROLE : NOSPACE);
              if ( paragraphsignal_next ){
                role |= NEWPARAGRAPH;
                paragraphsignal_next = false;
              }
              tokens.push_back( Token( type, input, role, lang ) );
              return;
            }
          }
          if ( pre.length() > 0 ){
            if ( tokDebug >= 4 ){
              LOG << "\tTOKEN pre-context (" << pre.length()
                  << "): [" << pre << "]" << endl;
            }
            tokenizeWord( pre, false, lang ); // pre-context, no space after
          }
          if ( matches.size() > 0 ){
            int max = matches.size();
            if ( tokDebug >= 4 ){
              LOG << "\tTOKEN match #=" << matches.size() << endl;
            }
            for ( int m = 0; m < max; ++m ){
              if ( tokDebug >= 4 ){
                LOG << "\tTOKEN match[" << m << "] = " << matches[m]
                    << " Space=" << (space?"TRUE":"FALSE") << endl;
              }
              if ( doPunctFilter
                   && rule->id.startsWith("PUNCTUATION") ){
                if ( tokDebug >= 2 ){
                  LOG << "   [tokenizeWord] skipped PUNCTUATION ["
                      << matches[m] << "]" << endl;
                }
                if ( !tokens.empty() ){
                  tokens.back().role &= ~NOSPACE;
                }
              }
              else {
                bool internal_space = space;
                if ( post.length() > 0 ) {
                  internal_space = false;
                }
                else if ( m < max-1 ){
                  internal_space = false;
                }
                UnicodeString word = matches[m];
                if ( norm_set.find( type ) != norm_set.end() ){
                  word = "{{" + type + "}}";
                  TokenRole role = (internal_space ? NOROLE : NOSPACE);
                  if ( paragraphsignal_next ){
                    role |= NEWPARAGRAPH;
                    paragraphsignal_next = false;
                  }
                  tokens.push_back( Token( type, word, role, lang ) );
                }
                else {
                  if ( recurse ){
                    TokenRole role = (internal_space ? NOROLE : NOSPACE);
                    if ( paragraphsignal_next ){
                      role |= NEWPARAGRAPH;
                      paragraphsignal_next = false;
                    }
                    tokens.push_back( Token( type, word, role, lang ) );
                  }
                  else {
                    tokenizeWord( word, internal_space, lang, type );
                  }
                }
              }
            }
          }
          else if ( tokDebug >= 4 ){
            // we should never get here
            LOG << "\tPANIC there's no match" << endl;
          }
          if ( post.length() > 0 ){
            if ( tokDebug >= 4 ){
              LOG << "\tTOKEN post-context (" << post.length()
                  << "): [" << post << "]" << endl;
            }
            tokenizeWord( post, space, lang );
          }
          break;
        }
      }
      if ( !a_rule_matched ){
        // no rule matched
        if ( tokDebug >= 4 ){
          LOG << "\tthere's no match at all" << endl;
        }
        TokenRole role = (space ? NOROLE : NOSPACE);
        if ( paragraphsignal_next ){
          role |= NEWPARAGRAPH;
          paragraphsignal_next = false;
        }
        tokens.push_back( Token( assigned_type, input, role, lang ) );
      }
    }
  }
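
  // Rule-application sketch: a matching rule splits the input into
  // pre-context, match(es) and post-context; pre and post are fed back
  // into tokenizeWord() recursively. With a hypothetical rule set,
  // "(hallo)!" could resolve as:
  //
  //   "(hallo)!"  ->  pre="("  match="hallo"  post=")!"
  //               ->  [ ( ] [ hallo ] [ ) ] [ ! ]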

  string TokenizerClass::get_data_version() const {
    return UCTODATA_VERSION;
  }

  bool TokenizerClass::init( const string& fname, const string& tname ){
    if ( tokDebug ){
      LOG << "Initiating tokenizer..." << endl;
    }
    data_version = get_data_version();
    Setting *set = new Setting();
    if ( !set->read( fname, tname, tokDebug, theErrLog ) ){
      LOG << "Cannot read Tokenizer settings file " << fname << endl;
      LOG << "Unsupported language? (Did you install the uctodata package?)"
          << endl;
      delete set;
      return false;
    }
    else {
      settings["default"] = set;
      default_language = "default";
      auto pos = fname.find("tokconfig-");
      if ( pos != string::npos ){
        default_language = fname.substr(pos+10);
        settings[default_language] = set;
      }
      else if ( xmlout ){
        LOG << "unable to determine a language, cannot proceed" << endl;
        return false;
      }
    }
    if ( tokDebug ){
      LOG << "effective rules: " << endl;
      for ( size_t i = 0; i < set->rules.size(); ++i ){
        LOG << "rule " << i << " " << *(set->rules[i]) << endl;
      }
      LOG << "EOS markers: " << set->eosmarkers << endl;
      LOG << "Quotations: " << set->quotes << endl;
      try {
        LOG << "Filter: " << set->filter << endl;
      }
      catch (...){
      }
    }
    return true;
  }
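
  // Initialization sketch (illustrative; "tokconfig-nld" is the Dutch
  // data file from the uctodata package and stands in for any language):
  //
  //   Tokenizer::TokenizerClass tok;
  //   if ( !tok.init( "tokconfig-nld", "" ) ){
  //     // no settings file found: is the uctodata package installed?
  //   }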

  bool TokenizerClass::init( const vector<string>& languages,
                             const string& tname ){
    if ( tokDebug > 0 ){
      LOG << "Initiating tokenizer from language list..." << endl;
    }
    data_version = get_data_version();
    Setting *default_set = 0;
    for ( const auto& lang : languages ){
      if ( tokDebug > 0 ){
        LOG << "init language=" << lang << endl;
      }
      string fname = "tokconfig-" + lang;
      Setting *set = new Setting();
      string add;
      if ( default_set == 0 ){
        add = tname;
      }
      if ( !set->read( fname, add, tokDebug, theErrLog ) ){
        LOG << "problem reading the datafile for language: " << lang << endl;
        LOG << "Unsupported language? (Did you install the uctodata package?)"
            << endl;
        delete set;
      }
      else {
        if ( default_set == 0 ){
          default_set = set;
          settings["default"] = set;
          default_language = lang;
        }
        settings[lang] = set;
      }
    }
    if ( settings.empty() ){
      cerr << "ucto: No useful settings file(s) could be found (initiating from language list: " << languages << ")" << endl;
      return false;
    }
    return true;
  }

  string get_language( const vector<Token>& tv ){
    // Examine the assigned language of ALL tokens; they should all be the
    // same, and that value is returned.
    string result = "default";
    for ( const auto& t : tv ){
      if ( !t.lang_code.empty() && t.lang_code != "default" ){
        if ( result == "default" ){
          result = t.lang_code;
        }
        if ( result != t.lang_code ){
          throw logic_error( "ucto: conflicting language(s) assigned" );
        }
      }
    }
    return result;
  }

  bool TokenizerClass::get_setting_info( const std::string& language,
                                         std::string& set_file,
                                         std::string& version ) const {
    set_file.clear();
    version.clear();
    auto const& it = settings.find( language );
    if ( it == settings.end() ){
      return false;
    }
    else {
      set_file = it->second->set_file;
      version = it->second->version;
      return true;
    }
  }

} // namespace Tokenizer