1 /*
2   Copyright (c) 2006 - 2021
3   CLST - Radboud University
4   ILK  - Tilburg University
5 
6   This file is part of Ucto
7 
8   Ucto is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   Ucto is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ucto/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 */
26 #include <cstdlib>
27 #include <cstring>
28 #include <string>
29 #include <vector>
30 #include <map>
31 #include <set>
32 #include <iostream>
33 #include <fstream>
34 #include "ticcutils/StringOps.h"
35 #include "libfolia/folia.h"
36 #include "ticcutils/CommandLine.h"
37 #include "ticcutils/PrettyPrint.h"
38 #include "ticcutils/Unicode.h"
39 #include "ucto/my_textcat.h"
40 #include "ucto/setting.h"
41 #include "ucto/tokenize.h"
42 #include <unistd.h>
43 
44 using namespace std;
45 using namespace Tokenizer;
46 using TiCC::operator<<;
47 
/// Map a legacy two-letter language code onto the three-letter code that
/// is used for the tokenizer configurations.
///
/// Kept for backward compatibility with the older ISO 639-1 style codes.
/// @param language the language code as given by the user
/// @return the three-letter replacement when @p language is one of the
///         known legacy codes, otherwise @p language itself, unchanged
string fix_639_1( const string& language ){
  // legacy code -> current code; the lookup is an exact, case-sensitive match
  static const map<string,string> legacy_codes = {
    { "nl", "nld" },
    { "de", "deu" },
    { "fr", "fra" },
    { "pt", "por" },
    { "es", "spa" },
    { "fy", "fry" },
    { "se", "swe" },
    { "en", "eng" },
    { "it", "ita" },
    { "ru", "rus" },
    { "tr", "tur" }
  };
  const auto hit = legacy_codes.find( language );
  if ( hit == legacy_codes.end() ){
    // not a legacy code: hand it back untouched
    return language;
  }
  return hit->second;
}
86 
usage()87 void usage(){
88   set<string> languages = Setting::installed_languages();
89   cerr << "Usage: " << endl;
90   cerr << "\tucto [[options]] [input-file] [[output-file]]"  << endl
91        << "Options:" << endl
92        << "\t-c <configfile>   - Explicitly specify a configuration file" << endl
93        << "\t-d <value>        - set debug level" << endl
94        << "\t-e <string>       - set input encoding (default UTF8)" << endl
95        << "\t-N <string>       - set output normalization (default NFC)" << endl
96        << "\t--filter=[YES|NO] - Disable filtering of special characters" << endl
97        << "\t-f                - OBSOLETE. use --filter=NO" << endl
98        << "\t-h or --help      - this message" << endl
99        << "\t-L <language>     - Automatically selects a configuration file by language code." << endl
100        << "\t                  - Available Languages:" << endl
101        << "\t                    ";
102   for( const auto& l : languages ){
103     cerr << l << ",";
104   }
105   cerr << endl;
106   cerr << "\t-l                - Convert to all lowercase" << endl
107        << "\t-u                - Convert to all uppercase" << endl
108        << "\t-n                - One sentence per line (output)" << endl
109        << "\t-m                - One sentence per line (input)" << endl
110        << "\t-v                - Verbose mode" << endl
111        << "\t-s <string>       - End-of-Sentence marker (default: <utt>)" << endl
112        << "\t--passthru        - Don't tokenize, but perform input decoding and simple token role detection" << endl
113        << "\t--normalize=<class1>,class2>,... " << endl
114        << "\t                  - For class1, class2, etc. output the class tokens instead of the tokens itself." << endl
115        << "\t-T or --textredundancy=[full|minimal|none]  - set text redundancy level for text nodes in FoLiA output: " << endl
116        << "\t                    'full' - add text to all levels: <p> <s> <w> etc." << endl
117        << "\t                    'minimal' - don't introduce text on higher levels, but retain what is already there." << endl
118        << "\t                    'none' - only introduce text on <w>, AND remove all text from higher levels" << endl
119        << "\t--allow-word-corrections   - allow tokenization of FoLiA Word elements." << endl
120        << "\t--ignore-tag-hints - Do NOT use tag=\"token\" hints from the FoLiA input. (default is to use them)" << endl
121        << "\t--filterpunct      - remove all punctuation from the output" << endl
122        << "\t--uselanguages=<lang1,lang2,..langn> - Using FoLiA input, only tokenize strings in these languages. Default = 'lang1'" << endl
123        << "\t--detectlanguages=<lang1,lang2,..langn> - try to assign a language to each line of text input. Default = 'lang1'" << endl
124        << "\t--add-tokens='file' - add additional tokens to the [TOKENS] of the" << endl
125        << "\t                    default language. TOKENS are always kept intact." << endl
126        << "\t-P                - Disable paragraph detection" << endl
127        << "\t-Q                - Enable quote detection (experimental)" << endl
128        << "\t-V or --version   - Show version information" << endl
129        << "\t-x <DocID>        - Output FoLiA XML, use the specified Document ID (obsolete)" << endl
130        << "\t-F                - Input file is in FoLiA XML. All untokenized sentences will be tokenized." << endl
131        << "\t                    -F is automatically set when inputfile has extension '.xml'" << endl
132        << "\t-X                - Output FoLiA XML, use the Document ID specified with --id=" << endl
133        << "\t--id <DocID>      - use the specified Document ID to label the FoLia doc." << endl
134        << "                      -X is automatically set when inputfile has extension '.xml'" << endl
135        << "\t--inputclass <class>  - use the specified class to search text in the FoLia doc.(default is 'current')" << endl
136        << "\t--outputclass <class> - use the specified class to output text in the FoLia doc. (default is 'current')" << endl
137        << "\t--textclass <class>   - use the specified class for both input and output of text in the FoLia doc. (default is 'current'). Implies --filter=NO." << endl
138        << "\t                  (-x and -F disable usage of most other options: -nPQVs)" << endl;
139 }
140 
main(int argc,char * argv[])141 int main( int argc, char *argv[] ){
142   int debug = 0;
143   bool tolowercase = false;
144   bool touppercase = false;
145   bool sentenceperlineoutput = false;
146   bool sentenceperlineinput = false;
147   bool paragraphdetection = true;
148   bool quotedetection = false;
149   bool do_language_detect = false;
150   bool dofiltering = true;
151   bool dopunctfilter = false;
152   bool xmlin = false;
153   bool xmlout = false;
154   bool verbose = false;
155   bool docorrectwords = false;
156   string redundancy = "minimal";
157   string eosmarker = "<utt>";
158   string docid = "untitleddoc";
159   string normalization = "NFC";
160   string inputEncoding = "UTF-8";
161   string inputclass  = "current";
162   string outputclass = "current";
163   vector<string> language_list;
164   string cfile;
165   string ifile;
166   string ofile;
167   string c_file;
168   bool pass_thru = false;
169   bool ignore_tags = false;
170   bool sentencesplit = false;
171   string norm_set_string;
172   string add_tokens;
173   string command_line = "ucto";
174   for ( int i=1; i < argc; ++i ){
175     command_line += " " + string(argv[i]);
176   }
177   try {
178     TiCC::CL_Options Opts( "d:e:fhlPQunmN:vVL:c:s:x:FXT:",
179 			   "filter:,filterpunct,passthru,textclass:,inputclass:,outputclass:,normalize:,id:,version,help,detectlanguages:,uselanguages:,textredundancy:,add-tokens:,split,allow-word-corrections,ignore-tag-hints");
180     Opts.init(argc, argv );
181     if ( Opts.extract( 'h' )
182 	 || Opts.extract( "help" ) ){
183       usage();
184       return EXIT_SUCCESS;
185     }
186     if ( Opts.extract( 'V' ) ||
187 	 Opts.extract( "version" ) ){
188       cout << "Ucto - Unicode Tokenizer - version " << Version() << endl
189 	   << "(c) CLST 2015 - 2021, Centre for Language and Speech Technology, Radboud University Nijmegen" << endl
190 	   << "(c) ILK 2009 - 2015, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl
191 	   << "Licensed under the GNU General Public License v3" << endl;
192       cout << "based on [" << folia::VersionName() << "]" << endl;
193       return EXIT_SUCCESS;
194     }
195     Opts.extract('e', inputEncoding );
196     dopunctfilter = Opts.extract( "filterpunct" );
197     docorrectwords = Opts.extract( "allow-word-corrections" );
198     paragraphdetection = !Opts.extract( 'P' );
199     xmlin = Opts.extract( 'F' );
200     quotedetection = Opts.extract( 'Q' );
201     Opts.extract( 's', eosmarker );
202     touppercase = Opts.extract( 'u' );
203     tolowercase = Opts.extract( 'l' );
204     sentencesplit = Opts.extract( "split" );
205     sentenceperlineoutput = Opts.extract( 'n' );
206     sentenceperlineinput = Opts.extract( 'm' );
207     Opts.extract( 'T', redundancy );
208     Opts.extract( "textredundancy", redundancy );
209     if ( redundancy != "full"
210 	 && redundancy != "minimal"
211 	 && redundancy != "none" ){
212       throw TiCC::OptionError( "unknown textredundancy level: " + redundancy );
213     }
214     Opts.extract( 'N', normalization );
215     verbose = Opts.extract( 'v' );
216     if ( Opts.extract( 'x', docid ) ){
217       xmlout = true;
218       if ( Opts.is_present( 'X' ) ){
219 	throw TiCC::OptionError( "conflicting options -x and -X" );
220       }
221       if ( Opts.is_present( "id" ) ){
222 	throw TiCC::OptionError( "conflicting options -x and --id" );
223       }
224     }
225     else {
226       xmlout = Opts.extract( 'X' );
227       Opts.extract( "id", docid );
228     }
229     if ( sentencesplit ){
230       if ( xmlout ){
231 	throw TiCC::OptionError( "conflicting options --split and -x or -X" );
232       }
233       //      sentenceperlineoutput = true;
234     }
235     string textclass;
236     Opts.extract( "textclass", textclass );
237     Opts.extract( "inputclass", inputclass );
238     Opts.extract( "outputclass", outputclass );
239     if ( !textclass.empty() ){
240       if ( inputclass != "current" ){
241 	throw TiCC::OptionError( "--textclass conflicts with --inputclass" );
242       }
243       if ( outputclass != "current" ){
244 	throw TiCC::OptionError( "--textclass conflicts with --outputclass");
245       }
246       inputclass = textclass;
247       outputclass = textclass;
248     }
249     if ( Opts.extract( 'f' ) ){
250       cerr << "ucto: The -f option is used.  Please consider using --filter=NO" << endl;
251       dofiltering = false;
252     }
253     Opts.extract( "add-tokens", add_tokens );
254     string value;
255     if ( Opts.extract( "filter", value ) ){
256       bool result;
257       if ( !TiCC::stringTo( value, result ) ){
258 	throw TiCC::OptionError( "illegal value for '--filter' option. (boolean expected)" );
259       }
260       dofiltering = result;
261     }
262     if ( dofiltering
263 	 && xmlin
264 	 && outputclass == inputclass
265 	 && !docorrectwords ){
266       // we cannot mangle the original inputclass, so disable filtering
267       cerr << "ucto: --filter=NO is automatically set. inputclass equals outputclass!"
268 	   << endl;
269       dofiltering = false;
270     }
271     if ( xmlin && outputclass.empty() ){
272       if ( dopunctfilter ){
273 	throw TiCC::OptionError( "--outputclass required for --filterpunct on FoLiA input ");
274       }
275       if ( touppercase ){
276 	throw TiCC::OptionError( "--outputclass required for -u on FoLiA input ");
277       }
278       if ( tolowercase ){
279 	throw TiCC::OptionError( "--outputclass required for -l on FoLiA input ");
280       }
281     }
282     if ( Opts.extract('d', value ) ){
283       if ( !TiCC::stringTo(value,debug) ){
284 	throw TiCC::OptionError( "invalid value for -d: " + value );
285       }
286     }
287     ignore_tags = Opts.extract( "ignore-tag-hints" );
288     pass_thru = Opts.extract( "passthru" );
289     bool use_lang = Opts.is_present( "uselanguages" );
290     bool detect_lang = Opts.is_present( "detectlanguages" );
291     if ( detect_lang && use_lang ){
292       throw TiCC::OptionError( "--detectlanguages and --uselanguages options conflict. Use only one of these." );
293     }
294     if ( use_lang && pass_thru ){
295       throw TiCC::OptionError( "--passtru an --uselanguages options conflict. Use only one of these." );
296     }
297     if ( detect_lang && pass_thru ){
298       throw TiCC::OptionError( "--passtru an --detectlanguages options conflict. Use only one of these." );
299     }
300     if ( Opts.is_present('L') ) {
301       if ( pass_thru ){
302 	throw TiCC::OptionError( "--passtru an -L options conflict. Use only one of these." );
303       }
304       if ( Opts.is_present('c') ){
305 	throw TiCC::OptionError( "-L and -c options conflict. Use only one of these." );
306       }
307       else if ( detect_lang ){
308 	throw TiCC::OptionError( "-L and --detectlanguages options conflict. Use only one of these." );
309       }
310       else if ( use_lang ) {
311 	throw TiCC::OptionError( "-L and --uselanguages options conflict. Use only one of these." );
312       }
313     }
314     else if ( Opts.is_present( 'c' ) ){
315       if ( detect_lang ){
316 	throw TiCC::OptionError( "-c and --detectlanguages options conflict. Use only one of these" );
317       }
318       else if ( use_lang ){
319 	throw TiCC::OptionError( "-c and --uselanguages options conflict. Use only one of these." );
320       }
321     }
322     Opts.extract( 'c', c_file );
323 
324     if ( !pass_thru ){
325       string languages;
326       Opts.extract( "detectlanguages", languages );
327       if ( languages.empty() ){
328 	Opts.extract( "uselanguages", languages );
329       }
330       else {
331 	do_language_detect = true;
332       }
333       if ( !languages.empty() ){
334 	language_list = TiCC::split_at( languages, "," );
335 	if ( language_list.empty() ){
336 	  throw TiCC::OptionError( "invalid language list: " + languages );
337 	}
338       }
339       else {
340 	// so NOT --detectlanguages or --uselanguages
341 	string language;
342 	if ( Opts.extract('L', language ) ){
343 	  language = fix_639_1( language );
344 	}
345 	if ( !language.empty() ){
346 	  language_list.push_back( language );
347 	}
348       }
349     }
350     Opts.extract("normalize", norm_set_string );
351     if ( !Opts.empty() ){
352       string tomany = Opts.toString();
353       throw TiCC::OptionError( "unhandled option(s): " + tomany );
354     }
355     vector<string> files = Opts.getMassOpts();
356     if ( files.size() > 0 ){
357       ifile = files[0];
358       if ( TiCC::match_back( ifile, ".xml" ) ){
359 	xmlin = true;
360       }
361     }
362     if ( use_lang && !xmlin ){
363       throw TiCC::OptionError( "--uselanguages is only valid for FoLiA input" );
364     }
365     if ( docorrectwords && !xmlin ){
366       throw TiCC::OptionError( "--allow-word-corrections is only valid for FoLiA input" );
367     }
368     if ( files.size() == 2 ){
369       ofile = files[1];
370       if ( TiCC::match_back( ofile, ".xml" ) ){
371 	xmlout = true;
372       }
373     }
374     if ( files.size() > 2 ){
375       cerr << "found additional arguments on the commandline: " << files[2]
376 	   << "...." << endl;
377       return EXIT_FAILURE;
378     }
379   }
380   catch( const TiCC::OptionError& e ){
381     cerr << "ucto: " << e.what() << endl;
382     usage();
383     return EXIT_FAILURE;
384   }
385   if ( !pass_thru ){
386     set<string> available_languages = Setting::installed_languages();
387     if ( !c_file.empty() ){
388       cfile = c_file;
389     }
390     else if ( language_list.empty() ){
391       cerr << "ucto: missing a language specification (-L or --detectlanguages or --uselanguages option)" << endl;
392       if ( available_languages.size() == 1
393 	   && *available_languages.begin() == "generic" ){
394 	cerr << "ucto: The uctodata package seems not to be installed." << endl;
395 	cerr << "ucto: You can use '-L generic' to run a simple default tokenizer."
396 	     << endl;
397 	cerr << "ucto: Installing uctodata is highly recommended." << endl;
398       }
399       else {
400 	cerr << "ucto: Available Languages: ";
401 	for( const auto& l : available_languages ){
402 	  cerr << l << ",";
403 	}
404 	cerr << endl;
405       }
406       return EXIT_FAILURE;
407     }
408     else {
409       for ( const auto& l : language_list ){
410 	if ( available_languages.find(l) == available_languages.end() ){
411 	  cerr << "ucto: unsupported language '" << l << "'" << endl;
412 	  if ( available_languages.size() == 1
413 	       && *available_languages.begin() == "generic" ){
414 	    cerr << "ucto: The uctodata package seems not to be installed." << endl;
415 	    cerr << "ucto: You can use '-L generic' to run a simple default tokenizer."
416 		 << endl;
417 	    cerr << "ucto: Installing uctodata is highly recommended." << endl;
418 	  }
419 	  else {
420 	    cerr << "ucto: Available Languages: ";
421 	    for( const auto& lang : available_languages ){
422 	      cerr << lang << ",";
423 	    }
424 	    cerr << endl;
425 	  }
426 	  return EXIT_FAILURE;
427 	}
428       }
429     }
430   }
431 
432   if ((!ifile.empty()) && (ifile == ofile)) {
433     cerr << "ucto: Output file equals input file! Courageously refusing to start..."  << endl;
434     return EXIT_FAILURE;
435   }
436 
437   cerr << "ucto: inputfile = "  << ifile << endl;
438   cerr << "ucto: outputfile = " << ofile << endl;
439 
440   istream *IN = 0;
441   if (!xmlin) {
442     if ( ifile.empty() ){
443       IN = &cin;
444     }
445     else {
446       IN = new ifstream( ifile );
447       if ( !IN || !IN->good() ){
448 	cerr << "ucto: problems opening inputfile " << ifile << endl;
449 	cerr << "ucto: Courageously refusing to start..."  << endl;
450 	delete IN;
451 	return EXIT_FAILURE;
452       }
453     }
454   }
455 
456   ostream *OUT = 0;
457   if ( ofile.empty() ){
458     OUT = &cout;
459   }
460   else {
461     OUT = new ofstream( ofile );
462     if ( !OUT || !OUT->good() ){
463       cerr << "ucto: problems opening outputfile " << ofile << endl;
464       cerr << "ucto: Courageously refusing to start..."  << endl;
465       delete OUT;
466       if ( IN != &cin ){
467 	delete IN;
468       }
469       return EXIT_FAILURE;
470     }
471   }
472   try {
473     TokenizerClass tokenizer;
474     // set debug first, so init() can be debugged too
475     tokenizer.setDebug( debug );
476     tokenizer.set_command( command_line );
477     tokenizer.setEosMarker( eosmarker );
478     tokenizer.setVerbose( verbose );
479     tokenizer.setSentenceSplit(sentencesplit);
480     tokenizer.setSentencePerLineOutput(sentenceperlineoutput);
481     tokenizer.setSentencePerLineInput(sentenceperlineinput);
482     tokenizer.setLowercase(tolowercase);
483     tokenizer.setUppercase(touppercase);
484     tokenizer.setNormSet(norm_set_string);
485     tokenizer.setParagraphDetection(paragraphdetection);
486     tokenizer.setQuoteDetection(quotedetection);
487     tokenizer.setNormalization( normalization );
488     tokenizer.setInputEncoding( inputEncoding );
489     tokenizer.setFiltering(dofiltering);
490     tokenizer.setWordCorrection(docorrectwords);
491     tokenizer.setLangDetection(do_language_detect);
492     tokenizer.setPunctFilter(dopunctfilter);
493     tokenizer.setInputClass(inputclass);
494     tokenizer.setOutputClass(outputclass);
495     tokenizer.setXMLOutput(xmlout, docid);
496     tokenizer.setXMLInput(xmlin);
497     tokenizer.setTextRedundancy(redundancy);
498     if ( ignore_tags ){
499       tokenizer.setNoTags( true );
500     }
501     if ( pass_thru ){
502       tokenizer.setPassThru( true );
503     }
504     else {
505       // init exept for passthru mode
506       if ( !cfile.empty()
507 	   && !tokenizer.init( cfile, add_tokens ) ){
508 	if ( IN != &cin ){
509 	  delete IN;
510 	}
511 	if ( OUT != &cout ){
512 	  delete OUT;
513 	}
514 	return EXIT_FAILURE;
515       }
516       else if ( !tokenizer.init( language_list, add_tokens ) ){
517 	if ( IN != &cin ){
518 	  delete IN;
519 	}
520 	if ( OUT != &cout ){
521 	  delete OUT;
522 	}
523 	return EXIT_FAILURE;
524       }
525       if ( !cfile.empty() ){
526 	cerr << "ucto: configured from file: " << cfile << endl;
527       }
528       else {
529 	cerr << "ucto: configured for languages: " << language_list << endl;
530       }
531     }
532 
533 
534     if (xmlin) {
535       folia::Document *doc = tokenizer.tokenize_folia( ifile );
536       if ( doc ){
537 	*OUT << doc;
538 	OUT->flush();
539 	delete doc;
540       }
541     }
542     else {
543       tokenizer.tokenize( *IN, *OUT );
544       if ( OUT != &cout )
545 	delete OUT;
546       if ( IN != &cin )
547 	delete IN;
548     }
549   }
550   catch ( exception &e ){
551     cerr << "ucto: " << e.what() << endl;
552     return EXIT_FAILURE;
553   }
554 
555 }
556