1 /*
2 Copyright (c) 2006 - 2021
3 CLST - Radboud University
4 ILK - Tilburg University
5
6 This file is part of Ucto
7
8 Ucto is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
12
13 Ucto is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20
21 For questions and suggestions, see:
22 https://github.com/LanguageMachines/ucto/issues
23 or send mail to:
24 lamasoftware (at ) science.ru.nl
25 */
26 #include <cstdlib>
27 #include <cstring>
28 #include <string>
29 #include <vector>
30 #include <map>
31 #include <set>
32 #include <iostream>
33 #include <fstream>
34 #include "ticcutils/StringOps.h"
35 #include "libfolia/folia.h"
36 #include "ticcutils/CommandLine.h"
37 #include "ticcutils/PrettyPrint.h"
38 #include "ticcutils/Unicode.h"
39 #include "ucto/my_textcat.h"
40 #include "ucto/setting.h"
41 #include "ucto/tokenize.h"
42 #include <unistd.h>
43
44 using namespace std;
45 using namespace Tokenizer;
46 using TiCC::operator<<;
47
/// Map a legacy two-letter language code onto the three-letter code used
/// by the ucto configuration files.
///
/// @param language  the language code as given on the command line
/// @return the matching three-letter code when @p language is one of the
///         supported two-letter codes; otherwise @p language unchanged.
std::string fix_639_1( const std::string& language ){
  // support some backward compatibility to old ISO 639-1 codes
  static const std::map<std::string, std::string> two_to_three = {
    { "nl", "nld" },
    { "de", "deu" },
    { "fr", "fra" },
    { "pt", "por" },
    { "es", "spa" },
    { "fy", "fry" },
    // "se" is kept because older versions accepted it for Swedish, although
    // the ISO 639-1 code for Swedish is "sv" (which is accepted as well).
    { "se", "swe" },
    { "sv", "swe" },
    { "en", "eng" },
    { "it", "ita" },
    { "ru", "rus" },
    { "tr", "tur" }
  };
  const auto hit = two_to_three.find( language );
  if ( hit != two_to_three.end() ){
    return hit->second;
  }
  // anything else (already a 3-letter code, "generic", ...) passes through
  return language;
}
86
usage()87 void usage(){
88 set<string> languages = Setting::installed_languages();
89 cerr << "Usage: " << endl;
90 cerr << "\tucto [[options]] [input-file] [[output-file]]" << endl
91 << "Options:" << endl
92 << "\t-c <configfile> - Explicitly specify a configuration file" << endl
93 << "\t-d <value> - set debug level" << endl
94 << "\t-e <string> - set input encoding (default UTF8)" << endl
95 << "\t-N <string> - set output normalization (default NFC)" << endl
96 << "\t--filter=[YES|NO] - Disable filtering of special characters" << endl
97 << "\t-f - OBSOLETE. use --filter=NO" << endl
98 << "\t-h or --help - this message" << endl
99 << "\t-L <language> - Automatically selects a configuration file by language code." << endl
100 << "\t - Available Languages:" << endl
101 << "\t ";
102 for( const auto& l : languages ){
103 cerr << l << ",";
104 }
105 cerr << endl;
106 cerr << "\t-l - Convert to all lowercase" << endl
107 << "\t-u - Convert to all uppercase" << endl
108 << "\t-n - One sentence per line (output)" << endl
109 << "\t-m - One sentence per line (input)" << endl
110 << "\t-v - Verbose mode" << endl
111 << "\t-s <string> - End-of-Sentence marker (default: <utt>)" << endl
112 << "\t--passthru - Don't tokenize, but perform input decoding and simple token role detection" << endl
113 << "\t--normalize=<class1>,class2>,... " << endl
114 << "\t - For class1, class2, etc. output the class tokens instead of the tokens itself." << endl
115 << "\t-T or --textredundancy=[full|minimal|none] - set text redundancy level for text nodes in FoLiA output: " << endl
116 << "\t 'full' - add text to all levels: <p> <s> <w> etc." << endl
117 << "\t 'minimal' - don't introduce text on higher levels, but retain what is already there." << endl
118 << "\t 'none' - only introduce text on <w>, AND remove all text from higher levels" << endl
119 << "\t--allow-word-corrections - allow tokenization of FoLiA Word elements." << endl
120 << "\t--ignore-tag-hints - Do NOT use tag=\"token\" hints from the FoLiA input. (default is to use them)" << endl
121 << "\t--filterpunct - remove all punctuation from the output" << endl
122 << "\t--uselanguages=<lang1,lang2,..langn> - Using FoLiA input, only tokenize strings in these languages. Default = 'lang1'" << endl
123 << "\t--detectlanguages=<lang1,lang2,..langn> - try to assign a language to each line of text input. Default = 'lang1'" << endl
124 << "\t--add-tokens='file' - add additional tokens to the [TOKENS] of the" << endl
125 << "\t default language. TOKENS are always kept intact." << endl
126 << "\t-P - Disable paragraph detection" << endl
127 << "\t-Q - Enable quote detection (experimental)" << endl
128 << "\t-V or --version - Show version information" << endl
129 << "\t-x <DocID> - Output FoLiA XML, use the specified Document ID (obsolete)" << endl
130 << "\t-F - Input file is in FoLiA XML. All untokenized sentences will be tokenized." << endl
131 << "\t -F is automatically set when inputfile has extension '.xml'" << endl
132 << "\t-X - Output FoLiA XML, use the Document ID specified with --id=" << endl
133 << "\t--id <DocID> - use the specified Document ID to label the FoLia doc." << endl
134 << " -X is automatically set when inputfile has extension '.xml'" << endl
135 << "\t--inputclass <class> - use the specified class to search text in the FoLia doc.(default is 'current')" << endl
136 << "\t--outputclass <class> - use the specified class to output text in the FoLia doc. (default is 'current')" << endl
137 << "\t--textclass <class> - use the specified class for both input and output of text in the FoLia doc. (default is 'current'). Implies --filter=NO." << endl
138 << "\t (-x and -F disable usage of most other options: -nPQVs)" << endl;
139 }
140
main(int argc,char * argv[])141 int main( int argc, char *argv[] ){
142 int debug = 0;
143 bool tolowercase = false;
144 bool touppercase = false;
145 bool sentenceperlineoutput = false;
146 bool sentenceperlineinput = false;
147 bool paragraphdetection = true;
148 bool quotedetection = false;
149 bool do_language_detect = false;
150 bool dofiltering = true;
151 bool dopunctfilter = false;
152 bool xmlin = false;
153 bool xmlout = false;
154 bool verbose = false;
155 bool docorrectwords = false;
156 string redundancy = "minimal";
157 string eosmarker = "<utt>";
158 string docid = "untitleddoc";
159 string normalization = "NFC";
160 string inputEncoding = "UTF-8";
161 string inputclass = "current";
162 string outputclass = "current";
163 vector<string> language_list;
164 string cfile;
165 string ifile;
166 string ofile;
167 string c_file;
168 bool pass_thru = false;
169 bool ignore_tags = false;
170 bool sentencesplit = false;
171 string norm_set_string;
172 string add_tokens;
173 string command_line = "ucto";
174 for ( int i=1; i < argc; ++i ){
175 command_line += " " + string(argv[i]);
176 }
177 try {
178 TiCC::CL_Options Opts( "d:e:fhlPQunmN:vVL:c:s:x:FXT:",
179 "filter:,filterpunct,passthru,textclass:,inputclass:,outputclass:,normalize:,id:,version,help,detectlanguages:,uselanguages:,textredundancy:,add-tokens:,split,allow-word-corrections,ignore-tag-hints");
180 Opts.init(argc, argv );
181 if ( Opts.extract( 'h' )
182 || Opts.extract( "help" ) ){
183 usage();
184 return EXIT_SUCCESS;
185 }
186 if ( Opts.extract( 'V' ) ||
187 Opts.extract( "version" ) ){
188 cout << "Ucto - Unicode Tokenizer - version " << Version() << endl
189 << "(c) CLST 2015 - 2021, Centre for Language and Speech Technology, Radboud University Nijmegen" << endl
190 << "(c) ILK 2009 - 2015, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl
191 << "Licensed under the GNU General Public License v3" << endl;
192 cout << "based on [" << folia::VersionName() << "]" << endl;
193 return EXIT_SUCCESS;
194 }
195 Opts.extract('e', inputEncoding );
196 dopunctfilter = Opts.extract( "filterpunct" );
197 docorrectwords = Opts.extract( "allow-word-corrections" );
198 paragraphdetection = !Opts.extract( 'P' );
199 xmlin = Opts.extract( 'F' );
200 quotedetection = Opts.extract( 'Q' );
201 Opts.extract( 's', eosmarker );
202 touppercase = Opts.extract( 'u' );
203 tolowercase = Opts.extract( 'l' );
204 sentencesplit = Opts.extract( "split" );
205 sentenceperlineoutput = Opts.extract( 'n' );
206 sentenceperlineinput = Opts.extract( 'm' );
207 Opts.extract( 'T', redundancy );
208 Opts.extract( "textredundancy", redundancy );
209 if ( redundancy != "full"
210 && redundancy != "minimal"
211 && redundancy != "none" ){
212 throw TiCC::OptionError( "unknown textredundancy level: " + redundancy );
213 }
214 Opts.extract( 'N', normalization );
215 verbose = Opts.extract( 'v' );
216 if ( Opts.extract( 'x', docid ) ){
217 xmlout = true;
218 if ( Opts.is_present( 'X' ) ){
219 throw TiCC::OptionError( "conflicting options -x and -X" );
220 }
221 if ( Opts.is_present( "id" ) ){
222 throw TiCC::OptionError( "conflicting options -x and --id" );
223 }
224 }
225 else {
226 xmlout = Opts.extract( 'X' );
227 Opts.extract( "id", docid );
228 }
229 if ( sentencesplit ){
230 if ( xmlout ){
231 throw TiCC::OptionError( "conflicting options --split and -x or -X" );
232 }
233 // sentenceperlineoutput = true;
234 }
235 string textclass;
236 Opts.extract( "textclass", textclass );
237 Opts.extract( "inputclass", inputclass );
238 Opts.extract( "outputclass", outputclass );
239 if ( !textclass.empty() ){
240 if ( inputclass != "current" ){
241 throw TiCC::OptionError( "--textclass conflicts with --inputclass" );
242 }
243 if ( outputclass != "current" ){
244 throw TiCC::OptionError( "--textclass conflicts with --outputclass");
245 }
246 inputclass = textclass;
247 outputclass = textclass;
248 }
249 if ( Opts.extract( 'f' ) ){
250 cerr << "ucto: The -f option is used. Please consider using --filter=NO" << endl;
251 dofiltering = false;
252 }
253 Opts.extract( "add-tokens", add_tokens );
254 string value;
255 if ( Opts.extract( "filter", value ) ){
256 bool result;
257 if ( !TiCC::stringTo( value, result ) ){
258 throw TiCC::OptionError( "illegal value for '--filter' option. (boolean expected)" );
259 }
260 dofiltering = result;
261 }
262 if ( dofiltering
263 && xmlin
264 && outputclass == inputclass
265 && !docorrectwords ){
266 // we cannot mangle the original inputclass, so disable filtering
267 cerr << "ucto: --filter=NO is automatically set. inputclass equals outputclass!"
268 << endl;
269 dofiltering = false;
270 }
271 if ( xmlin && outputclass.empty() ){
272 if ( dopunctfilter ){
273 throw TiCC::OptionError( "--outputclass required for --filterpunct on FoLiA input ");
274 }
275 if ( touppercase ){
276 throw TiCC::OptionError( "--outputclass required for -u on FoLiA input ");
277 }
278 if ( tolowercase ){
279 throw TiCC::OptionError( "--outputclass required for -l on FoLiA input ");
280 }
281 }
282 if ( Opts.extract('d', value ) ){
283 if ( !TiCC::stringTo(value,debug) ){
284 throw TiCC::OptionError( "invalid value for -d: " + value );
285 }
286 }
287 ignore_tags = Opts.extract( "ignore-tag-hints" );
288 pass_thru = Opts.extract( "passthru" );
289 bool use_lang = Opts.is_present( "uselanguages" );
290 bool detect_lang = Opts.is_present( "detectlanguages" );
291 if ( detect_lang && use_lang ){
292 throw TiCC::OptionError( "--detectlanguages and --uselanguages options conflict. Use only one of these." );
293 }
294 if ( use_lang && pass_thru ){
295 throw TiCC::OptionError( "--passtru an --uselanguages options conflict. Use only one of these." );
296 }
297 if ( detect_lang && pass_thru ){
298 throw TiCC::OptionError( "--passtru an --detectlanguages options conflict. Use only one of these." );
299 }
300 if ( Opts.is_present('L') ) {
301 if ( pass_thru ){
302 throw TiCC::OptionError( "--passtru an -L options conflict. Use only one of these." );
303 }
304 if ( Opts.is_present('c') ){
305 throw TiCC::OptionError( "-L and -c options conflict. Use only one of these." );
306 }
307 else if ( detect_lang ){
308 throw TiCC::OptionError( "-L and --detectlanguages options conflict. Use only one of these." );
309 }
310 else if ( use_lang ) {
311 throw TiCC::OptionError( "-L and --uselanguages options conflict. Use only one of these." );
312 }
313 }
314 else if ( Opts.is_present( 'c' ) ){
315 if ( detect_lang ){
316 throw TiCC::OptionError( "-c and --detectlanguages options conflict. Use only one of these" );
317 }
318 else if ( use_lang ){
319 throw TiCC::OptionError( "-c and --uselanguages options conflict. Use only one of these." );
320 }
321 }
322 Opts.extract( 'c', c_file );
323
324 if ( !pass_thru ){
325 string languages;
326 Opts.extract( "detectlanguages", languages );
327 if ( languages.empty() ){
328 Opts.extract( "uselanguages", languages );
329 }
330 else {
331 do_language_detect = true;
332 }
333 if ( !languages.empty() ){
334 language_list = TiCC::split_at( languages, "," );
335 if ( language_list.empty() ){
336 throw TiCC::OptionError( "invalid language list: " + languages );
337 }
338 }
339 else {
340 // so NOT --detectlanguages or --uselanguages
341 string language;
342 if ( Opts.extract('L', language ) ){
343 language = fix_639_1( language );
344 }
345 if ( !language.empty() ){
346 language_list.push_back( language );
347 }
348 }
349 }
350 Opts.extract("normalize", norm_set_string );
351 if ( !Opts.empty() ){
352 string tomany = Opts.toString();
353 throw TiCC::OptionError( "unhandled option(s): " + tomany );
354 }
355 vector<string> files = Opts.getMassOpts();
356 if ( files.size() > 0 ){
357 ifile = files[0];
358 if ( TiCC::match_back( ifile, ".xml" ) ){
359 xmlin = true;
360 }
361 }
362 if ( use_lang && !xmlin ){
363 throw TiCC::OptionError( "--uselanguages is only valid for FoLiA input" );
364 }
365 if ( docorrectwords && !xmlin ){
366 throw TiCC::OptionError( "--allow-word-corrections is only valid for FoLiA input" );
367 }
368 if ( files.size() == 2 ){
369 ofile = files[1];
370 if ( TiCC::match_back( ofile, ".xml" ) ){
371 xmlout = true;
372 }
373 }
374 if ( files.size() > 2 ){
375 cerr << "found additional arguments on the commandline: " << files[2]
376 << "...." << endl;
377 return EXIT_FAILURE;
378 }
379 }
380 catch( const TiCC::OptionError& e ){
381 cerr << "ucto: " << e.what() << endl;
382 usage();
383 return EXIT_FAILURE;
384 }
385 if ( !pass_thru ){
386 set<string> available_languages = Setting::installed_languages();
387 if ( !c_file.empty() ){
388 cfile = c_file;
389 }
390 else if ( language_list.empty() ){
391 cerr << "ucto: missing a language specification (-L or --detectlanguages or --uselanguages option)" << endl;
392 if ( available_languages.size() == 1
393 && *available_languages.begin() == "generic" ){
394 cerr << "ucto: The uctodata package seems not to be installed." << endl;
395 cerr << "ucto: You can use '-L generic' to run a simple default tokenizer."
396 << endl;
397 cerr << "ucto: Installing uctodata is highly recommended." << endl;
398 }
399 else {
400 cerr << "ucto: Available Languages: ";
401 for( const auto& l : available_languages ){
402 cerr << l << ",";
403 }
404 cerr << endl;
405 }
406 return EXIT_FAILURE;
407 }
408 else {
409 for ( const auto& l : language_list ){
410 if ( available_languages.find(l) == available_languages.end() ){
411 cerr << "ucto: unsupported language '" << l << "'" << endl;
412 if ( available_languages.size() == 1
413 && *available_languages.begin() == "generic" ){
414 cerr << "ucto: The uctodata package seems not to be installed." << endl;
415 cerr << "ucto: You can use '-L generic' to run a simple default tokenizer."
416 << endl;
417 cerr << "ucto: Installing uctodata is highly recommended." << endl;
418 }
419 else {
420 cerr << "ucto: Available Languages: ";
421 for( const auto& lang : available_languages ){
422 cerr << lang << ",";
423 }
424 cerr << endl;
425 }
426 return EXIT_FAILURE;
427 }
428 }
429 }
430 }
431
432 if ((!ifile.empty()) && (ifile == ofile)) {
433 cerr << "ucto: Output file equals input file! Courageously refusing to start..." << endl;
434 return EXIT_FAILURE;
435 }
436
437 cerr << "ucto: inputfile = " << ifile << endl;
438 cerr << "ucto: outputfile = " << ofile << endl;
439
440 istream *IN = 0;
441 if (!xmlin) {
442 if ( ifile.empty() ){
443 IN = &cin;
444 }
445 else {
446 IN = new ifstream( ifile );
447 if ( !IN || !IN->good() ){
448 cerr << "ucto: problems opening inputfile " << ifile << endl;
449 cerr << "ucto: Courageously refusing to start..." << endl;
450 delete IN;
451 return EXIT_FAILURE;
452 }
453 }
454 }
455
456 ostream *OUT = 0;
457 if ( ofile.empty() ){
458 OUT = &cout;
459 }
460 else {
461 OUT = new ofstream( ofile );
462 if ( !OUT || !OUT->good() ){
463 cerr << "ucto: problems opening outputfile " << ofile << endl;
464 cerr << "ucto: Courageously refusing to start..." << endl;
465 delete OUT;
466 if ( IN != &cin ){
467 delete IN;
468 }
469 return EXIT_FAILURE;
470 }
471 }
472 try {
473 TokenizerClass tokenizer;
474 // set debug first, so init() can be debugged too
475 tokenizer.setDebug( debug );
476 tokenizer.set_command( command_line );
477 tokenizer.setEosMarker( eosmarker );
478 tokenizer.setVerbose( verbose );
479 tokenizer.setSentenceSplit(sentencesplit);
480 tokenizer.setSentencePerLineOutput(sentenceperlineoutput);
481 tokenizer.setSentencePerLineInput(sentenceperlineinput);
482 tokenizer.setLowercase(tolowercase);
483 tokenizer.setUppercase(touppercase);
484 tokenizer.setNormSet(norm_set_string);
485 tokenizer.setParagraphDetection(paragraphdetection);
486 tokenizer.setQuoteDetection(quotedetection);
487 tokenizer.setNormalization( normalization );
488 tokenizer.setInputEncoding( inputEncoding );
489 tokenizer.setFiltering(dofiltering);
490 tokenizer.setWordCorrection(docorrectwords);
491 tokenizer.setLangDetection(do_language_detect);
492 tokenizer.setPunctFilter(dopunctfilter);
493 tokenizer.setInputClass(inputclass);
494 tokenizer.setOutputClass(outputclass);
495 tokenizer.setXMLOutput(xmlout, docid);
496 tokenizer.setXMLInput(xmlin);
497 tokenizer.setTextRedundancy(redundancy);
498 if ( ignore_tags ){
499 tokenizer.setNoTags( true );
500 }
501 if ( pass_thru ){
502 tokenizer.setPassThru( true );
503 }
504 else {
505 // init exept for passthru mode
506 if ( !cfile.empty()
507 && !tokenizer.init( cfile, add_tokens ) ){
508 if ( IN != &cin ){
509 delete IN;
510 }
511 if ( OUT != &cout ){
512 delete OUT;
513 }
514 return EXIT_FAILURE;
515 }
516 else if ( !tokenizer.init( language_list, add_tokens ) ){
517 if ( IN != &cin ){
518 delete IN;
519 }
520 if ( OUT != &cout ){
521 delete OUT;
522 }
523 return EXIT_FAILURE;
524 }
525 if ( !cfile.empty() ){
526 cerr << "ucto: configured from file: " << cfile << endl;
527 }
528 else {
529 cerr << "ucto: configured for languages: " << language_list << endl;
530 }
531 }
532
533
534 if (xmlin) {
535 folia::Document *doc = tokenizer.tokenize_folia( ifile );
536 if ( doc ){
537 *OUT << doc;
538 OUT->flush();
539 delete doc;
540 }
541 }
542 else {
543 tokenizer.tokenize( *IN, *OUT );
544 if ( OUT != &cout )
545 delete OUT;
546 if ( IN != &cin )
547 delete IN;
548 }
549 }
550 catch ( exception &e ){
551 cerr << "ucto: " << e.what() << endl;
552 return EXIT_FAILURE;
553 }
554
555 }
556