1 /*
2 * Copyright (C) 2005-2019 Universitat d'Alacant / Universidad de Alicante
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <https://www.gnu.org/licenses/>.
16 */
17 #include <lttoolbox/fst_processor.h>
18 #include <lttoolbox/compression.h>
19 #include <lttoolbox/exception.h>
20 #include <lttoolbox/xml_parse_util.h>
21
22 #include <iostream>
23 #include <cerrno>
24 #include <climits>
25 #include <cwctype>
26
27 #if defined(_WIN32) && !defined(_MSC_VER)
28 #include <utf8_fwrap.h>
29 #endif
30
31 using namespace std;
32
33
FSTProcessor()34 FSTProcessor::FSTProcessor() :
35 default_weight(0.0000),
36 outOfWord(false),
37 isLastBlankTM(false)
38 {
39 // escaped_chars chars
40 escaped_chars.insert(L'[');
41 escaped_chars.insert(L']');
42 escaped_chars.insert(L'{');
43 escaped_chars.insert(L'}');
44 escaped_chars.insert(L'^');
45 escaped_chars.insert(L'$');
46 escaped_chars.insert(L'/');
47 escaped_chars.insert(L'\\');
48 escaped_chars.insert(L'@');
49 escaped_chars.insert(L'<');
50 escaped_chars.insert(L'>');
51
52 caseSensitive = false;
53 dictionaryCase = false;
54 do_decomposition = false;
55 nullFlush = false;
56 nullFlushGeneration = false;
57 useIgnoredChars = false;
58 useDefaultIgnoredChars = true;
59 useRestoreChars = false;
60 displayWeightsMode = false;
61 showControlSymbols = false;
62 biltransSurfaceForms = false;
63 maxAnalyses = INT_MAX;
64 maxWeightClasses = INT_MAX;
65 compoundOnlyLSymbol = 0;
66 compoundRSymbol = 0;
67 compound_max_elements = 4;
68
69 if(useDefaultIgnoredChars)
70 {
71 initDefaultIgnoredCharacters();
72 }
73 }
74
75 void
streamError()76 FSTProcessor::streamError()
77 {
78 throw Exception("Error: Malformed input stream.");
79 }
80
81 void
parseICX(string const & file)82 FSTProcessor::parseICX(string const &file)
83 {
84 if(useIgnoredChars)
85 {
86 reader = xmlReaderForFile(file.c_str(), NULL, 0);
87 if(reader == NULL)
88 {
89 cerr << "Error: cannot open '" << file << "'." << endl;
90 exit(EXIT_FAILURE);
91 }
92 int ret = xmlTextReaderRead(reader);
93 while(ret == 1)
94 {
95 procNodeICX();
96 ret = xmlTextReaderRead(reader);
97 }
98 // No point trying to process ignored chars if there are none
99 if(ignored_chars.size() == 0)
100 {
101 useIgnoredChars = false;
102 }
103 }
104 }
105
106 void
parseRCX(string const & file)107 FSTProcessor::parseRCX(string const &file)
108 {
109 if(useRestoreChars)
110 {
111 reader = xmlReaderForFile(file.c_str(), NULL, 0);
112 if(reader == NULL)
113 {
114 cerr << "Error: cannot open '" << file << "'." << endl;
115 exit(EXIT_FAILURE);
116 }
117 int ret = xmlTextReaderRead(reader);
118 while(ret == 1)
119 {
120 procNodeRCX();
121 ret = xmlTextReaderRead(reader);
122 }
123 }
124 }
125
126 void
procNodeICX()127 FSTProcessor::procNodeICX()
128 {
129 xmlChar const *xname = xmlTextReaderConstName(reader);
130 wstring name = XMLParseUtil::towstring(xname);
131 if(name == L"#text")
132 {
133 /* ignore */
134 }
135 else if(name == L"ignored-chars")
136 {
137 /* ignore */
138 }
139 else if(name == L"char")
140 {
141 ignored_chars.insert(static_cast<int>(XMLParseUtil::attrib(reader, L"value")[0]));
142 }
143 else if(name == L"#comment")
144 {
145 /* ignore */
146 }
147 else
148 {
149 wcerr << L"Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader);
150 wcerr << L"): Invalid node '<" << name << L">'." << endl;
151 exit(EXIT_FAILURE);
152 }
153 }
154
155 void
initDefaultIgnoredCharacters()156 FSTProcessor::initDefaultIgnoredCharacters()
157 {
158 ignored_chars.insert(173); // '\u00AD', soft hyphen
159 }
160
161 void
procNodeRCX()162 FSTProcessor::procNodeRCX()
163 {
164 xmlChar const *xname = xmlTextReaderConstName(reader);
165 wstring name = XMLParseUtil::towstring(xname);
166 if(name == L"#text")
167 {
168 /* ignore */
169 }
170 else if(name == L"restore-chars")
171 {
172 /* ignore */
173 }
174 else if(name == L"char")
175 {
176 rcx_current_char = static_cast<int>(XMLParseUtil::attrib(reader, L"value")[0]);
177 }
178 else if(name == L"restore-char")
179 {
180 rcx_map[rcx_current_char].insert(static_cast<int>(XMLParseUtil::attrib(reader, L"value")[0]));
181 }
182 else if(name == L"#comment")
183 {
184 /* ignore */
185 }
186 else
187 {
188 wcerr << L"Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader);
189 wcerr << L"): Invalid node '<" << name << L">'." << endl;
190 exit(EXIT_FAILURE);
191 }
192 }
193
194 wchar_t
readEscaped(FILE * input)195 FSTProcessor::readEscaped(FILE *input)
196 {
197 if(feof(input))
198 {
199 streamError();
200 }
201
202 wchar_t val = static_cast<wchar_t>(fgetwc_unlocked(input));
203
204 if(feof(input))
205 {
206 streamError();
207 }
208
209 return val;
210 }
211
212 wstring
readFullBlock(FILE * input,wchar_t const delim1,wchar_t const delim2)213 FSTProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2)
214 {
215 wstring result = L"";
216 result += delim1;
217 wchar_t c = delim1;
218
219 while(!feof(input) && c != delim2)
220 {
221 c = static_cast<wchar_t>(fgetwc_unlocked(input));
222 result += c;
223 if(c != L'\\')
224 {
225 continue;
226 }
227 else
228 {
229 result += static_cast<wchar_t>(readEscaped(input));
230 }
231 }
232
233 if(c != delim2)
234 {
235 streamError();
236 }
237
238 return result;
239 }
240
241 wstring
readWblank(FILE * input)242 FSTProcessor::readWblank(FILE *input)
243 {
244 wstring result = L"";
245 result += L"[[";
246 wchar_t c = 0;
247
248 while(!feof(input))
249 {
250 c = static_cast<wchar_t>(fgetwc_unlocked(input));
251 result += c;
252
253 if(c == L'\\')
254 {
255 result += static_cast<wchar_t>(readEscaped(input));
256 }
257 else if(c == L']')
258 {
259 c = static_cast<wchar_t>(fgetwc_unlocked(input));
260 result += c;
261
262 if(c == L']')
263 {
264 break;
265 }
266 }
267 }
268
269 if(c != L']')
270 {
271 streamError();
272 }
273
274 return result;
275 }
276
277 bool
wblankPostGen(FILE * input,FILE * output)278 FSTProcessor::wblankPostGen(FILE *input, FILE *output)
279 {
280 wstring result = L"";
281 result += L"[[";
282 wchar_t c = 0;
283 bool in_content = false;
284
285 while(!feof(input))
286 {
287 c = static_cast<wchar_t>(fgetwc_unlocked(input));
288
289 if(in_content && c == L'~')
290 {
291 if(result[result.size()-1] == L']') {
292 // We just saw the end of a wblank, may want to merge
293 wblankqueue.push(result);
294 }
295 else {
296 // wake-up-mark happened some characters into the wblanked word
297 fputws(result.c_str(), output);
298 }
299 return true;
300 }
301 else
302 {
303 result += c;
304 }
305
306 if(c == L'\\')
307 {
308 result += static_cast<wchar_t>(readEscaped(input));
309 }
310 else if(c == L']')
311 {
312 c = static_cast<wchar_t>(fgetwc_unlocked(input));
313 result += c;
314
315 if(c == L']')
316 {
317 int resultlen = result.size();
318 if(result[resultlen-5] == '[' && result[resultlen-4] == '[' && result[resultlen-3] == '/') //ending blank [[/]]
319 {
320 fputws(result.c_str(), output);
321 break;
322 }
323 else
324 {
325 in_content = true; // Assumption: No nested wblanks, always balanced
326 }
327 }
328 }
329 }
330
331 if(c != L']')
332 {
333 streamError();
334 }
335
336 return false;
337 }
338
339 int
readAnalysis(FILE * input)340 FSTProcessor::readAnalysis(FILE *input)
341 {
342 if(!input_buffer.isEmpty())
343 {
344 return input_buffer.next();
345 }
346
347 wchar_t val = static_cast<wchar_t>(fgetwc_unlocked(input));
348 int altval = 0;
349 if(feof(input))
350 {
351 input_buffer.add(0); // so it's treated like the NUL byte
352 return 0;
353 }
354
355 if((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end())
356 {
357 input_buffer.add(val);
358 val = static_cast<wchar_t>(fgetwc_unlocked(input));
359 }
360
361 if(escaped_chars.find(val) != escaped_chars.end())
362 {
363 switch(val)
364 {
365 case L'<':
366 altval = static_cast<int>(alphabet(readFullBlock(input, L'<', L'>')));
367 input_buffer.add(altval);
368 return altval;
369
370 case L'[':
371 val = static_cast<wchar_t>(fgetwc_unlocked(input));
372
373 if(val == L'[')
374 {
375 blankqueue.push(readWblank(input));
376 }
377 else
378 {
379 ungetwc_unlocked(val, input);
380 blankqueue.push(readFullBlock(input, L'[', L']'));
381 }
382
383 input_buffer.add(static_cast<int>(L' '));
384 return static_cast<int>(L' ');
385
386 case L'\\':
387 val = static_cast<wchar_t>(fgetwc_unlocked(input));
388 input_buffer.add(static_cast<int>(val));
389 return val;
390
391 default:
392 streamError();
393 }
394 }
395 if(val == L' ') {
396 blankqueue.push(L" ");
397 }
398
399 input_buffer.add(val);
400 return val;
401 }
402
403 int
readTMAnalysis(FILE * input)404 FSTProcessor::readTMAnalysis(FILE *input)
405 {
406 isLastBlankTM = false;
407 if(!input_buffer.isEmpty())
408 {
409 return input_buffer.next();
410 }
411
412 wchar_t val = static_cast<wchar_t>(fgetwc_unlocked(input));
413 int altval = 0;
414 if(feof(input))
415 {
416 return 0;
417 }
418
419 if(escaped_chars.find(val) != escaped_chars.end() || iswdigit(val))
420 {
421 switch(val)
422 {
423 case L'<':
424 altval = static_cast<int>(alphabet(readFullBlock(input, L'<', L'>')));
425 input_buffer.add(altval);
426 return altval;
427
428 case L'[':
429 val = static_cast<wchar_t>(fgetwc_unlocked(input));
430
431 if(val == L'[')
432 {
433 blankqueue.push(readWblank(input));
434 }
435 else
436 {
437 ungetwc_unlocked(val, input);
438 blankqueue.push(readFullBlock(input, L'[', L']'));
439 }
440
441 input_buffer.add(static_cast<int>(L' '));
442 isLastBlankTM = true;
443 return static_cast<int>(L' ');
444
445 case L'\\':
446 val = static_cast<wchar_t>(fgetwc_unlocked(input));
447 input_buffer.add(static_cast<int>(val));
448 return val;
449 case L'0':
450 case L'1':
451 case L'2':
452 case L'3':
453 case L'4':
454 case L'5':
455 case L'6':
456 case L'7':
457 case L'8':
458 case L'9':
459 {
460 wstring ws = L"";
461 do
462 {
463 ws += val;
464 val = static_cast<wchar_t>(fgetwc_unlocked(input));
465 } while(iswdigit(val));
466 ungetwc_unlocked(val, input);
467 input_buffer.add(alphabet(L"<n>"));
468 numbers.push_back(ws);
469 return alphabet(L"<n>");
470 }
471 break;
472
473 default:
474 streamError();
475 }
476 }
477
478 input_buffer.add(val);
479 return val;
480 }
481
482 int
readPostgeneration(FILE * input,FILE * output)483 FSTProcessor::readPostgeneration(FILE *input, FILE *output)
484 {
485 if(!input_buffer.isEmpty())
486 {
487 return input_buffer.next();
488 }
489
490 wchar_t val = static_cast<wchar_t>(fgetwc_unlocked(input));
491 int altval = 0;
492 is_wblank = false;
493 if(feof(input))
494 {
495 return 0;
496 }
497
498 switch(val)
499 {
500 case L'<':
501 altval = static_cast<int>(alphabet(readFullBlock(input, L'<', L'>')));
502 input_buffer.add(altval);
503 return altval;
504
505 case L'[':
506 val = static_cast<wchar_t>(fgetwc_unlocked(input));
507
508 if(val == L'[')
509 {
510 if(collect_wblanks)
511 {
512 wblankqueue.push(readWblank(input));
513 is_wblank = true;
514 return static_cast<int>(L' ');
515 }
516 else if(wblankPostGen(input, output))
517 {
518 return static_cast<int>(L'~');
519 }
520 else
521 {
522 is_wblank = true;
523 return static_cast<int>(L' ');
524 }
525 }
526 else
527 {
528 ungetwc_unlocked(val, input);
529 blankqueue.push(readFullBlock(input, L'[', L']'));
530
531 input_buffer.add(static_cast<int>(L' '));
532 return static_cast<int>(L' ');
533 }
534
535 case L'\\':
536 val = static_cast<wchar_t>(fgetwc_unlocked(input));
537 input_buffer.add(static_cast<int>(val));
538 return val;
539
540 default:
541 input_buffer.add(val);
542 return val;
543 }
544 }
545
546 void
skipUntil(FILE * input,FILE * output,wint_t const character)547 FSTProcessor::skipUntil(FILE *input, FILE *output, wint_t const character)
548 {
549 while(true)
550 {
551 wint_t val = fgetwc_unlocked(input);
552 if(feof(input))
553 {
554 return;
555 }
556
557 switch(val)
558 {
559 case L'\\':
560 val = fgetwc_unlocked(input);
561 if(feof(input))
562 {
563 return;
564 }
565 fputwc_unlocked(L'\\', output);
566 fputwc_unlocked(val, output);
567 break;
568
569 case L'\0':
570 fputwc_unlocked(val, output);
571 if(nullFlushGeneration)
572 {
573 fflush(output);
574 }
575 break;
576
577 default:
578 if(val == character)
579 {
580 return;
581 }
582 else
583 {
584 fputwc_unlocked(val, output);
585 }
586 break;
587 }
588 }
589 }
590
591 int
readGeneration(FILE * input,FILE * output)592 FSTProcessor::readGeneration(FILE *input, FILE *output)
593 {
594 wint_t val = fgetwc_unlocked(input);
595
596 if(feof(input))
597 {
598 return 0x7fffffff;
599 }
600
601 if(outOfWord)
602 {
603 if(val == L'^')
604 {
605 val = fgetwc_unlocked(input);
606 if(feof(input))
607 {
608 return 0x7fffffff;
609 }
610 }
611 else if(val == L'\\')
612 {
613 fputwc_unlocked(val, output);
614 val = fgetwc_unlocked(input);
615 if(feof(input))
616 {
617 return 0x7fffffff;
618 }
619 fputwc_unlocked(val,output);
620 skipUntil(input, output, L'^');
621 val = fgetwc_unlocked(input);
622 if(feof(input))
623 {
624 return 0x7fffffff;
625 }
626 }
627 else
628 {
629 fputwc_unlocked(val, output);
630 skipUntil(input, output, L'^');
631 val = fgetwc_unlocked(input);
632 if(feof(input))
633 {
634 return 0x7fffffff;
635 }
636 }
637 outOfWord = false;
638 }
639
640 if(val == L'\\')
641 {
642 val = fgetwc_unlocked(input);
643 return static_cast<int>(val);
644 }
645 else if(val == L'$')
646 {
647 outOfWord = true;
648 return static_cast<int>(L'$');
649 }
650 else if(val == L'<')
651 {
652 wstring cad = L"";
653 cad += static_cast<wchar_t>(val);
654
655 while((val = fgetwc_unlocked(input)) != L'>')
656 {
657 if(feof(input))
658 {
659 streamError();
660 }
661 cad += static_cast<wchar_t>(val);
662 }
663 cad += static_cast<wchar_t>(val);
664
665 return alphabet(cad);
666 }
667 else if(val == L'[')
668 {
669 val = fgetwc_unlocked(input);
670 if(val == L'[')
671 {
672 fputws_unlocked(readWblank(input).c_str(), output);
673 }
674 else
675 {
676 ungetwc_unlocked(val, input);
677 fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output);
678 }
679
680 return readGeneration(input, output);
681 }
682 else
683 {
684 return static_cast<int>(val);
685 }
686
687 return 0x7fffffff;
688 }
689
690 pair<wstring, int>
readBilingual(FILE * input,FILE * output)691 FSTProcessor::readBilingual(FILE *input, FILE *output)
692 {
693 wint_t val = fgetwc_unlocked(input);
694 wstring symbol = L"";
695
696 if(feof(input))
697 {
698 return pair<wstring, int>(symbol, 0x7fffffff);
699 }
700
701 if(outOfWord)
702 {
703 if(val == L'^')
704 {
705 val = fgetwc_unlocked(input);
706 if(feof(input))
707 {
708 return pair<wstring, int>(symbol, 0x7fffffff);
709 }
710 }
711 else if(val == L'\\')
712 {
713 fputwc_unlocked(val, output);
714 val = fgetwc_unlocked(input);
715 if(feof(input))
716 {
717 return pair<wstring, int>(symbol, 0x7fffffff);
718 }
719 fputwc_unlocked(val,output);
720 skipUntil(input, output, L'^');
721 val = fgetwc_unlocked(input);
722 if(feof(input))
723 {
724 return pair<wstring, int>(symbol, 0x7fffffff);
725 }
726 }
727 else
728 {
729 fputwc_unlocked(val, output);
730 skipUntil(input, output, L'^');
731 val = fgetwc_unlocked(input);
732 if(feof(input))
733 {
734 return pair<wstring, int>(symbol, 0x7fffffff);
735 }
736 }
737 outOfWord = false;
738 }
739
740 if(val == L'\\')
741 {
742 val = fgetwc_unlocked(input);
743 return pair<wstring, int>(symbol, val);
744 }
745 else if(val == L'$')
746 {
747 outOfWord = true;
748 return pair<wstring, int>(symbol, static_cast<int>(L'$'));
749 }
750 else if(val == L'<')
751 {
752 wstring cad = L"";
753 cad += static_cast<wchar_t>(val);
754 while((val = fgetwc_unlocked(input)) != L'>')
755 {
756 if(feof(input))
757 {
758 streamError();
759 }
760 cad += static_cast<wchar_t>(val);
761 }
762 cad += static_cast<wchar_t>(val);
763
764 int res = alphabet(cad);
765
766 if (res == 0)
767 {
768 symbol = cad;
769 }
770 return pair<wstring, int>(symbol, res);
771 }
772 else if(val == L'[')
773 {
774 val = fgetwc_unlocked(input);
775 if(val == L'[')
776 {
777 fputws_unlocked(readWblank(input).c_str(), output);
778 }
779 else
780 {
781 ungetwc_unlocked(val, input);
782 fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output);
783 }
784
785 return readBilingual(input, output);
786 }
787
788 return pair<wstring, int>(symbol, val);
789 }
790
791 void
flushBlanks(FILE * output)792 FSTProcessor::flushBlanks(FILE *output)
793 {
794 for(size_t i = blankqueue.size(); i > 0; i--)
795 {
796 fputws_unlocked(blankqueue.front().c_str(), output);
797 blankqueue.pop();
798 }
799 }
800
801 void
flushWblanks(FILE * output)802 FSTProcessor::flushWblanks(FILE *output)
803 {
804 while(wblankqueue.size() > 0)
805 {
806 fputws_unlocked(wblankqueue.front().c_str(), output);
807 wblankqueue.pop();
808 }
809 }
810
811 wstring
combineWblanks()812 FSTProcessor::combineWblanks()
813 {
814 wstring final_wblank;
815 wstring last_wblank = L"";
816 bool seen_wblank = false;
817
818 while(wblankqueue.size() > 0)
819 {
820 if(wblankqueue.front().compare(L"[[/]]") == 0)
821 {
822 if(seen_wblank) {
823 if(final_wblank.empty())
824 {
825 final_wblank += L"[[";
826 }
827 else if(final_wblank.size() > 2)
828 {
829 final_wblank += L"; ";
830 }
831
832 final_wblank += last_wblank.substr(2,last_wblank.size()-4); //add wblank without brackets [[..]]
833 }
834 else {
835 need_end_wblank = true;
836 }
837 last_wblank.clear();
838 }
839 else
840 {
841 seen_wblank = true;
842 last_wblank = wblankqueue.front();
843 }
844 wblankqueue.pop();
845 }
846
847 if(!last_wblank.empty())
848 {
849 wblankqueue.push(last_wblank);
850 }
851
852 if(!final_wblank.empty())
853 {
854 final_wblank += L"]]";
855 need_end_wblank = true;
856 }
857 return final_wblank;
858 }
859
860 void
calcInitial()861 FSTProcessor::calcInitial()
862 {
863 for(map<wstring, TransExe, Ltstr>::iterator it = transducers.begin(),
864 limit = transducers.end();
865 it != limit; it++)
866 {
867 root.addTransition(0, 0, it->second.getInitial(), default_weight);
868 }
869
870 initial_state.init(&root);
871 }
872
873 bool
endsWith(wstring const & str,wstring const & suffix)874 FSTProcessor::endsWith(wstring const &str, wstring const &suffix)
875 {
876 if(str.size() < suffix.size())
877 {
878 return false;
879 }
880 else
881 {
882 return str.substr(str.size()-suffix.size()) == suffix;
883 }
884 }
885
886 void
classifyFinals()887 FSTProcessor::classifyFinals()
888 {
889 for(map<wstring, TransExe, Ltstr>::iterator it = transducers.begin(),
890 limit = transducers.end();
891 it != limit; it++)
892 {
893 if(endsWith(it->first, L"@inconditional"))
894 {
895 inconditional.insert(it->second.getFinals().begin(),
896 it->second.getFinals().end());
897 }
898 else if(endsWith(it->first, L"@standard"))
899 {
900 standard.insert(it->second.getFinals().begin(),
901 it->second.getFinals().end());
902 }
903 else if(endsWith(it->first, L"@postblank"))
904 {
905 postblank.insert(it->second.getFinals().begin(),
906 it->second.getFinals().end());
907 }
908 else if(endsWith(it->first, L"@preblank"))
909 {
910 preblank.insert(it->second.getFinals().begin(),
911 it->second.getFinals().end());
912 }
913 else
914 {
915 wcerr << L"Error: Unsupported transducer type for '";
916 wcerr << it->first << L"'." << endl;
917 exit(EXIT_FAILURE);
918 }
919 }
920 }
921
922 void
writeEscaped(wstring const & str,FILE * output)923 FSTProcessor::writeEscaped(wstring const &str, FILE *output)
924 {
925 for(unsigned int i = 0, limit = str.size(); i < limit; i++)
926 {
927 if(escaped_chars.find(str[i]) != escaped_chars.end())
928 {
929 fputwc_unlocked(L'\\', output);
930 }
931 fputwc_unlocked(str[i], output);
932 }
933 }
934
935 size_t
writeEscapedPopBlanks(wstring const & str,FILE * output)936 FSTProcessor::writeEscapedPopBlanks(wstring const &str, FILE *output)
937 {
938 size_t postpop = 0;
939 for (unsigned int i = 0, limit = str.size(); i < limit; i++)
940 {
941 if (escaped_chars.find(str[i]) != escaped_chars.end()) {
942 fputwc_unlocked(L'\\', output);
943 }
944 fputwc_unlocked(str[i], output);
945 if (str[i] == L' ') {
946 if (blankqueue.front() == L" ") {
947 blankqueue.pop();
948 } else {
949 postpop++;
950 }
951 }
952 }
953 return postpop;
954 }
955
956 void
writeEscapedWithTags(wstring const & str,FILE * output)957 FSTProcessor::writeEscapedWithTags(wstring const &str, FILE *output)
958 {
959 for(unsigned int i = 0, limit = str.size(); i < limit; i++)
960 {
961 if(str[i] == L'<' && i >=1 && str[i-1] != L'\\')
962 {
963 fputws_unlocked(str.substr(i).c_str(), output);
964 return;
965 }
966
967 if(escaped_chars.find(str[i]) != escaped_chars.end())
968 {
969 fputwc_unlocked(L'\\', output);
970 }
971 fputwc_unlocked(str[i], output);
972 }
973 }
974
975
976
977 void
printWord(wstring const & sf,wstring const & lf,FILE * output)978 FSTProcessor::printWord(wstring const &sf, wstring const &lf, FILE *output)
979 {
980 fputwc_unlocked(L'^', output);
981 writeEscaped(sf, output);
982 fputws_unlocked(lf.c_str(), output);
983 fputwc_unlocked(L'$', output);
984 }
985
986 void
printWordPopBlank(wstring const & sf,wstring const & lf,FILE * output)987 FSTProcessor::printWordPopBlank(wstring const &sf, wstring const &lf, FILE *output)
988 {
989 fputwc_unlocked(L'^', output);
990 size_t postpop = writeEscapedPopBlanks(sf, output);
991 fputws_unlocked(lf.c_str(), output);
992 fputwc_unlocked(L'$', output);
993 while (postpop-- && blankqueue.size() > 0)
994 {
995 fputws(blankqueue.front().c_str(), output);
996 blankqueue.pop();
997 }
998 }
999
1000 void
printWordBilingual(wstring const & sf,wstring const & lf,FILE * output)1001 FSTProcessor::printWordBilingual(wstring const &sf, wstring const &lf, FILE *output)
1002 {
1003 fputwc_unlocked(L'^', output);
1004 fputws_unlocked(sf.c_str(), output);
1005 fputws_unlocked(lf.c_str(), output);
1006 fputwc_unlocked(L'$', output);
1007 }
1008
1009 void
printUnknownWord(wstring const & sf,FILE * output)1010 FSTProcessor::printUnknownWord(wstring const &sf, FILE *output)
1011 {
1012 fputwc_unlocked(L'^', output);
1013 writeEscaped(sf, output);
1014 fputwc_unlocked(L'/', output);
1015 fputwc_unlocked(L'*', output);
1016 writeEscaped(sf, output);
1017 fputwc_unlocked(L'$', output);
1018 }
1019
1020 unsigned int
lastBlank(wstring const & str)1021 FSTProcessor::lastBlank(wstring const &str)
1022 {
1023 for(int i = static_cast<int>(str.size())-1; i >= 0; i--)
1024 {
1025 if(alphabetic_chars.find(str[i]) == alphabetic_chars.end())
1026 {
1027 return static_cast<unsigned int>(i);
1028 }
1029 }
1030
1031 return 0;
1032 }
1033
1034 void
printSpace(wchar_t const val,FILE * output)1035 FSTProcessor::printSpace(wchar_t const val, FILE *output)
1036 {
1037 if(blankqueue.size() > 0)
1038 {
1039 flushBlanks(output);
1040 }
1041 else
1042 {
1043 fputwc_unlocked(val, output);
1044 }
1045 }
1046
1047 bool
isEscaped(wchar_t const c) const1048 FSTProcessor::isEscaped(wchar_t const c) const
1049 {
1050 return escaped_chars.find(c) != escaped_chars.end();
1051 }
1052
1053 bool
isAlphabetic(wchar_t const c) const1054 FSTProcessor::isAlphabetic(wchar_t const c) const
1055 {
1056 return (bool)std::iswalnum(c) || alphabetic_chars.find(c) != alphabetic_chars.end();
1057 }
1058
1059 void
load(FILE * input)1060 FSTProcessor::load(FILE *input)
1061 {
1062 fpos_t pos;
1063 if (fgetpos(input, &pos) == 0) {
1064 char header[4]{};
1065 fread(header, 1, 4, input);
1066 if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) {
1067 auto features = read_le<uint64_t>(input);
1068 if (features >= LTF_UNKNOWN) {
1069 throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!");
1070 }
1071 }
1072 else {
1073 // Old binary format
1074 fsetpos(input, &pos);
1075 }
1076 }
1077
1078 // letters
1079 int len = Compression::multibyte_read(input);
1080 while(len > 0)
1081 {
1082 alphabetic_chars.insert(static_cast<wchar_t>(Compression::multibyte_read(input)));
1083 len--;
1084 }
1085
1086 // symbols
1087 alphabet.read(input);
1088
1089 len = Compression::multibyte_read(input);
1090
1091 while(len > 0)
1092 {
1093 int len2 = Compression::multibyte_read(input);
1094 wstring name = L"";
1095 while(len2 > 0)
1096 {
1097 name += static_cast<wchar_t>(Compression::multibyte_read(input));
1098 len2--;
1099 }
1100 transducers[name].read(input, alphabet);
1101 len--;
1102 }
1103 }
1104
1105 void
lsx_wrapper_null_flush(FILE * input,FILE * output)1106 FSTProcessor::lsx_wrapper_null_flush(FILE *input, FILE *output)
1107 {
1108 setNullFlush(false);
1109 //nullFlushGeneration = true;
1110
1111 while(!feof(input))
1112 {
1113 lsx(input, output);
1114 fputwc_unlocked(L'\0', output);
1115 int code = fflush(output);
1116 if(code != 0)
1117 {
1118 wcerr << L"Could not flush output " << errno << endl;
1119 }
1120 }
1121 }
1122
1123 void
lsx(FILE * input,FILE * output)1124 FSTProcessor::lsx(FILE *input, FILE *output)
1125 {
1126 if(getNullFlush())
1127 {
1128 lsx_wrapper_null_flush(input, output);
1129 }
1130
1131 vector<State> new_states, alive_states;
1132 wstring blank, out, in, alt_out, alt_in;
1133 bool outOfWord = true;
1134 bool finalFound = false;
1135 bool plus_thing = false;
1136
1137 alive_states.push_back(initial_state);
1138
1139 int val = -1;
1140
1141 while(!feof(input) && val != 0)
1142 {
1143 val = fgetwc_unlocked(input);
1144
1145 if(val == L'+' && isEscaped(val) && !outOfWord)
1146 {
1147 val = L'$';
1148 plus_thing = true;
1149 }
1150
1151 if((val == L'^' && isEscaped(val) && outOfWord) || feof(input) || val == 0)
1152 {
1153 blankqueue.push(blank);
1154
1155 if(alive_states.size() == 0)
1156 {
1157 if(blankqueue.size() > 0)
1158 {
1159 fputws(blankqueue.front().c_str(), output);
1160 fflush(output);
1161 blankqueue.pop();
1162 }
1163
1164 alive_states.push_back(initial_state);
1165
1166 alt_in = L"";
1167 for(int i=0; i < (int) in.size(); i++) // FIXME indexing
1168 {
1169 alt_in += in[i];
1170 if(in[i] == L'$' && in[i+1] == L'^' && blankqueue.size() > 0)
1171 {
1172 // in.insert(i+1, blankqueue.front().c_str());
1173 alt_in += blankqueue.front().c_str();
1174 blankqueue.pop();
1175 }
1176 }
1177 in = alt_in;
1178 fputws(in.c_str(), output);
1179 fflush(output);
1180 in = L"";
1181 finalFound = false;
1182 }
1183 else if(finalFound && alive_states.size() == 1)
1184 {
1185 finalFound = false;
1186 }
1187
1188 blank = L"";
1189 in += val;
1190 outOfWord = false;
1191 continue;
1192 }
1193
1194 // wcerr << L"\n[!] " << (wchar_t)val << L" ||| " << outOfWord << endl;
1195
1196 if(outOfWord)
1197 {
1198 blank += val;
1199 continue;
1200 }
1201
1202 if((val == 0 || feof(input) || val == L'$') && !outOfWord) // && isEscaped(val)
1203 {
1204 new_states.clear();
1205 for(vector<State>::const_iterator it = alive_states.begin(); it != alive_states.end(); it++)
1206 {
1207 State s = *it;
1208 //wcerr << endl << L"[0] FEOF | $ | " << s.size() << L" | " << s.isFinal(all_finals) << endl;
1209 s.step(alphabet(L"<$>"));
1210 //wcerr << endl << L"[1] FEOF | $ | " << s.size() << L" | " << s.isFinal(all_finals) << endl;
1211 if(s.size() > 0)
1212 {
1213 new_states.push_back(s);
1214 }
1215
1216 /*if(s.isFinal(all_finals))
1217 {
1218 out += s.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses);
1219 new_states.push_back(*initial_state);
1220 }*/
1221
1222 if(s.isFinal(all_finals))
1223 {
1224 new_states.clear();
1225 new_states.push_back(initial_state);
1226 out = s.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses);
1227
1228 alt_out = L"";
1229 for (int i=0; i < (int) out.size(); i++)
1230 {
1231 wchar_t c = out.at(i);
1232 if(c == L'/')
1233 {
1234 alt_out += L'^';
1235 }
1236 else if(out[i-1] == L'<' && c == L'$' && out[i+1] == L'>') // indexing
1237 {
1238 alt_out += c;
1239 alt_out += L'^';
1240 }
1241 else if(!(c == L'<' && out[i+1] == L'$' && out[i+2] == L'>') && !(out[i-2] == L'<' && out[i-1] == L'$' && c == L'>'))
1242 {
1243 alt_out += c;
1244 }
1245 }
1246 out = alt_out;
1247
1248
1249 if(out[out.length()-1] == L'^')
1250 {
1251 out = out.substr(0, out.length()-1); // extra ^ at the end
1252 if(plus_thing)
1253 {
1254 out[out.size()-1] = L'+';
1255 plus_thing = false;
1256 }
1257 }
1258 else // take# out ... of
1259 {
1260 for(int i=out.length()-1; i>=0; i--) // indexing
1261 {
1262 if(out.at(i) == L'$')
1263 {
1264 out.insert(i+1, L" ");
1265 break;
1266 }
1267 }
1268 out += L'$';
1269 }
1270
1271 if(blankqueue.size() > 0)
1272 {
1273 fputws(blankqueue.front().c_str(), output);
1274 blankqueue.pop();
1275 }
1276
1277 alt_out = L"";
1278 for(int i=0; i < (int) out.size(); i++) // indexing
1279 {
1280 if((out.at(i) == L'$') && blankqueue.size() > 0)
1281 {
1282 alt_out += out.at(i);
1283 alt_out += blankqueue.front().c_str();
1284 blankqueue.pop();
1285 }
1286 else if((out.at(i) == L'$') && blankqueue.size() == 0 && i != (int) out.size()-1)
1287 {
1288 alt_out += out.at(i);
1289 alt_out += L' ';
1290 }
1291 else if(out.at(i) == L' ' && blankqueue.size() > 0)
1292 {
1293 alt_out += blankqueue.front().c_str();
1294 blankqueue.pop();
1295 }
1296 else
1297 {
1298 alt_out += out.at(i);
1299 }
1300 }
1301 out = alt_out;
1302
1303 fputws(out.c_str(), output);
1304 flushBlanks(output);
1305 finalFound = true;
1306 out = L"";
1307 in = L"";
1308 }
1309 }
1310
1311 alive_states.swap(new_states);
1312 outOfWord = true;
1313
1314 if(!finalFound)
1315 {
1316 in += val; //do not remove
1317 }
1318 continue;
1319 }
1320
1321 if(!outOfWord) // && (!(feof(input) || val == L'$')))
1322 {
1323 if(val == L'<') // tag
1324 {
1325 wstring tag = readFullBlock(input, L'<', L'>');
1326 in += tag;
1327 if(!alphabet.isSymbolDefined(tag))
1328 {
1329 alphabet.includeSymbol(tag);
1330 }
1331 val = static_cast<int>(alphabet(tag));
1332 }
1333 else
1334 {
1335 in += (wchar_t) val;
1336 }
1337
1338 new_states.clear();
1339 for(vector<State>::const_iterator it = alive_states.begin(); it != alive_states.end(); it++)
1340 {
1341 State s = *it;
1342 if(val < 0)
1343 {
1344 s.step_override(val, alphabet(L"<ANY_TAG>"), val);
1345 }
1346 else if(val > 0)
1347 {
1348 int val_lowercase = towlower(val);
1349 s.step_override(val_lowercase, alphabet(L"<ANY_CHAR>"), val); // FIXME deal with cases! in step_override
1350 }
1351
1352 if(s.size() > 0)
1353 {
1354 new_states.push_back(s);
1355 }
1356
1357 }
1358 alive_states.swap(new_states);
1359 }
1360 }
1361
1362 flushBlanks(output);
1363 }
1364
1365 void
initAnalysis()1366 FSTProcessor::initAnalysis()
1367 {
1368 calcInitial();
1369 classifyFinals();
1370 all_finals = standard;
1371 all_finals.insert(inconditional.begin(), inconditional.end());
1372 all_finals.insert(postblank.begin(), postblank.end());
1373 all_finals.insert(preblank.begin(), preblank.end());
1374 }
1375
1376 void
initTMAnalysis()1377 FSTProcessor::initTMAnalysis()
1378 {
1379 calcInitial();
1380
1381 for(map<wstring, TransExe, Ltstr>::iterator it = transducers.begin(),
1382 limit = transducers.end();
1383 it != limit; it++)
1384 {
1385 all_finals.insert(it->second.getFinals().begin(),
1386 it->second.getFinals().end());
1387 }
1388 }
1389
1390 void
initGeneration()1391 FSTProcessor::initGeneration()
1392 {
1393 setIgnoredChars(false);
1394 calcInitial();
1395 for(map<wstring, TransExe, Ltstr>::iterator it = transducers.begin(),
1396 limit = transducers.end();
1397 it != limit; it++)
1398 {
1399 all_finals.insert(it->second.getFinals().begin(),
1400 it->second.getFinals().end());
1401 }
1402 }
1403
1404 void
initPostgeneration()1405 FSTProcessor::initPostgeneration()
1406 {
1407 initGeneration();
1408 }
1409
1410 void
initBiltrans()1411 FSTProcessor::initBiltrans()
1412 {
1413 initGeneration();
1414 }
1415
1416
1417 wstring
compoundAnalysis(wstring input_word,bool uppercase,bool firstupper)1418 FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupper)
1419 {
1420 const int MAX_COMBINATIONS = 32767;
1421
1422 State current_state = initial_state;
1423
1424 for(unsigned int i=0; i<input_word.size(); i++)
1425 {
1426 wchar_t val=input_word.at(i);
1427
1428 current_state.step_case(val, caseSensitive);
1429
1430 if(current_state.size() > MAX_COMBINATIONS)
1431 {
1432 wcerr << L"Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << L"'" << endl;
1433 wcerr << L" gave up at char " << i << L" '" << val << L"'." << endl;
1434
1435 wstring nullString = L"";
1436 return nullString;
1437 }
1438
1439 if(i < input_word.size()-1)
1440 {
1441 current_state.restartFinals(all_finals, compoundOnlyLSymbol, &initial_state, '+');
1442 }
1443
1444 if(current_state.size()==0)
1445 {
1446 wstring nullString = L"";
1447 return nullString;
1448 }
1449 }
1450
1451 current_state.pruneCompounds(compoundRSymbol, '+', compound_max_elements);
1452 wstring result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper);
1453
1454 return result;
1455 }
1456
1457
1458
1459 void
initDecompositionSymbols()1460 FSTProcessor::initDecompositionSymbols()
1461 {
1462 if((compoundOnlyLSymbol=alphabet(L"<:co:only-L>")) == 0
1463 && (compoundOnlyLSymbol=alphabet(L"<:compound:only-L>")) == 0
1464 && (compoundOnlyLSymbol=alphabet(L"<@co:only-L>")) == 0
1465 && (compoundOnlyLSymbol=alphabet(L"<@compound:only-L>")) == 0
1466 && (compoundOnlyLSymbol=alphabet(L"<compound-only-L>")) == 0)
1467 {
1468 wcerr << L"Warning: Decomposition symbol <:compound:only-L> not found" << endl;
1469 }
1470 else if(!showControlSymbols)
1471 {
1472 alphabet.setSymbol(compoundOnlyLSymbol, L"");
1473 }
1474
1475 if((compoundRSymbol=alphabet(L"<:co:R>")) == 0
1476 && (compoundRSymbol=alphabet(L"<:compound:R>")) == 0
1477 && (compoundRSymbol=alphabet(L"<@co:R>")) == 0
1478 && (compoundRSymbol=alphabet(L"<@compound:R>")) == 0
1479 && (compoundRSymbol=alphabet(L"<compound-R>")) == 0)
1480 {
1481 wcerr << L"Warning: Decomposition symbol <:compound:R> not found" << endl;
1482 }
1483 else if(!showControlSymbols)
1484 {
1485 alphabet.setSymbol(compoundRSymbol, L"");
1486 }
1487 }
1488
1489
1490 void
initDecomposition()1491 FSTProcessor::initDecomposition()
1492 {
1493 do_decomposition = true;
1494 initAnalysis();
1495 initDecompositionSymbols();
1496 }
1497
1498 void
analysis(FILE * input,FILE * output)1499 FSTProcessor::analysis(FILE *input, FILE *output)
1500 {
1501 if(getNullFlush())
1502 {
1503 analysis_wrapper_null_flush(input, output);
1504 }
1505
1506 bool last_incond = false;
1507 bool last_postblank = false;
1508 bool last_preblank = false;
1509 State current_state = initial_state;
1510 wstring lf = L""; //lexical form
1511 wstring sf = L""; //surface form
1512 int last = 0;
1513 bool firstupper = false, uppercase = false;
1514 map<int, set<int> >::iterator rcx_map_ptr;
1515
1516 wchar_t val;
1517 do
1518 {
1519 val = readAnalysis(input);
1520 // test for final states
1521 if(current_state.isFinal(all_finals))
1522 {
1523 if(current_state.isFinal(inconditional))
1524 {
1525 if(!dictionaryCase)
1526 {
1527 firstupper = iswupper(sf[0]);
1528 uppercase = firstupper && iswupper(sf[sf.size()-1]);
1529 }
1530
1531 if(do_decomposition && compoundOnlyLSymbol != 0)
1532 {
1533 current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
1534 }
1535 lf = current_state.filterFinals(all_finals, alphabet,
1536 escaped_chars,
1537 displayWeightsMode, maxAnalyses, maxWeightClasses,
1538 uppercase, firstupper);
1539 last_incond = true;
1540 last = input_buffer.getPos();
1541 }
1542 else if(current_state.isFinal(postblank))
1543 {
1544 if(!dictionaryCase)
1545 {
1546 firstupper = iswupper(sf[0]);
1547 uppercase = firstupper && iswupper(sf[sf.size()-1]);
1548 }
1549
1550 if(do_decomposition && compoundOnlyLSymbol != 0)
1551 {
1552 current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
1553 }
1554 lf = current_state.filterFinals(all_finals, alphabet,
1555 escaped_chars,
1556 displayWeightsMode, maxAnalyses, maxWeightClasses,
1557 uppercase, firstupper);
1558 last_postblank = true;
1559 last = input_buffer.getPos();
1560 }
1561 else if(current_state.isFinal(preblank))
1562 {
1563 if(!dictionaryCase)
1564 {
1565 firstupper = iswupper(sf[0]);
1566 uppercase = firstupper && iswupper(sf[sf.size()-1]);
1567 }
1568
1569 if(do_decomposition && compoundOnlyLSymbol != 0)
1570 {
1571 current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
1572 }
1573 lf = current_state.filterFinals(all_finals, alphabet,
1574 escaped_chars,
1575 displayWeightsMode, maxAnalyses, maxWeightClasses,
1576 uppercase, firstupper);
1577 last_preblank = true;
1578 last = input_buffer.getPos();
1579 }
1580 else if(!isAlphabetic(val))
1581 {
1582 if(!dictionaryCase)
1583 {
1584 firstupper = iswupper(sf[0]);
1585 uppercase = firstupper && iswupper(sf[sf.size()-1]);
1586 }
1587
1588 if(do_decomposition && compoundOnlyLSymbol != 0)
1589 {
1590 current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
1591 }
1592 lf = current_state.filterFinals(all_finals, alphabet,
1593 escaped_chars,
1594 displayWeightsMode, maxAnalyses, maxWeightClasses,
1595 uppercase, firstupper);
1596 last_postblank = false;
1597 last_preblank = false;
1598 last_incond = false;
1599 last = input_buffer.getPos();
1600 }
1601 }
1602 else if(sf == L"" && iswspace(val))
1603 {
1604 lf = L"/*";
1605 lf.append(sf);
1606 last_postblank = false;
1607 last_preblank = false;
1608 last_incond = false;
1609 last = input_buffer.getPos();
1610 }
1611
1612 if(useRestoreChars && rcx_map.find(val) != rcx_map.end())
1613 {
1614 rcx_map_ptr = rcx_map.find(val);
1615 set<int> tmpset = rcx_map_ptr->second;
1616 if(!iswupper(val) || caseSensitive)
1617 {
1618 current_state.step(val, tmpset);
1619 }
1620 else if(rcx_map.find(towlower(val)) != rcx_map.end())
1621 {
1622 rcx_map_ptr = rcx_map.find(tolower(val));
1623 tmpset.insert(tolower(val));
1624 tmpset.insert(rcx_map_ptr->second.begin(), rcx_map_ptr->second.end());
1625 current_state.step(val, tmpset);
1626 }
1627 else
1628 {
1629 tmpset.insert(tolower(val));
1630 current_state.step(val, tmpset);
1631 }
1632 }
1633 else
1634 {
1635 if(!iswupper(val) || caseSensitive)
1636 {
1637 current_state.step(val);
1638 }
1639 else
1640 {
1641 current_state.step(val, towlower(val));
1642 }
1643 }
1644
1645 if(current_state.size() != 0)
1646 {
1647 if(val != 0)
1648 {
1649 alphabet.getSymbol(sf, val);
1650 }
1651 }
1652 else
1653 {
1654 if(!isAlphabetic(val) && sf == L"")
1655 {
1656 if(iswspace(val))
1657 {
1658 if (blankqueue.size() > 0)
1659 {
1660 fputws_unlocked(blankqueue.front().c_str(), output);
1661 blankqueue.pop();
1662 }
1663 else
1664 {
1665 fputwc_unlocked(val, output);
1666 }
1667 }
1668 else
1669 {
1670 if(isEscaped(val))
1671 {
1672 fputwc_unlocked(L'\\', output);
1673 }
1674 if(val)
1675 {
1676 fputwc_unlocked(val, output);
1677 }
1678 }
1679 }
1680 else if(last_postblank)
1681 {
1682 printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)),
1683 lf, output);
1684 fputwc_unlocked(L' ', output);
1685 input_buffer.setPos(last);
1686 input_buffer.back(1);
1687 }
1688 else if(last_preblank)
1689 {
1690 fputwc_unlocked(L' ', output);
1691 printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)),
1692 lf, output);
1693 input_buffer.setPos(last);
1694 input_buffer.back(1);
1695 }
1696 else if(last_incond)
1697 {
1698 printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)),
1699 lf, output);
1700 input_buffer.setPos(last);
1701 input_buffer.back(1);
1702 }
1703 else if(isAlphabetic(val) &&
1704 ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) ||
1705 lf == L""))
1706 {
1707 do
1708 {
1709 alphabet.getSymbol(sf, val);
1710 }
1711 while((val = readAnalysis(input)) && isAlphabetic(val));
1712
1713 unsigned int limit = firstNotAlpha(sf);
1714 unsigned int size = sf.size();
1715 limit = (limit == static_cast<unsigned int>(wstring::npos)?size:limit);
1716 if(limit == 0)
1717 {
1718 input_buffer.back(sf.size());
1719 writeEscaped(sf.substr(0,1), output);
1720 }
1721 else
1722 {
1723 input_buffer.back(1+(size-limit));
1724 wstring unknown_word = sf.substr(0, limit);
1725 if(do_decomposition)
1726 {
1727 if(!dictionaryCase)
1728 {
1729 firstupper = iswupper(sf[0]);
1730 uppercase = firstupper && iswupper(sf[sf.size()-1]);
1731 }
1732
1733 wstring compound = L"";
1734 compound = compoundAnalysis(unknown_word, uppercase, firstupper);
1735 if(compound != L"")
1736 {
1737 printWord(unknown_word, compound, output);
1738 }
1739 else
1740 {
1741 printUnknownWord(unknown_word, output);
1742 }
1743 }
1744 else
1745 {
1746 printUnknownWord(unknown_word, output);
1747 }
1748 }
1749 }
1750 else if(lf == L"")
1751 {
1752 unsigned int limit = firstNotAlpha(sf);
1753 unsigned int size = sf.size();
1754 limit = (limit == static_cast<unsigned int >(wstring::npos)?size:limit);
1755 if(limit == 0)
1756 {
1757 input_buffer.back(sf.size());
1758 writeEscaped(sf.substr(0,1), output);
1759 }
1760 else
1761 {
1762 input_buffer.back(1+(size-limit));
1763 wstring unknown_word = sf.substr(0, limit);
1764 if(do_decomposition)
1765 {
1766 if(!dictionaryCase)
1767 {
1768 firstupper = iswupper(sf[0]);
1769 uppercase = firstupper && iswupper(sf[sf.size()-1]);
1770 }
1771
1772 wstring compound = L"";
1773 compound = compoundAnalysis(unknown_word, uppercase, firstupper);
1774 if(compound != L"")
1775 {
1776 printWord(unknown_word, compound, output);
1777 }
1778 else
1779 {
1780 printUnknownWord(unknown_word, output);
1781 }
1782 }
1783 else
1784 {
1785 printUnknownWord(unknown_word, output);
1786 }
1787
1788 }
1789 }
1790 else
1791 {
1792 printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)),
1793 lf, output);
1794 input_buffer.setPos(last);
1795 input_buffer.back(1);
1796 }
1797 if(val == 0) {
1798 if(!input_buffer.isEmpty()) {
1799 input_buffer.setPos(last+1);
1800 }
1801 }
1802
1803 current_state = initial_state;
1804 lf = L"";
1805 sf = L"";
1806 last_incond = false;
1807 last_postblank = false;
1808 last_preblank = false;
1809 }
1810 }
1811 while(val);
1812
1813 // print remaining blanks
1814 flushBlanks(output);
1815 }
1816
1817 void
analysis_wrapper_null_flush(FILE * input,FILE * output)1818 FSTProcessor::analysis_wrapper_null_flush(FILE *input, FILE *output)
1819 {
1820 setNullFlush(false);
1821 while(!feof(input))
1822 {
1823 analysis(input, output);
1824 fputwc_unlocked(L'\0', output);
1825 int code = fflush(output);
1826 if(code != 0)
1827 {
1828 wcerr << L"Could not flush output " << errno << endl;
1829 }
1830 }
1831 }
1832
1833 void
generation_wrapper_null_flush(FILE * input,FILE * output,GenerationMode mode)1834 FSTProcessor::generation_wrapper_null_flush(FILE *input, FILE *output,
1835 GenerationMode mode)
1836 {
1837 setNullFlush(false);
1838 nullFlushGeneration = true;
1839
1840 while(!feof(input))
1841 {
1842 generation(input, output, mode);
1843 fputwc_unlocked(L'\0', output);
1844 int code = fflush(output);
1845 if(code != 0)
1846 {
1847 wcerr << L"Could not flush output " << errno << endl;
1848 }
1849 }
1850 }
1851
1852 void
postgeneration_wrapper_null_flush(FILE * input,FILE * output)1853 FSTProcessor::postgeneration_wrapper_null_flush(FILE *input, FILE *output)
1854 {
1855 setNullFlush(false);
1856 while(!feof(input))
1857 {
1858 postgeneration(input, output);
1859 fputwc_unlocked(L'\0', output);
1860 int code = fflush(output);
1861 if(code != 0)
1862 {
1863 wcerr << L"Could not flush output " << errno << endl;
1864 }
1865 }
1866 }
1867
1868 void
intergeneration_wrapper_null_flush(FILE * input,FILE * output)1869 FSTProcessor::intergeneration_wrapper_null_flush(FILE *input, FILE *output)
1870 {
1871 setNullFlush(false);
1872 while (!feof(input))
1873 {
1874 intergeneration(input, output);
1875 fputwc_unlocked(L'\0', output);
1876 int code = fflush(output);
1877 if (code != 0)
1878 {
1879 wcerr << L"Could not flush output " << errno << endl;
1880 }
1881 }
1882 }
1883
1884 void
transliteration_wrapper_null_flush(FILE * input,FILE * output)1885 FSTProcessor::transliteration_wrapper_null_flush(FILE *input, FILE *output)
1886 {
1887 setNullFlush(false);
1888 while(!feof(input))
1889 {
1890 transliteration(input, output);
1891 fputwc_unlocked(L'\0', output);
1892 int code = fflush(output);
1893 if(code != 0)
1894 {
1895 wcerr << L"Could not flush output " << errno << endl;
1896 }
1897 }
1898 }
1899
1900 void
tm_analysis(FILE * input,FILE * output)1901 FSTProcessor::tm_analysis(FILE *input, FILE *output)
1902 {
1903 State current_state = initial_state;
1904 wstring lf = L""; //lexical form
1905 wstring sf = L""; //surface form
1906 int last = 0;
1907
1908 while(wchar_t val = readTMAnalysis(input))
1909 {
1910 // test for final states
1911 if(current_state.isFinal(all_finals))
1912 {
1913 if(iswpunct(val))
1914 {
1915 lf = current_state.filterFinalsTM(all_finals, alphabet,
1916 escaped_chars,
1917 blankqueue, numbers).substr(1);
1918 last = input_buffer.getPos();
1919 numbers.clear();
1920 }
1921 }
1922 else if(sf == L"" && iswspace(val))
1923 {
1924 lf.append(sf);
1925 last = input_buffer.getPos();
1926 }
1927
1928 if(!iswupper(val))
1929 {
1930 current_state.step(val);
1931 }
1932 else
1933 {
1934 current_state.step(val, towlower(val));
1935 }
1936
1937 if(current_state.size() != 0)
1938 {
1939 if(val == -1)
1940 {
1941 sf.append(numbers[numbers.size()-1]);
1942 }
1943 else if(isLastBlankTM && val == L' ')
1944 {
1945 sf.append(blankqueue.back());
1946 }
1947 else
1948 {
1949 alphabet.getSymbol(sf, val);
1950 }
1951 }
1952 else
1953 {
1954 if((iswspace(val) || iswpunct(val)) && sf == L"")
1955 {
1956 if(iswspace(val))
1957 {
1958 printSpace(val, output);
1959 }
1960 else
1961 {
1962 if(isEscaped(val))
1963 {
1964 fputwc_unlocked(L'\\', output);
1965 }
1966 fputwc_unlocked(val, output);
1967 }
1968 }
1969 else if(!iswspace(val) && !iswpunct(val) &&
1970 ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) ||
1971 lf == L""))
1972 {
1973
1974 do
1975 {
1976 if(val == -1)
1977 {
1978 sf.append(numbers[numbers.size()-1]);
1979 }
1980 else if(isLastBlankTM && val == L' ')
1981 {
1982 sf.append(blankqueue.back());
1983 }
1984 else
1985 {
1986 alphabet.getSymbol(sf, val);
1987 }
1988 }
1989 while((val = readTMAnalysis(input)) && !iswspace(val) && !iswpunct(val));
1990
1991 if(val == 0)
1992 {
1993 fputws_unlocked(sf.c_str(), output);
1994 return;
1995 }
1996
1997 input_buffer.back(1);
1998 fputws_unlocked(sf.c_str(), output);
1999
2000 while(blankqueue.size() > 0)
2001 {
2002 if(blankqueue.size() == 1 && isLastBlankTM)
2003 {
2004 break;
2005 }
2006 blankqueue.pop();
2007 }
2008
2009 /*
2010 unsigned int limit = sf.find(L' ');
2011 unsigned int size = sf.size();
2012 limit = (limit == static_cast<unsigned int>(wstring::npos)?size:limit);
2013 input_buffer.back(1+(size-limit));
2014 fputws_unlocked(sf.substr(0, limit).c_str(), output);
2015 */ }
2016 else if(lf == L"")
2017 {
2018 /* unsigned int limit = sf.find(L' ');
2019 unsigned int size = sf.size();
2020 limit = (limit == static_cast<unsigned int >(wstring::npos)?size:limit);
2021 input_buffer.back(1+(size-limit));
2022 fputws_unlocked(sf.substr(0, limit).c_str(), output);
2023 */
2024 input_buffer.back(1);
2025 fputws_unlocked(sf.c_str(), output);
2026
2027 while(blankqueue.size() > 0)
2028 {
2029 if(blankqueue.size() == 1 && isLastBlankTM)
2030 {
2031 break;
2032 }
2033 blankqueue.pop();
2034 }
2035
2036 }
2037 else
2038 {
2039 fputwc_unlocked(L'[', output);
2040 fputws_unlocked(lf.c_str(), output);
2041 fputwc_unlocked(L']', output);
2042 input_buffer.setPos(last);
2043 input_buffer.back(1);
2044 }
2045
2046 current_state = initial_state;
2047 lf = L"";
2048 sf = L"";
2049 }
2050 }
2051
2052 // print remaining blanks
2053 flushBlanks(output);
2054 }
2055
2056
2057 void
generation(FILE * input,FILE * output,GenerationMode mode)2058 FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode)
2059 {
2060 if(getNullFlush())
2061 {
2062 generation_wrapper_null_flush(input, output, mode);
2063 }
2064
2065 State current_state = initial_state;
2066 wstring sf = L"";
2067
2068 outOfWord = false;
2069
2070 skipUntil(input, output, L'^');
2071 int val;
2072
2073 while((val = readGeneration(input, output)) != 0x7fffffff)
2074 {
2075 if(sf == L"" && val == L'=')
2076 {
2077 fputwc(L'=', output);
2078 val = readGeneration(input, output);
2079 }
2080
2081 if(val == L'$' && outOfWord)
2082 {
2083 if(sf[0] == L'*' || sf[0] == L'%')
2084 {
2085 if(mode != gm_clean && mode != gm_tagged_nm)
2086 {
2087 writeEscaped(sf, output);
2088 }
2089 else if (mode == gm_clean)
2090 {
2091 writeEscaped(sf.substr(1), output);
2092 }
2093 else if(mode == gm_tagged_nm)
2094 {
2095 fputwc_unlocked(L'^', output);
2096 writeEscaped(removeTags(sf.substr(1)), output);
2097 fputwc_unlocked(L'/', output);
2098 writeEscapedWithTags(sf, output);
2099 fputwc_unlocked(L'$', output);
2100 }
2101 }
2102 else if(sf[0] == L'@')
2103 {
2104 if(mode == gm_all)
2105 {
2106 writeEscaped(sf, output);
2107 }
2108 else if(mode == gm_clean)
2109 {
2110 writeEscaped(removeTags(sf.substr(1)), output);
2111 }
2112 else if(mode == gm_unknown)
2113 {
2114 writeEscaped(removeTags(sf), output);
2115 }
2116 else if(mode == gm_tagged)
2117 {
2118 writeEscaped(removeTags(sf), output);
2119 }
2120 else if(mode == gm_tagged_nm)
2121 {
2122 fputwc_unlocked(L'^', output);
2123 writeEscaped(removeTags(sf.substr(1)), output);
2124 fputwc_unlocked(L'/', output);
2125 writeEscapedWithTags(sf, output);
2126 fputwc_unlocked(L'$', output);
2127 }
2128 }
2129 else if(current_state.isFinal(all_finals))
2130 {
2131 bool firstupper = false, uppercase = false;
2132 if(!dictionaryCase)
2133 {
2134 uppercase = sf.size() > 1 && iswupper(sf[1]);
2135 firstupper= iswupper(sf[0]);
2136 }
2137
2138 if(mode == gm_tagged || mode == gm_tagged_nm)
2139 {
2140 fputwc_unlocked(L'^', output);
2141 }
2142
2143 fputws_unlocked(current_state.filterFinals(all_finals, alphabet,
2144 escaped_chars,
2145 displayWeightsMode, maxAnalyses, maxWeightClasses,
2146 uppercase, firstupper).substr(1).c_str(), output);
2147 if(mode == gm_tagged || mode == gm_tagged_nm)
2148 {
2149 fputwc_unlocked(L'/', output);
2150 writeEscapedWithTags(sf, output);
2151 fputwc_unlocked(L'$', output);
2152 }
2153
2154 }
2155 else
2156 {
2157 if(mode == gm_all)
2158 {
2159 fputwc_unlocked(L'#', output);
2160 writeEscaped(sf, output);
2161 }
2162 else if(mode == gm_clean)
2163 {
2164 writeEscaped(removeTags(sf), output);
2165 }
2166 else if(mode == gm_unknown)
2167 {
2168 if(sf != L"")
2169 {
2170 fputwc_unlocked(L'#', output);
2171 writeEscaped(removeTags(sf), output);
2172 }
2173 }
2174 else if(mode == gm_tagged)
2175 {
2176 fputwc_unlocked(L'#', output);
2177 writeEscaped(removeTags(sf), output);
2178 }
2179 else if(mode == gm_tagged_nm)
2180 {
2181 fputwc_unlocked(L'^', output);
2182 writeEscaped(removeTags(sf), output);
2183 fputwc_unlocked(L'/', output);
2184 fputwc_unlocked(L'#', output);
2185 writeEscapedWithTags(sf, output);
2186 fputwc_unlocked(L'$', output);
2187 }
2188 }
2189
2190 current_state = initial_state;
2191 sf = L"";
2192 }
2193 else if(iswspace(val) && sf.size() == 0)
2194 {
2195 // do nothing
2196 }
2197 else if(sf.size() > 0 && (sf[0] == L'*' || sf[0] == L'%' ))
2198 {
2199 alphabet.getSymbol(sf, val);
2200 }
2201 else
2202 {
2203 alphabet.getSymbol(sf,val);
2204 if(current_state.size() > 0)
2205 {
2206 if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive)
2207 {
2208 if(mode == gm_carefulcase)
2209 {
2210 current_state.step_careful(val, towlower(val));
2211 }
2212 else
2213 {
2214 current_state.step(val, towlower(val));
2215 }
2216 }
2217 else
2218 {
2219 current_state.step(val);
2220 }
2221 }
2222 }
2223 }
2224 }
2225
2226 void
postgeneration(FILE * input,FILE * output)2227 FSTProcessor::postgeneration(FILE *input, FILE *output)
2228 {
2229 if(getNullFlush())
2230 {
2231 postgeneration_wrapper_null_flush(input, output);
2232 }
2233
2234 bool skip_mode = true;
2235 collect_wblanks = false;
2236 need_end_wblank = false;
2237 State current_state = initial_state;
2238 wstring lf = L"";
2239 wstring sf = L"";
2240 int last = 0;
2241 set<wchar_t> empty_escaped_chars;
2242
2243 while(wchar_t val = readPostgeneration(input, output))
2244 {
2245 if(val == L'~')
2246 {
2247 skip_mode = false;
2248 collect_wblanks = true;
2249 }
2250
2251 if(is_wblank && skip_mode)
2252 {
2253 //do nothing
2254 }
2255 else if(skip_mode)
2256 {
2257 if(iswspace(val))
2258 {
2259 if(need_end_wblank)
2260 {
2261 fputws_unlocked(L"[[/]]", output);
2262 need_end_wblank = false;
2263 }
2264
2265 printSpace(val, output);
2266 }
2267 else
2268 {
2269 if(!need_end_wblank)
2270 {
2271 flushWblanks(output);
2272 }
2273
2274 if(isEscaped(val))
2275 {
2276 fputwc_unlocked(L'\\', output);
2277 }
2278 fputwc_unlocked(val, output);
2279
2280 if(need_end_wblank)
2281 {
2282 fputws_unlocked(L"[[/]]", output);
2283 need_end_wblank = false;
2284 }
2285 }
2286 }
2287 else
2288 {
2289 if(is_wblank)
2290 {
2291 continue;
2292 }
2293
2294 // test for final states
2295 if(current_state.isFinal(all_finals))
2296 {
2297 bool firstupper = iswupper(sf[1]);
2298 bool uppercase = sf.size() > 1 && firstupper && iswupper(sf[2]);
2299 lf = current_state.filterFinals(all_finals, alphabet,
2300 empty_escaped_chars,
2301 displayWeightsMode, maxAnalyses, maxWeightClasses,
2302 uppercase, firstupper, 0);
2303
2304 // case of the beggining of the next word
2305
2306 wstring mybuf = L"";
2307 for(size_t i = sf.size(); i > 0; --i)
2308 {
2309 if(!isalpha(sf[i-1]))
2310 {
2311 break;
2312 }
2313 else
2314 {
2315 mybuf = sf[i-1] + mybuf;
2316 }
2317 }
2318
2319 if(mybuf.size() > 0)
2320 {
2321 bool myfirstupper = iswupper(mybuf[0]);
2322 bool myuppercase = mybuf.size() > 1 && iswupper(mybuf[1]);
2323
2324 for(size_t i = lf.size(); i > 0; --i)
2325 {
2326 if(!isalpha(lf[i-1]))
2327 {
2328 if(myfirstupper && i != lf.size())
2329 {
2330 lf[i] = towupper(lf[i]);
2331 }
2332 else
2333 {
2334 lf[i] = towlower(lf[i]);
2335 }
2336 break;
2337 }
2338 else
2339 {
2340 if(myuppercase)
2341 {
2342 lf[i-1] = towupper(lf[i-1]);
2343 }
2344 else
2345 {
2346 lf[i-1] = towlower(lf[i-1]);
2347 }
2348 }
2349 }
2350 }
2351
2352 last = input_buffer.getPos();
2353 }
2354
2355 if(!iswupper(val) || caseSensitive)
2356 {
2357 current_state.step(val);
2358 }
2359 else
2360 {
2361 current_state.step(val, towlower(val));
2362 }
2363
2364 if(current_state.size() != 0)
2365 {
2366 alphabet.getSymbol(sf, val);
2367 }
2368 else
2369 {
2370 wstring final_wblank = combineWblanks();
2371 fputws_unlocked(final_wblank.c_str(), output);
2372
2373 if(lf == L"")
2374 {
2375 unsigned int mark = sf.size();
2376 unsigned int space_index = sf.size();
2377
2378 for(unsigned int i = 1, limit = sf.size(); i < limit; i++)
2379 {
2380 if(sf[i] == L'~')
2381 {
2382 mark = i;
2383 break;
2384 }
2385 else if(sf[i] == L' ')
2386 {
2387 space_index = i;
2388 }
2389 }
2390
2391 if(space_index != sf.size())
2392 {
2393 fputws_unlocked(sf.substr(1, space_index-1).c_str(), output);
2394
2395 if(need_end_wblank)
2396 {
2397 fputws_unlocked(L"[[/]]", output);
2398 need_end_wblank = false;
2399 fputwc_unlocked(sf[space_index], output);
2400 flushWblanks(output);
2401 }
2402 else
2403 {
2404 fputwc_unlocked(sf[space_index], output);
2405 }
2406
2407 fputws_unlocked(sf.substr(space_index+1, mark-space_index-1).c_str(), output);
2408 }
2409 else
2410 {
2411 flushWblanks(output);
2412 fputws_unlocked(sf.substr(1, mark-1).c_str(), output);
2413 }
2414
2415 if(mark == sf.size())
2416 {
2417 input_buffer.back(1);
2418 }
2419 else
2420 {
2421 input_buffer.back(sf.size()-mark);
2422 }
2423 }
2424 else
2425 {
2426 fputws_unlocked(lf.substr(1,lf.size()-3).c_str(), output);
2427 input_buffer.setPos(last);
2428 input_buffer.back(2);
2429 val = lf[lf.size()-2];
2430 if(iswspace(val))
2431 {
2432 printSpace(val, output);
2433 }
2434 else
2435 {
2436 if(isEscaped(val))
2437 {
2438 fputwc_unlocked(L'\\', output);
2439 }
2440 fputwc_unlocked(val, output);
2441 }
2442 }
2443
2444 current_state = initial_state;
2445 lf = L"";
2446 sf = L"";
2447 skip_mode = true;
2448 collect_wblanks = false;
2449 }
2450 }
2451 }
2452
2453 // print remaining blanks
2454 flushBlanks(output);
2455 }
2456
2457 void
intergeneration(FILE * input,FILE * output)2458 FSTProcessor::intergeneration(FILE *input, FILE *output)
2459 {
2460 if (getNullFlush())
2461 {
2462 intergeneration_wrapper_null_flush(input, output);
2463 }
2464
2465 bool skip_mode = true;
2466 State current_state = initial_state;
2467 wstring target = L"";
2468 wstring source = L"";
2469 int last = 0;
2470 set<wchar_t> empty_escaped_chars;
2471
2472 while (true)
2473 {
2474 wchar_t val = readPostgeneration(input, output);
2475
2476 if (val == L'~')
2477 {
2478 skip_mode = false;
2479 }
2480
2481 if (skip_mode)
2482 {
2483 if (iswspace(val))
2484 {
2485 printSpace(val, output);
2486 }
2487 else
2488 {
2489 if(val != L'\0')
2490 {
2491 if (isEscaped(val))
2492 {
2493 fputwc_unlocked(L'\\', output);
2494 }
2495 fputwc_unlocked(val, output);
2496 }
2497 }
2498 }
2499 else
2500 {
2501 // test for final states
2502 if (current_state.isFinal(all_finals))
2503 {
2504 bool firstupper = iswupper(source[1]);
2505 bool uppercase = source.size() > 1 && firstupper && iswupper(source[2]);
2506 target = current_state.filterFinals(all_finals, alphabet,
2507 empty_escaped_chars,
2508 displayWeightsMode, maxAnalyses, maxWeightClasses,
2509 uppercase, firstupper, 0);
2510
2511 last = input_buffer.getPos();
2512 }
2513
2514 if (val != L'\0')
2515 {
2516 if (!iswupper(val) || caseSensitive)
2517 {
2518 current_state.step(val);
2519 }
2520 else
2521 {
2522 current_state.step(val, towlower(val));
2523 }
2524 }
2525
2526 if (val != L'\0' && current_state.size() != 0)
2527 {
2528 alphabet.getSymbol(source, val);
2529 }
2530 else
2531 {
2532 if (target == L"") // no match
2533 {
2534 if (val == L'\0')
2535 {
2536 // flush source
2537 fputws_unlocked(source.c_str(), output);
2538 }
2539 else
2540 {
2541 fputwc_unlocked(source[0], output);
2542
2543 unsigned int mark, limit;
2544 for (mark = 1, limit = source.size(); mark < limit && source[mark] != L'~' ; mark++)
2545 {
2546 fputwc_unlocked(source[mark], output);
2547 }
2548
2549 if (mark != source.size())
2550 {
2551 int back = source.size() - mark;
2552 input_buffer.back(back);
2553 }
2554
2555 if (val == L'~')
2556 {
2557 input_buffer.back(1);
2558 } else {
2559 fputwc_unlocked(val, output);
2560 }
2561 }
2562 }
2563 else
2564 {
2565 for(unsigned int i=1; i<target.size(); i++) {
2566 wchar_t c = target[i];
2567
2568 if (iswspace(c))
2569 {
2570 printSpace(c, output);
2571 }
2572 else
2573 {
2574 if (isEscaped(c))
2575 {
2576 fputwc_unlocked(L'\\', output);
2577 }
2578 fputwc_unlocked(c, output);
2579 }
2580 }
2581
2582 if (val != L'\0')
2583 {
2584 input_buffer.setPos(last);
2585 input_buffer.back(1);
2586 }
2587 }
2588
2589 current_state = initial_state;
2590 target = L"";
2591 source = L"";
2592 skip_mode = true;
2593 }
2594 }
2595
2596 if (val == L'\0')
2597 {
2598 break;
2599 }
2600 }
2601
2602 // print remaining blanks
2603 flushBlanks(output);
2604 }
2605
2606 void
transliteration(FILE * input,FILE * output)2607 FSTProcessor::transliteration(FILE *input, FILE *output)
2608 {
2609 if(getNullFlush())
2610 {
2611 transliteration_wrapper_null_flush(input, output);
2612 }
2613
2614 State current_state = initial_state;
2615 wstring lf = L"";
2616 wstring sf = L"";
2617 int last = 0;
2618
2619 while(wchar_t val = readPostgeneration(input, output))
2620 {
2621 if(iswpunct(val) || iswspace(val))
2622 {
2623 bool firstupper = iswupper(sf[1]);
2624 bool uppercase = sf.size() > 1 && firstupper && iswupper(sf[2]);
2625 lf = current_state.filterFinals(all_finals, alphabet, escaped_chars,
2626 displayWeightsMode, maxAnalyses, maxWeightClasses,
2627 uppercase, firstupper, 0);
2628 if(!lf.empty())
2629 {
2630 fputws_unlocked(lf.substr(1).c_str(), output);
2631 current_state = initial_state;
2632 lf = L"";
2633 sf = L"";
2634 }
2635 if(iswspace(val))
2636 {
2637 printSpace(val, output);
2638 }
2639 else
2640 {
2641 if(isEscaped(val))
2642 {
2643 fputwc_unlocked(L'\\', output);
2644 }
2645 fputwc_unlocked(val, output);
2646 }
2647 }
2648 else
2649 {
2650 if(current_state.isFinal(all_finals))
2651 {
2652 bool firstupper = iswupper(sf[1]);
2653 bool uppercase = sf.size() > 1 && firstupper && iswupper(sf[2]);
2654 lf = current_state.filterFinals(all_finals, alphabet, escaped_chars,
2655 displayWeightsMode, maxAnalyses, maxWeightClasses,
2656 uppercase, firstupper, 0);
2657 last = input_buffer.getPos();
2658 }
2659
2660 current_state.step(val);
2661 if(current_state.size() != 0)
2662 {
2663 alphabet.getSymbol(sf, val);
2664 }
2665 else
2666 {
2667 if(!lf.empty())
2668 {
2669 fputws_unlocked(lf.substr(1).c_str(), output);
2670 input_buffer.setPos(last);
2671 input_buffer.back(1);
2672 val = lf[lf.size()-1];
2673 }
2674 else
2675 {
2676 if(iswspace(val))
2677 {
2678 printSpace(val, output);
2679 }
2680 else
2681 {
2682 if(isEscaped(val))
2683 {
2684 fputwc_unlocked(L'\\', output);
2685 }
2686 fputwc_unlocked(val, output);
2687 }
2688 }
2689 current_state = initial_state;
2690 lf = L"";
2691 sf = L"";
2692 }
2693 }
2694 }
2695 // print remaining blanks
2696 flushBlanks(output);
2697 }
2698
2699 wstring
biltransfull(wstring const & input_word,bool with_delim)2700 FSTProcessor::biltransfull(wstring const &input_word, bool with_delim)
2701 {
2702 State current_state = initial_state;
2703 wstring result = L"";
2704 unsigned int start_point = 1;
2705 unsigned int end_point = input_word.size()-2;
2706 wstring queue = L"";
2707 bool mark = false;
2708
2709 if(with_delim == false)
2710 {
2711 start_point = 0;
2712 end_point = input_word.size()-1;
2713 }
2714
2715 if(input_word[start_point] == L'*')
2716 {
2717 return input_word;
2718 }
2719
2720 if(input_word[start_point] == L'=')
2721 {
2722 start_point++;
2723 mark = true;
2724 }
2725
2726 bool firstupper = iswupper(input_word[start_point]);
2727 bool uppercase = firstupper && iswupper(input_word[start_point+1]);
2728
2729 for(unsigned int i = start_point; i <= end_point; i++)
2730 {
2731 int val;
2732 wstring symbol = L"";
2733
2734 if(input_word[i] == L'\\')
2735 {
2736 i++;
2737 val = static_cast<int>(input_word[i]);
2738 }
2739 else if(input_word[i] == L'<')
2740 {
2741 symbol = L'<';
2742 for(unsigned int j = i + 1; j <= end_point; j++)
2743 {
2744 symbol += input_word[j];
2745 if(input_word[j] == L'>')
2746 {
2747 i = j;
2748 break;
2749 }
2750 }
2751 val = alphabet(symbol);
2752 }
2753 else
2754 {
2755 val = static_cast<int>(input_word[i]);
2756 }
2757 if(current_state.size() != 0)
2758 {
2759 if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive)
2760 {
2761 current_state.step(val, towlower(val));
2762 }
2763 else
2764 {
2765 current_state.step(val);
2766 }
2767 }
2768 if(current_state.isFinal(all_finals))
2769 {
2770 result = current_state.filterFinals(all_finals, alphabet,
2771 escaped_chars,
2772 displayWeightsMode, maxAnalyses, maxWeightClasses,
2773 uppercase, firstupper, 0);
2774 if(with_delim)
2775 {
2776 if(mark)
2777 {
2778 result = L"^="+result.substr(1);
2779 }
2780 else
2781 {
2782 result[0] = L'^';
2783 }
2784 }
2785 else
2786 {
2787 if(mark)
2788 {
2789 result = L"=" + result.substr(1);
2790 }
2791 else
2792 {
2793 result = result.substr(1);
2794 }
2795 }
2796 }
2797
2798 if(current_state.size() == 0)
2799 {
2800 if(symbol != L"" && result != L"")
2801 {
2802 queue.append(symbol);
2803 }
2804 else
2805 {
2806 // word is not present
2807 if(with_delim)
2808 {
2809 result = L"^@" + input_word.substr(1);
2810 }
2811 else
2812 {
2813 result = L"@" + input_word;
2814 }
2815 return result;
2816 }
2817 }
2818 }
2819
2820 if(start_point < (end_point - 3))
2821 {
2822 return L"^$";
2823 }
2824 // attach unmatched queue automatically
2825
2826 if(queue != L"")
2827 {
2828 wstring result_with_queue = L"";
2829 for(unsigned int i = 0, limit = result.size(); i != limit; i++)
2830 {
2831 switch(result[i])
2832 {
2833 case L'\\':
2834 result_with_queue += L'\\';
2835 i++;
2836 break;
2837
2838 case L'/':
2839 result_with_queue.append(queue);
2840 break;
2841
2842 default:
2843 break;
2844 }
2845 result_with_queue += result[i];
2846 }
2847 result_with_queue.append(queue);
2848
2849 if(with_delim)
2850 {
2851 result_with_queue += L'$';
2852 }
2853 return result_with_queue;
2854 }
2855 else
2856 {
2857 if(with_delim)
2858 {
2859 result += L'$';
2860 }
2861 return result;
2862 }
2863 }
2864
2865
2866
2867 wstring
biltrans(wstring const & input_word,bool with_delim)2868 FSTProcessor::biltrans(wstring const &input_word, bool with_delim)
2869 {
2870 State current_state = initial_state;
2871 wstring result = L"";
2872 unsigned int start_point = 1;
2873 unsigned int end_point = input_word.size()-2;
2874 wstring queue = L"";
2875 bool mark = false;
2876
2877 if(with_delim == false)
2878 {
2879 start_point = 0;
2880 end_point = input_word.size()-1;
2881 }
2882
2883 if(input_word[start_point] == L'*')
2884 {
2885 return input_word;
2886 }
2887
2888 if(input_word[start_point] == L'=')
2889 {
2890 start_point++;
2891 mark = true;
2892 }
2893
2894 bool firstupper = iswupper(input_word[start_point]);
2895 bool uppercase = firstupper && iswupper(input_word[start_point+1]);
2896
2897 for(unsigned int i = start_point; i <= end_point; i++)
2898 {
2899 int val;
2900 wstring symbol = L"";
2901
2902 if(input_word[i] == L'\\')
2903 {
2904 i++;
2905 val = static_cast<int>(input_word[i]);
2906 }
2907 else if(input_word[i] == L'<')
2908 {
2909 symbol = L'<';
2910 for(unsigned int j = i + 1; j <= end_point; j++)
2911 {
2912 symbol += input_word[j];
2913 if(input_word[j] == L'>')
2914 {
2915 i = j;
2916 break;
2917 }
2918 }
2919 val = alphabet(symbol);
2920 }
2921 else
2922 {
2923 val = static_cast<int>(input_word[i]);
2924 }
2925 if(current_state.size() != 0)
2926 {
2927 if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive)
2928 {
2929 current_state.step(val, towlower(val));
2930 }
2931 else
2932 {
2933 current_state.step(val);
2934 }
2935 }
2936 if(current_state.isFinal(all_finals))
2937 {
2938 result = current_state.filterFinals(all_finals, alphabet,
2939 escaped_chars,
2940 displayWeightsMode, maxAnalyses, maxWeightClasses,
2941 uppercase, firstupper, 0);
2942 if(with_delim)
2943 {
2944 if(mark)
2945 {
2946 result = L"^="+result.substr(1);
2947 }
2948 else
2949 {
2950 result[0] = L'^';
2951 }
2952 }
2953 else
2954 {
2955 if(mark)
2956 {
2957 result = L"=" + result.substr(1);
2958 }
2959 else
2960 {
2961 result = result.substr(1);
2962 }
2963 }
2964 }
2965
2966 if(current_state.size() == 0)
2967 {
2968 if(symbol != L"" && result != L"")
2969 {
2970 queue.append(symbol);
2971 }
2972 else
2973 {
2974 // word is not present
2975 if(with_delim)
2976 {
2977 result = L"^@" + input_word.substr(1);
2978 }
2979 else
2980 {
2981 result = L"@" + input_word;
2982 }
2983 return result;
2984 }
2985 }
2986 }
2987
2988 // attach unmatched queue automatically
2989
2990 if(queue != L"")
2991 {
2992 wstring result_with_queue = L"";
2993 for(unsigned int i = 0, limit = result.size(); i != limit; i++)
2994 {
2995 switch(result[i])
2996 {
2997 case L'\\':
2998 result_with_queue += L'\\';
2999 i++;
3000 break;
3001
3002 case L'/':
3003 result_with_queue.append(queue);
3004 break;
3005
3006 default:
3007 break;
3008 }
3009 result_with_queue += result[i];
3010 }
3011 result_with_queue.append(queue);
3012
3013 if(with_delim)
3014 {
3015 result_with_queue += L'$';
3016 }
3017 return result_with_queue;
3018 }
3019 else
3020 {
3021 if(with_delim)
3022 {
3023 result += L'$';
3024 }
3025 return result;
3026 }
3027 }
3028
3029 void
bilingual_wrapper_null_flush(FILE * input,FILE * output,GenerationMode mode)3030 FSTProcessor::bilingual_wrapper_null_flush(FILE *input, FILE *output, GenerationMode mode)
3031 {
3032 setNullFlush(false);
3033 nullFlushGeneration = true;
3034
3035 while(!feof(input))
3036 {
3037 bilingual(input, output, mode);
3038 fputwc_unlocked(L'\0', output);
3039 int code = fflush(output);
3040 if(code != 0)
3041 {
3042 wcerr << L"Could not flush output " << errno << endl;
3043 }
3044 }
3045 }
3046
3047 wstring
compose(wstring const & lexforms,wstring const & queue) const3048 FSTProcessor::compose(wstring const &lexforms, wstring const &queue) const
3049 {
3050 wstring result = L"";
3051
3052 for(unsigned int i = 1; i< lexforms.size(); i++)
3053 {
3054 if(lexforms[i] == L'\\')
3055 {
3056 result += L'\\';
3057 i++;
3058 }
3059 else if(lexforms[i] == L'/')
3060 {
3061 result.append(queue);
3062 }
3063 result += lexforms[i];
3064 }
3065
3066 return L"/" + result + queue;
3067 }
3068
3069 void
bilingual(FILE * input,FILE * output,GenerationMode mode)3070 FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode)
3071 {
3072 if(getNullFlush())
3073 {
3074 bilingual_wrapper_null_flush(input, output, mode);
3075 }
3076
3077 State current_state = initial_state;
3078 wstring sf = L""; // source language analysis
3079 wstring queue = L""; // symbols to be added to each target
3080 wstring result = L""; // result of looking up analysis in bidix
3081
3082 outOfWord = false;
3083
3084 skipUntil(input, output, L'^');
3085 pair<wstring,int> tr; // readBilingual return value, containing:
3086 int val; // the alphabet value of current symbol, and
3087 wstring symbol = L""; // the current symbol as a string
3088 bool seentags = false; // have we seen any tags at all in the analysis?
3089
3090 bool seensurface = false;
3091 wstring surface = L"";
3092
3093 while(true) // ie. while(val != 0x7fffffff)
3094 {
3095 tr = readBilingual(input, output);
3096 symbol = tr.first;
3097 val = tr.second;
3098
3099 //fwprintf(stderr, L"> %ls : %lc : %d\n", tr.first.c_str(), tr.second, tr.second);
3100 if(biltransSurfaceForms && !seensurface && !outOfWord)
3101 {
3102 while(val != L'/' && val != 0x7fffffff)
3103 {
3104 surface = surface + symbol;
3105 alphabet.getSymbol(surface, val);
3106 tr = readBilingual(input, output);
3107 symbol = tr.first;
3108 val = tr.second;
3109 //fwprintf(stderr, L" == %ls : %lc : %d => %ls\n", symbol.c_str(), val, val, surface.c_str());
3110 }
3111 seensurface = true;
3112 tr = readBilingual(input, output);
3113 symbol = tr.first;
3114 val = tr.second;
3115 }
3116
3117 if (val == 0x7fffffff)
3118 {
3119 break;
3120 }
3121
3122 if(val == L'$' && outOfWord)
3123 {
3124 if(!seentags) // if no tags: only return complete matches
3125 {
3126 bool uppercase = sf.size() > 1 && iswupper(sf[1]);
3127 bool firstupper= iswupper(sf[0]);
3128
3129 result = current_state.filterFinals(all_finals, alphabet,
3130 escaped_chars,
3131 displayWeightsMode, maxAnalyses, maxWeightClasses,
3132 uppercase, firstupper, 0);
3133 }
3134
3135 if(sf[0] == L'*')
3136 {
3137 if (mode == gm_clean) {
3138 printWordBilingual(sf, L"/" + sf.substr(1), output);
3139 }
3140 else {
3141 printWordBilingual(sf, L"/" + sf, output);
3142 }
3143 }
3144 else if(result != L"")
3145 {
3146 printWordBilingual(sf, compose(result, queue), output);
3147 }
3148 else
3149 { //xxx
3150 if(biltransSurfaceForms)
3151 {
3152 printWordBilingual(surface, L"/@"+surface, output);
3153 }
3154 else
3155 {
3156 printWordBilingual(sf, L"/@"+sf, output);
3157 }
3158 }
3159 seensurface = false;
3160 surface = L"";
3161 queue = L"";
3162 result = L"";
3163 current_state = initial_state;
3164 sf = L"";
3165 seentags = false;
3166 }
3167 else if(iswspace(val) && sf.size() == 0)
3168 {
3169 // do nothing
3170 }
3171 else if(sf.size() > 0 && sf[0] == L'*')
3172 {
3173 if(escaped_chars.find(val) != escaped_chars.end())
3174 {
3175 sf += L'\\';
3176 }
3177 alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic
3178 if(val == 0) // non-alphabetic, possibly unknown tag; add to sf
3179 {
3180 sf += symbol;
3181 }
3182 }
3183 else
3184 {
3185 if(escaped_chars.find(val) != escaped_chars.end())
3186 {
3187 sf += L'\\';
3188 }
3189 alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic
3190 if(val == 0) // non-alphabetic, possibly unknown tag; add to sf
3191 {
3192 sf += symbol;
3193 }
3194 if(alphabet.isTag(val) || val == 0)
3195 {
3196 seentags = true;
3197 }
3198 if(current_state.size() != 0)
3199 {
3200 if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive)
3201 {
3202 current_state.step(val, towlower(val));
3203 }
3204 else
3205 {
3206 current_state.step(val);
3207 }
3208 }
3209 if(current_state.isFinal(all_finals))
3210 {
3211 bool uppercase = sf.size() > 1 && iswupper(sf[1]);
3212 bool firstupper= iswupper(sf[0]);
3213
3214 queue = L""; // the intervening tags were matched
3215 result = current_state.filterFinals(all_finals, alphabet,
3216 escaped_chars,
3217 displayWeightsMode, maxAnalyses, maxWeightClasses,
3218 uppercase, firstupper, 0);
3219 }
3220 else if(result != L"")
3221 {
3222 // We already have a result, but there is still more to read
3223 // of the analysis; following tags are not consumed, but
3224 // output as target language tags (added to result on
3225 // end-of-word). This queue is reset if result is changed.
3226 if(alphabet.isTag(val)) // known tag
3227 {
3228 alphabet.getSymbol(queue, val);
3229 }
3230 else if (val == 0) // non-alphabetic, possibly unknown tag
3231 {
3232 queue += symbol;
3233 }
3234 else if(current_state.size() == 0)
3235 {
3236 // There are no more alive transductions and the current symbol is not a tag -- unknown word!
3237 result = L"";
3238 }
3239 }
3240 }
3241 }
3242 }
3243
3244 pair<wstring, int>
biltransWithQueue(wstring const & input_word,bool with_delim)3245 FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim)
3246 {
3247 State current_state = initial_state;
3248 wstring result = L"";
3249 unsigned int start_point = 1;
3250 unsigned int end_point = input_word.size()-2;
3251 wstring queue = L"";
3252 bool mark = false;
3253 bool seentags = false; // have we seen any tags at all in the analysis?
3254
3255 if(with_delim == false)
3256 {
3257 start_point = 0;
3258 end_point = input_word.size()-1;
3259 }
3260
3261 if(input_word[start_point] == L'*')
3262 {
3263 return pair<wstring, int>(input_word, 0);
3264 }
3265
3266 if(input_word[start_point] == L'=')
3267 {
3268 start_point++;
3269 mark = true;
3270 }
3271
3272 bool firstupper = iswupper(input_word[start_point]);
3273 bool uppercase = firstupper && iswupper(input_word[start_point+1]);
3274
3275 for(unsigned int i = start_point; i <= end_point; i++)
3276 {
3277 int val = 0;
3278 wstring symbol = L"";
3279
3280 if(input_word[i] == L'\\')
3281 {
3282 i++;
3283 val = input_word[i];
3284 }
3285 else if(input_word[i] == L'<')
3286 {
3287 seentags = true;
3288 symbol = L'<';
3289 for(unsigned int j = i + 1; j <= end_point; j++)
3290 {
3291 symbol += input_word[j];
3292 if(input_word[j] == L'>')
3293 {
3294 i = j;
3295 break;
3296 }
3297 }
3298 val = alphabet(symbol);
3299 }
3300 else
3301 {
3302 val = input_word[i];
3303 }
3304 if(current_state.size() != 0)
3305 {
3306 if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive)
3307 {
3308 current_state.step(val, towlower(val));
3309 }
3310 else
3311 {
3312 current_state.step(val);
3313 }
3314 }
3315 if(current_state.isFinal(all_finals))
3316 {
3317 result = current_state.filterFinals(all_finals, alphabet,
3318 escaped_chars,
3319 displayWeightsMode, maxAnalyses, maxWeightClasses,
3320 uppercase, firstupper, 0);
3321 if(with_delim)
3322 {
3323 if(mark)
3324 {
3325 result = L"^=" + result.substr(1);
3326 }
3327 else
3328 {
3329 result[0] = L'^';
3330 }
3331 }
3332 else
3333 {
3334 if(mark)
3335 {
3336 result = L"=" + result.substr(1);
3337 }
3338 else
3339 {
3340 result = result.substr(1);
3341 }
3342 }
3343 }
3344
3345 if(current_state.size() == 0)
3346 {
3347 if(symbol != L"" && result != L"")
3348 {
3349 queue.append(symbol);
3350 }
3351 else
3352 {
3353 // word is not present
3354 if(with_delim)
3355 {
3356 result = L"^@" + input_word.substr(1);
3357 }
3358 else
3359 {
3360 result = L"@" + input_word;
3361 }
3362 return pair<wstring, int>(result, 0);
3363 }
3364 }
3365 }
3366
3367 if (!seentags
3368 && L"" == current_state.filterFinals(all_finals, alphabet,
3369 escaped_chars,
3370 displayWeightsMode, maxAnalyses, maxWeightClasses,
3371 uppercase, firstupper, 0))
3372 {
3373 // word is not present
3374 if(with_delim)
3375 {
3376 result = L"^@" + input_word.substr(1);
3377 }
3378 else
3379 {
3380 result = L"@" + input_word;
3381 }
3382 return pair<wstring, int>(result, 0);
3383 }
3384
3385
3386
3387 // attach unmatched queue automatically
3388
3389 if(queue != L"")
3390 {
3391 wstring result_with_queue = L"";
3392 for(unsigned int i = 0, limit = result.size(); i != limit; i++)
3393 {
3394 switch(result[i])
3395 {
3396 case L'\\':
3397 result_with_queue += L'\\';
3398 i++;
3399 break;
3400
3401 case L'/':
3402 result_with_queue.append(queue);
3403 break;
3404
3405 default:
3406 break;
3407 }
3408 result_with_queue += result[i];
3409 }
3410 result_with_queue.append(queue);
3411
3412 if(with_delim)
3413 {
3414 result_with_queue += L'$';
3415 }
3416 return pair<wstring, int>(result_with_queue, queue.size());
3417 }
3418 else
3419 {
3420 if(with_delim)
3421 {
3422 result += L'$';
3423 }
3424 return pair<wstring, int>(result, 0);
3425 }
3426 }
3427
3428 wstring
biltransWithoutQueue(wstring const & input_word,bool with_delim)3429 FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim)
3430 {
3431 State current_state = initial_state;
3432 wstring result = L"";
3433 unsigned int start_point = 1;
3434 unsigned int end_point = input_word.size()-2;
3435 bool mark = false;
3436
3437 if(with_delim == false)
3438 {
3439 start_point = 0;
3440 end_point = input_word.size()-1;
3441 }
3442
3443 if(input_word[start_point] == L'*')
3444 {
3445 return input_word;
3446 }
3447
3448 if(input_word[start_point] == L'=')
3449 {
3450 start_point++;
3451 mark = true;
3452 }
3453
3454 bool firstupper = iswupper(input_word[start_point]);
3455 bool uppercase = firstupper && iswupper(input_word[start_point+1]);
3456
3457 for(unsigned int i = start_point; i <= end_point; i++)
3458 {
3459 int val;
3460 wstring symbol = L"";
3461
3462 if(input_word[i] == L'\\')
3463 {
3464 i++;
3465 val = static_cast<int>(input_word[i]);
3466 }
3467 else if(input_word[i] == L'<')
3468 {
3469 symbol = L'<';
3470 for(unsigned int j = i + 1; j <= end_point; j++)
3471 {
3472 symbol += input_word[j];
3473 if(input_word[j] == L'>')
3474 {
3475 i = j;
3476 break;
3477 }
3478 }
3479 val = alphabet(symbol);
3480 }
3481 else
3482 {
3483 val = static_cast<int>(input_word[i]);
3484 }
3485 if(current_state.size() != 0)
3486 {
3487 if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive)
3488 {
3489 current_state.step(val, towlower(val));
3490 }
3491 else
3492 {
3493 current_state.step(val);
3494 }
3495 }
3496 if(current_state.isFinal(all_finals))
3497 {
3498 result = current_state.filterFinals(all_finals, alphabet,
3499 escaped_chars,
3500 displayWeightsMode, maxAnalyses, maxWeightClasses,
3501 uppercase, firstupper, 0);
3502 if(with_delim)
3503 {
3504 if(mark)
3505 {
3506 result = L"^=" + result.substr(1);
3507 }
3508 else
3509 {
3510 result[0] = L'^';
3511 }
3512 }
3513 else
3514 {
3515 if(mark)
3516 {
3517 result = L"=" + result.substr(1);
3518 }
3519 else
3520 {
3521 result = result.substr(1);
3522 }
3523 }
3524 }
3525
3526 if(current_state.size() == 0)
3527 {
3528 if(symbol == L"")
3529 {
3530 // word is not present
3531 if(with_delim)
3532 {
3533 result = L"^@" + input_word.substr(1);
3534 }
3535 else
3536 {
3537 result = L"@" + input_word;
3538 }
3539 return result;
3540 }
3541 }
3542 }
3543
3544 if(with_delim)
3545 {
3546 result += L'$';
3547 }
3548 return result;
3549 }
3550
3551
3552 bool
valid() const3553 FSTProcessor::valid() const
3554 {
3555 if(initial_state.isFinal(all_finals))
3556 {
3557 wcerr << L"Error: Invalid dictionary (hint: the left side of an entry is empty)" << endl;
3558 return false;
3559 }
3560 else
3561 {
3562 State s = initial_state;
3563 s.step(L' ');
3564 if(s.size() != 0)
3565 {
3566 wcerr << L"Error: Invalid dictionary (hint: entry beginning with whitespace)" << endl;
3567 return false;
3568 }
3569 }
3570
3571 return true;
3572 }
3573
3574 int
readSAO(FILE * input)3575 FSTProcessor::readSAO(FILE *input)
3576 {
3577 if(!input_buffer.isEmpty())
3578 {
3579 return input_buffer.next();
3580 }
3581
3582 wchar_t val = static_cast<wchar_t>(fgetwc_unlocked(input));
3583 if(feof(input))
3584 {
3585 return 0;
3586 }
3587
3588 if(escaped_chars.find(val) != escaped_chars.end())
3589 {
3590 if(val == L'<')
3591 {
3592 wstring str = readFullBlock(input, L'<', L'>');
3593 if(str.substr(0, 9) == L"<![CDATA[")
3594 {
3595 while(str.substr(str.size()-3) != L"]]>")
3596 {
3597 str.append(readFullBlock(input, L'<', L'>').substr(1));
3598 }
3599 blankqueue.push(str);
3600 input_buffer.add(static_cast<int>(L' '));
3601 return static_cast<int>(L' ');
3602 }
3603 else
3604 {
3605 streamError();
3606 }
3607 }
3608 else if (val == L'\\') {
3609 val = static_cast<wchar_t>(fgetwc_unlocked(input));
3610 if(isEscaped(val))
3611 {
3612 input_buffer.add(val);
3613 return static_cast<int>(val);
3614 }
3615 else
3616 streamError();
3617 }
3618 else
3619 {
3620 streamError();
3621 }
3622 }
3623
3624 input_buffer.add(val);
3625 return static_cast<int>(val);
3626 }
3627
3628 void
printSAOWord(wstring const & lf,FILE * output)3629 FSTProcessor::printSAOWord(wstring const &lf, FILE *output)
3630 {
3631 for(unsigned int i = 1, limit = lf.size(); i != limit; i++)
3632 {
3633 if(lf[i] == L'/')
3634 {
3635 break;
3636 }
3637 fputwc_unlocked(lf[i], output);
3638 }
3639 }
3640
3641 void
SAO(FILE * input,FILE * output)3642 FSTProcessor::SAO(FILE *input, FILE *output)
3643 {
3644 bool last_incond = false;
3645 bool last_postblank = false;
3646 State current_state = initial_state;
3647 wstring lf = L"";
3648 wstring sf = L"";
3649 int last = 0;
3650
3651 escaped_chars.clear();
3652 escaped_chars.insert(static_cast<wchar_t>(L'\\'));
3653 escaped_chars.insert(static_cast<wchar_t>(L'<'));
3654 escaped_chars.insert(static_cast<wchar_t>(L'>'));
3655
3656 while(wchar_t val = readSAO(input))
3657 {
3658 // test for final states
3659 if(current_state.isFinal(all_finals))
3660 {
3661 if(current_state.isFinal(inconditional))
3662 {
3663 bool firstupper = iswupper(sf[0]);
3664 bool uppercase = firstupper && iswupper(sf[sf.size()-1]);
3665
3666 lf = current_state.filterFinalsSAO(all_finals, alphabet,
3667 escaped_chars,
3668 uppercase, firstupper);
3669 last_incond = true;
3670 last = input_buffer.getPos();
3671 }
3672 else if(current_state.isFinal(postblank))
3673 {
3674 bool firstupper = iswupper(sf[0]);
3675 bool uppercase = firstupper && iswupper(sf[sf.size()-1]);
3676
3677 lf = current_state.filterFinalsSAO(all_finals, alphabet,
3678 escaped_chars,
3679 uppercase, firstupper);
3680 last_postblank = true;
3681 last = input_buffer.getPos();
3682 }
3683 else if(!isAlphabetic(val))
3684 {
3685 bool firstupper = iswupper(sf[0]);
3686 bool uppercase = firstupper && iswupper(sf[sf.size()-1]);
3687
3688 lf = current_state.filterFinalsSAO(all_finals, alphabet,
3689 escaped_chars,
3690 uppercase, firstupper);
3691 last_postblank = false;
3692 last_incond = false;
3693 last = input_buffer.getPos();
3694 }
3695 }
3696 else if(sf == L"" && iswspace(val))
3697 {
3698 lf = L"/*";
3699 lf.append(sf);
3700 last_postblank = false;
3701 last_incond = false;
3702 last = input_buffer.getPos();
3703 }
3704
3705 if(!iswupper(val) || caseSensitive)
3706 {
3707 current_state.step(val);
3708 }
3709 else
3710 {
3711 current_state.step(val, towlower(val));
3712 }
3713
3714 if(current_state.size() != 0)
3715 {
3716 alphabet.getSymbol(sf, val);
3717 }
3718 else
3719 {
3720 if(!isAlphabetic(val) && sf == L"")
3721 {
3722 if(iswspace(val))
3723 {
3724 printSpace(val, output);
3725 }
3726 else
3727 {
3728 if(isEscaped(val))
3729 {
3730 fputwc_unlocked(L'\\', output);
3731 }
3732 fputwc_unlocked(val, output);
3733 }
3734 }
3735 else if(last_incond)
3736 {
3737 printSAOWord(lf, output);
3738 input_buffer.setPos(last);
3739 input_buffer.back(1);
3740 }
3741 else if(last_postblank)
3742 {
3743 printSAOWord(lf, output);
3744 fputwc_unlocked(L' ', output);
3745 input_buffer.setPos(last);
3746 input_buffer.back(1);
3747 }
3748 else if(isAlphabetic(val) &&
3749 ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) ||
3750 lf == L""))
3751 {
3752 do
3753 {
3754 alphabet.getSymbol(sf, val);
3755 }
3756 while((val = readSAO(input)) && isAlphabetic(val));
3757
3758 unsigned int limit = firstNotAlpha(sf);
3759 unsigned int size = sf.size();
3760 limit = (limit == static_cast<unsigned int>(wstring::npos)?size:limit);
3761 input_buffer.back(1+(size-limit));
3762 fputws_unlocked(L"<d>", output);
3763 fputws_unlocked(sf.c_str(), output);
3764 fputws_unlocked(L"</d>", output);
3765 }
3766 else if(lf == L"")
3767 {
3768 unsigned int limit = firstNotAlpha(sf);
3769 unsigned int size = sf.size();
3770 limit = (limit == static_cast<unsigned int>(wstring::npos)?size:limit);
3771 input_buffer.back(1+(size-limit));
3772 fputws_unlocked(L"<d>", output);
3773 fputws_unlocked(sf.c_str(), output);
3774 fputws_unlocked(L"</d>", output);
3775 }
3776 else
3777 {
3778 printSAOWord(lf, output);
3779 input_buffer.setPos(last);
3780 input_buffer.back(1);
3781 }
3782
3783 current_state = initial_state;
3784 lf = L"";
3785 sf = L"";
3786 last_incond = false;
3787 last_postblank = false;
3788 }
3789 }
3790
3791 // print remaining blanks
3792 flushBlanks(output);
3793 }
3794
3795 wstring
removeTags(wstring const & str)3796 FSTProcessor::removeTags(wstring const &str)
3797 {
3798 for(unsigned int i = 0; i < str.size(); i++)
3799 {
3800 if(str[i] == L'<' && i >=1 && str[i-1] != L'\\')
3801 {
3802 return str.substr(0, i);
3803 }
3804 }
3805
3806 return str;
3807 }
3808
3809
3810 void
setBiltransSurfaceForms(bool const value)3811 FSTProcessor::setBiltransSurfaceForms(bool const value)
3812 {
3813 biltransSurfaceForms = value;
3814 }
3815
3816 void
setCaseSensitiveMode(bool const value)3817 FSTProcessor::setCaseSensitiveMode(bool const value)
3818 {
3819 caseSensitive = value;
3820 }
3821
3822 void
setDictionaryCaseMode(bool const value)3823 FSTProcessor::setDictionaryCaseMode(bool const value)
3824 {
3825 dictionaryCase = value;
3826 }
3827
3828 void
setNullFlush(bool const value)3829 FSTProcessor::setNullFlush(bool const value)
3830 {
3831 nullFlush = value;
3832 }
3833
3834 void
setIgnoredChars(bool const value)3835 FSTProcessor::setIgnoredChars(bool const value)
3836 {
3837 useIgnoredChars = value;
3838 }
3839
3840 void
setRestoreChars(bool const value)3841 FSTProcessor::setRestoreChars(bool const value)
3842 {
3843 useRestoreChars = value;
3844 }
3845
3846 void
setUseDefaultIgnoredChars(bool const value)3847 FSTProcessor::setUseDefaultIgnoredChars(bool const value)
3848 {
3849 useDefaultIgnoredChars = value;
3850 }
3851
3852 void
setDisplayWeightsMode(bool const value)3853 FSTProcessor::setDisplayWeightsMode(bool const value)
3854 {
3855 displayWeightsMode = value;
3856 }
3857
3858 void
setMaxAnalysesValue(int const value)3859 FSTProcessor::setMaxAnalysesValue(int const value)
3860 {
3861 maxAnalyses = value;
3862 }
3863
3864 void
setMaxWeightClassesValue(int const value)3865 FSTProcessor::setMaxWeightClassesValue(int const value)
3866 {
3867 maxWeightClasses = value;
3868 }
3869
3870 bool
getDecompoundingMode()3871 FSTProcessor::getDecompoundingMode()
3872 {
3873 return do_decomposition;
3874 }
3875
3876 bool
getNullFlush()3877 FSTProcessor::getNullFlush()
3878 {
3879 return nullFlush;
3880 }
3881
3882 size_t
firstNotAlpha(wstring const & sf)3883 FSTProcessor::firstNotAlpha(wstring const &sf)
3884 {
3885 for(size_t i = 0, limit = sf.size(); i < limit; i++)
3886 {
3887 if(!isAlphabetic(sf[i]))
3888 {
3889 return i;
3890 }
3891 }
3892
3893 return wstring::npos;
3894 }
3895