1 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
2 //
3 // This file is a bundle of all sources and headers of UDPipe library.
4 // Comments and copyrights of all individual files are kept.
5 
6 #include <algorithm>
7 #include <atomic>
8 #include <cassert>
9 #include <cmath>
10 #include <cstddef>
11 #include <cstdint>
12 #include <cstdlib>
13 #include <cstring>
14 #include <fstream>
15 #include <functional>
16 #include <initializer_list>
17 #include <iomanip>
18 #include <iostream>
19 #include <iterator>
20 #include <limits>
21 #include <list>
22 #include <map>
23 #include <memory>
24 #include <random>
25 #include <sstream>
26 #include <stdexcept>
27 #include <string>
28 #include <unordered_map>
29 #include <unordered_set>
30 #include <utility>
31 #include <vector>
32 
33 namespace ufal {
34 namespace udpipe {
35 
36 /////////
37 // File: utils/common.h
38 /////////
39 
40 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
41 //
42 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
43 // Mathematics and Physics, Charles University in Prague, Czech Republic.
44 //
45 // This Source Code Form is subject to the terms of the Mozilla Public
46 // License, v. 2.0. If a copy of the MPL was not distributed with this
47 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
48 
49 // Headers available in all sources
50 
51 namespace utils {
52 
// Import std names for the whole bundle (existing convention of this
// amalgamated file).
using namespace std;

// Assert that int is at least 4B
static_assert(sizeof(int) >= sizeof(int32_t), "Int must be at least 4B wide!");

// Assert that we are on a little endian system
#ifdef __BYTE_ORDER__
static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Only little endian systems are supported!");
#endif

// Report a fatal error: print the message to cerr and terminate with exit
// code 1. Implemented as a macro so the message may be an ostream insertion
// chain, e.g. runtime_failure("cannot open " << path).
#define runtime_failure(message) exit((cerr << message << endl, 1))
64 
65 } // namespace utils
66 
67 /////////
68 // File: utils/string_piece.h
69 /////////
70 
71 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
72 //
73 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
74 // Mathematics and Physics, Charles University in Prague, Czech Republic.
75 //
76 // This Source Code Form is subject to the terms of the Mozilla Public
77 // License, v. 2.0. If a copy of the MPL was not distributed with this
78 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
79 
80 namespace utils {
81 
// Non-owning view of a character buffer (a pre-C++17 analogue of
// std::string_view): holds only a pointer and a length and never copies or
// frees the underlying data, so the viewed buffer must outlive the piece.
struct string_piece {
  const char* str;
  size_t len;

  // Empty view.
  string_piece() : str(nullptr), len(0) {}
  // View of a NUL-terminated C string (length computed via strlen).
  string_piece(const char* data) : str(data), len(strlen(data)) {}
  // View of an explicit buffer; data need not be NUL-terminated.
  string_piece(const char* data, size_t size) : str(data), len(size) {}
  // View of a std::string's contents (valid only while s is alive/unchanged).
  string_piece(const string& s) : str(s.c_str()), len(s.size()) {}
};
91 
operator <<(ostream & os,const string_piece & str)92 inline ostream& operator<<(ostream& os, const string_piece& str) {
93   return os.write(str.str, str.len);
94 }
95 
operator ==(const string_piece & a,const string_piece & b)96 inline bool operator==(const string_piece& a, const string_piece& b) {
97   return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
98 }
99 
operator !=(const string_piece & a,const string_piece & b)100 inline bool operator!=(const string_piece& a, const string_piece& b) {
101   return a.len != b.len || memcmp(a.str, b.str, a.len) != 0;
102 }
103 
104 } // namespace utils
105 
106 /////////
107 // File: common.h
108 /////////
109 
110 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
111 //
112 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
113 // Mathematics and Physics, Charles University in Prague, Czech Republic.
114 //
115 // This Source Code Form is subject to the terms of the Mozilla Public
116 // License, v. 2.0. If a copy of the MPL was not distributed with this
117 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
118 
119 using namespace utils;
120 
121 /////////
122 // File: sentence/empty_node.h
123 /////////
124 
125 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
126 //
127 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
128 // Mathematics and Physics, Charles University in Prague, Czech Republic.
129 //
130 // This Source Code Form is subject to the terms of the Mozilla Public
131 // License, v. 2.0. If a copy of the MPL was not distributed with this
132 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
133 
// An empty node of a sentence, identified by the pair (id, index); both are
// plain data, so the class is a passive record with a convenience constructor.
class empty_node {
 public:
  int id;         // 0 is root, >0 is sentence word, <0 is undefined
  int index;      // index for the current id, numbered from 1; 0 = undefined
  string form;    // form
  string lemma;   // lemma
  string upostag; // universal part-of-speech tag
  string xpostag; // language-specific part-of-speech tag
  string feats;   // list of morphological features
  string deps;    // secondary dependencies
  string misc;    // miscellaneous information

  // All string fields start empty; only id and index are configurable.
  empty_node(int node_id = -1, int node_index = 0) : id(node_id), index(node_index) {}
};
148 
149 /////////
150 // File: sentence/token.h
151 /////////
152 
153 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
154 //
155 // Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of
156 // Mathematics and Physics, Charles University in Prague, Czech Republic.
157 //
158 // This Source Code Form is subject to the terms of the Mozilla Public
159 // License, v. 2.0. If a copy of the MPL was not distributed with this
160 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
161 
// Base class of surface units carrying a form plus CoNLL-U MISC data; both
// word and multiword_token derive from it. The member functions are only
// declared here — their definitions live elsewhere in the bundle.
class token {
 public:
  string form;  // surface form
  string misc;  // CoNLL-U MISC column content

  token(string_piece form = string_piece(), string_piece misc = string_piece());

  // CoNLL-U defined SpaceAfter=No feature
  bool get_space_after() const;
  void set_space_after(bool space_after);

  // UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features
  void get_spaces_before(string& spaces_before) const;
  void set_spaces_before(string_piece spaces_before);
  void get_spaces_after(string& spaces_after) const;
  void set_spaces_after(string_piece spaces_after);
  void get_spaces_in_token(string& spaces_in_token) const;
  void set_spaces_in_token(string_piece spaces_in_token);

  // UDPipe-specific TokenRange feature
  bool get_token_range(size_t& start, size_t& end) const;
  void set_token_range(size_t start, size_t end);

 private:
  // Named-field access helpers. NOTE(review): their definitions are not
  // visible here; the names suggest the features above are stored as fields
  // of the misc string — confirm against the token implementation.
  bool get_misc_field(string_piece name, string_piece& value) const;
  void remove_misc_field(string_piece name);
  string& start_misc_field(string_piece name);

  // Escaping helpers used by the Spaces* features above.
  void append_escaped_spaces(string_piece spaces, string& escaped_spaces) const;
  void unescape_spaces(string_piece escaped_spaces, string& spaces) const;
};
193 
194 /////////
195 // File: sentence/multiword_token.h
196 /////////
197 
198 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
199 //
200 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
201 // Mathematics and Physics, Charles University in Prague, Czech Republic.
202 //
203 // This Source Code Form is subject to the terms of the Mozilla Public
204 // License, v. 2.0. If a copy of the MPL was not distributed with this
205 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
206 
207 class multiword_token : public token {
208  public:
209   // form and misc are inherited from token
210   int id_first, id_last;
211 
multiword_token(int id_first=-1,int id_last=-1,string_piece form=string_piece (),string_piece misc=string_piece ())212   multiword_token(int id_first = -1, int id_last = -1, string_piece form = string_piece(), string_piece misc = string_piece())
213       : token(form, misc), id_first(id_first), id_last(id_last) {}
214 };
215 
216 /////////
217 // File: sentence/word.h
218 /////////
219 
220 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
221 //
222 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
223 // Mathematics and Physics, Charles University in Prague, Czech Republic.
224 //
225 // This Source Code Form is subject to the terms of the Mozilla Public
226 // License, v. 2.0. If a copy of the MPL was not distributed with this
227 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
228 
229 class word : public token {
230  public:
231   // form and misc are inherited from token
232   int id;         // 0 is root, >0 is sentence word, <0 is undefined
233   string lemma;   // lemma
234   string upostag; // universal part-of-speech tag
235   string xpostag; // language-specific part-of-speech tag
236   string feats;   // list of morphological features
237   int head;       // head, 0 is root, <0 is undefined
238   string deprel;  // dependency relation to the head
239   string deps;    // secondary dependencies
240 
241   vector<int> children;
242 
word(int id=-1,string_piece form=string_piece ())243   word(int id = -1, string_piece form = string_piece()) : token(form), id(id), head(-1) {}
244 };
245 
246 /////////
247 // File: sentence/sentence.h
248 /////////
249 
250 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
251 //
252 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
253 // Mathematics and Physics, Charles University in Prague, Czech Republic.
254 //
255 // This Source Code Form is subject to the terms of the Mozilla Public
256 // License, v. 2.0. If a copy of the MPL was not distributed with this
257 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
258 
// One sentence: regular words, multiword tokens, empty nodes, and raw comment
// lines, together with typed access to CoNLL-U comments. Member function
// definitions live elsewhere in the bundle.
class sentence {
 public:
  sentence();

  vector<word> words;
  vector<multiword_token> multiword_tokens;
  vector<empty_node> empty_nodes;
  vector<string> comments;
  // Form used for the artificial root word (word id 0, per the id convention
  // documented on word::id).
  static const string root_form;

  // Basic sentence modifications
  bool empty();
  void clear();
  word& add_word(string_piece form = string_piece());
  void set_head(int id, int head, const string& deprel);
  void unlink_all_words();

  // CoNLL-U defined comments
  bool get_new_doc(string* id = nullptr) const;
  void set_new_doc(bool new_doc, string_piece id = string_piece());
  bool get_new_par(string* id = nullptr) const;
  void set_new_par(bool new_par, string_piece id = string_piece());
  bool get_sent_id(string& id) const;
  void set_sent_id(string_piece id);
  bool get_text(string& text) const;
  void set_text(string_piece text);

 private:
  // Generic named-comment access backing the typed getters/setters above.
  bool get_comment(string_piece name, string* value) const;
  void remove_comment(string_piece name);
  void set_comment(string_piece name, string_piece value = string_piece());
};
291 
292 /////////
293 // File: sentence/input_format.h
294 /////////
295 
296 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
297 //
298 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
299 // Mathematics and Physics, Charles University in Prague, Czech Republic.
300 //
301 // This Source Code Form is subject to the terms of the Mozilla Public
302 // License, v. 2.0. If a copy of the MPL was not distributed with this
303 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
304 
// Abstract reader turning textual input into sentence objects. Concrete
// implementations are obtained via the static factory methods; callers own
// the returned pointers (see the unique_ptr usage in evaluator::evaluate).
class input_format {
 public:
  virtual ~input_format() {}

  // Read one implementation-defined block of input from is into block.
  virtual bool read_block(istream& is, string& block) const = 0;
  virtual void reset_document(string_piece id = string_piece()) = 0;
  // Set the text to be segmented. NOTE(review): with make_copy == false the
  // text is presumably referenced, not copied, so the caller must keep it
  // alive while reading — confirm in the implementations.
  virtual void set_text(string_piece text, bool make_copy = false) = 0;
  // Fill s with the next sentence; error receives a description on failure.
  virtual bool next_sentence(sentence& s, string& error) = 0;

  // Static factory methods returning newly allocated instances.
  static input_format* new_input_format(const string& name);
  static input_format* new_conllu_input_format(const string& options = string());
  static input_format* new_generic_tokenizer_input_format(const string& options = string());
  static input_format* new_horizontal_input_format(const string& options = string());
  static input_format* new_vertical_input_format(const string& options = string());

  static input_format* new_presegmented_tokenizer(input_format* tokenizer);

  // Well-known format / option names (values defined elsewhere).
  static const string CONLLU_V1;
  static const string CONLLU_V2;
  static const string GENERIC_TOKENIZER_NORMALIZED_SPACES;
  static const string GENERIC_TOKENIZER_PRESEGMENTED;
  static const string GENERIC_TOKENIZER_RANGES;
};
329 
330 /////////
331 // File: model/model.h
332 /////////
333 
334 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
335 //
336 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
337 // Mathematics and Physics, Charles University in Prague, Czech Republic.
338 //
339 // This Source Code Form is subject to the terms of the Mozilla Public
340 // License, v. 2.0. If a copy of the MPL was not distributed with this
341 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
342 
// A loaded UDPipe model: factory for tokenizers plus in-place tagging and
// parsing of sentences. Instances are produced by the static load methods;
// callers own the returned pointers.
class model {
 public:
  virtual ~model() {}

  // Load a model from a file name or an open stream.
  static model* load(const char* fname);
  static model* load(istream& is);

  // Create a new tokenizer configured by options; caller owns the result.
  virtual input_format* new_tokenizer(const string& options) const = 0;
  // Tag / parse sentence s in place; error receives a description on failure.
  virtual bool tag(sentence& s, const string& options, string& error) const = 0;
  virtual bool parse(sentence& s, const string& options, string& error) const = 0;

  // Well-known option values (definitions live elsewhere in the bundle).
  static const string DEFAULT;
  static const string TOKENIZER_NORMALIZED_SPACES;
  static const string TOKENIZER_PRESEGMENTED;
  static const string TOKENIZER_RANGES;
};
359 
360 /////////
361 // File: model/evaluator.h
362 /////////
363 
364 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
365 //
366 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
367 // Mathematics and Physics, Charles University in Prague, Czech Republic.
368 //
369 // This Source Code Form is subject to the terms of the Mozilla Public
370 // License, v. 2.0. If a copy of the MPL was not distributed with this
371 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
372 
// Evaluates a model's tokenizer/tagger/parser against gold CoNLL-U data,
// reporting F1 scores. The component option strings use DEFAULT ("") and
// NONE ("none", see the definitions in model/evaluator.cpp) as sentinels.
class evaluator {
 public:
  evaluator(const model* m, const string& tokenizer, const string& tagger, const string& parser);

  // Reconfigure individual components; used by the constructor as well.
  void set_model(const model* m);
  void set_tokenizer(const string& tokenizer);
  void set_tagger(const string& tagger);
  void set_parser(const string& parser);

  // Run the evaluation on gold input from is, writing results to os;
  // returns false and fills error on failure.
  bool evaluate(istream& is, ostream& os, string& error) const;

  static const string DEFAULT;
  static const string NONE;

 private:
  const model* m;  // not owned
  string tokenizer, tagger, parser;

  // Precision/recall/F1 together with the underlying counts.
  struct f1_info { size_t total_system, total_gold; double precision, recall, f1; };
  // F1 of two (position, value) sequences; matching criteria are in the
  // definition (not visible here).
  template <class T>
  static f1_info evaluate_f1(const vector<pair<size_t, T>>& system, const vector<pair<size_t, T>>& gold);

  // Character-level view of a document used to align system vs gold output.
  class evaluation_data {
   public:
    struct word_data {
      size_t start, end;   // span in chars
      bool is_multiword;
      word w;

      word_data(size_t start, size_t end, int id, bool is_multiword, const word& w);
    };

    void add_sentence(const sentence& s);

    u32string chars;                              // decoded document text
    vector<pair<size_t, size_t>> sentences, tokens;
    vector<pair<size_t, string>> multiwords;
    vector<word_data> words;
  };

  // Pairing of system words with gold words for attribute-level F1.
  class word_alignment {
   public:
    struct pair_system_gold {
      // gold is stored by reference: the gold evaluation_data must outlive
      // the alignment.
      word system; const word& gold;
      pair_system_gold(const word& system, const word& gold) : system(system), gold(gold) {}
    };
    vector<pair_system_gold> matched;
    size_t total_system, total_gold;

    // F1 over matched pairs, counting a pair when equals(system, gold) holds.
    template <class Equals>
    f1_info evaluate_f1(Equals equals);

    static bool perfect_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment);
    static void best_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment);
  };
};
429 
430 /////////
431 // File: unilib/unicode.h
432 /////////
433 
434 // This file is part of UniLib <http://github.com/ufal/unilib/>.
435 //
436 // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
437 // Mathematics and Physics, Charles University in Prague, Czech Republic.
438 //
439 // This Source Code Form is subject to the terms of the Mozilla Public
440 // License, v. 2.0. If a copy of the MPL was not distributed with this
441 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
442 //
443 // UniLib version: 3.1.1
444 // Unicode version: 8.0.0
445 
446 namespace unilib {
447 
// Unicode general-category and case-mapping queries, backed by two-level
// lookup tables (an index table selecting a 256-entry block per high byte).
class unicode {
  // Internal sequential category codes; the public constants below are the
  // corresponding single-bit masks (1 << code), so categories combine with |
  // and are tested with &.
  enum : uint8_t {
    _Lu = 1, _Ll = 2, _Lt = 3, _Lm = 4, _Lo = 5,
    _Mn = 6, _Mc = 7, _Me = 8,
    _Nd = 9, _Nl = 10, _No = 11,
    _Pc = 12, _Pd = 13, _Ps = 14, _Pe = 15, _Pi = 16, _Pf = 17, _Po = 18,
    _Sm = 19, _Sc = 20, _Sk = 21, _So = 22,
    _Zs = 23, _Zl = 24, _Zp = 25,
    _Cc = 26, _Cf = 27, _Cs = 28, _Co = 29, _Cn = 30
  };

 public:
  typedef uint32_t category_t;
  // Single-category bits plus combined masks (L = all letters, etc.).
  enum : category_t {
    Lu = 1 << _Lu, Ll = 1 << _Ll, Lt = 1 << _Lt, Lut = Lu | Lt, LC = Lu | Ll | Lt,
      Lm = 1 << _Lm, Lo = 1 << _Lo, L = Lu | Ll | Lt | Lm | Lo,
    Mn = 1 << _Mn, Mc = 1 << _Mc, Me = 1 << _Me, M = Mn | Mc | Me,
    Nd = 1 << _Nd, Nl = 1 << _Nl, No = 1 << _No, N = Nd | Nl | No,
    Pc = 1 << _Pc, Pd = 1 << _Pd, Ps = 1 << _Ps, Pe = 1 << _Pe, Pi = 1 << _Pi,
      Pf = 1 << _Pf, Po = 1 << _Po, P = Pc | Pd | Ps | Pe | Pi | Pf | Po,
    Sm = 1 << _Sm, Sc = 1 << _Sc, Sk = 1 << _Sk, So = 1 << _So, S = Sm | Sc | Sk | So,
    Zs = 1 << _Zs, Zl = 1 << _Zl, Zp = 1 << _Zp, Z = Zs | Zl | Zp,
    Cc = 1 << _Cc, Cf = 1 << _Cf, Cs = 1 << _Cs, Co = 1 << _Co, Cn = 1 << _Cn, C = Cc | Cf | Cs | Co | Cn
  };

  // General category of chr as a single-bit mask; test with e.g.
  // (category(c) & unicode::L).
  static inline category_t category(char32_t chr);

  // Case mappings; characters with no other case map to themselves.
  static inline char32_t lowercase(char32_t chr);
  static inline char32_t uppercase(char32_t chr);
  static inline char32_t titlecase(char32_t chr);

 private:
  static const char32_t CHARS = 0x110000;  // number of Unicode code points
  static const int32_t DEFAULT_CAT = Cn;   // category for out-of-range input

  // Two-level tables: *_index is indexed by the code point's high bits
  // (chr >> 8), the selected block by the low byte (chr & 0xFF).
  static const uint8_t category_index[CHARS >> 8];
  static const uint8_t category_block[][256];
  static const uint8_t othercase_index[CHARS >> 8];
  // Entries pack (other code point << 8) | othercase_type; see the case
  // mapping implementations below.
  static const char32_t othercase_block[][256];

  enum othercase_type { LOWER_ONLY = 1, UPPERTITLE_ONLY = 2, LOWER_THEN_UPPER = 3, UPPER_THEN_TITLE = 4, TITLE_THEN_LOWER = 5 };
};
490 
category(char32_t chr)491 unicode::category_t unicode::category(char32_t chr) {
492   return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
493 }
494 
lowercase(char32_t chr)495 char32_t unicode::lowercase(char32_t chr) {
496   if (chr < CHARS) {
497     char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
498     if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8;
499     if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8;
500     if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
501   }
502   return chr;
503 }
504 
uppercase(char32_t chr)505 char32_t unicode::uppercase(char32_t chr) {
506   if (chr < CHARS) {
507     char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
508     if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8;
509     if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8;
510     if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
511   }
512   return chr;
513 }
514 
titlecase(char32_t chr)515 char32_t unicode::titlecase(char32_t chr) {
516   if (chr < CHARS) {
517     char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
518     if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8;
519     if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase >> 8;
520     if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
521   }
522   return chr;
523 }
524 
525 } // namespace unilib
526 
527 /////////
528 // File: unilib/utf8.h
529 /////////
530 
531 // This file is part of UniLib <http://github.com/ufal/unilib/>.
532 //
533 // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
534 // Mathematics and Physics, Charles University in Prague, Czech Republic.
535 //
536 // This Source Code Form is subject to the terms of the Mozilla Public
537 // License, v. 2.0. If a copy of the MPL was not distributed with this
538 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
539 //
540 // UniLib version: 3.1.1
541 // Unicode version: 8.0.0
542 
543 namespace unilib {
544 
// UTF-8 validation, decoding to UTF-32, encoding, and lazy iteration.
// Malformed input never throws: decoders substitute REPLACEMENT_CHAR ('?').
class utf8 {
 public:
  // Validation of NUL-terminated strings / explicit buffers.
  static bool valid(const char* str);
  static bool valid(const char* str, size_t len);
  static inline bool valid(const std::string& str);

  // Decode a single code point, advancing str (and decrementing len).
  static inline char32_t decode(const char*& str);
  static inline char32_t decode(const char*& str, size_t& len);
  // Decode the first code point without advancing the caller's pointer.
  static inline char32_t first(const char* str);
  static inline char32_t first(const char* str, size_t len);
  static inline char32_t first(const std::string& str);

  // Decode a whole string/buffer into UTF-32.
  static void decode(const char* str, std::u32string& decoded);
  static void decode(const char* str, size_t len, std::u32string& decoded);
  static inline void decode(const std::string& str, std::u32string& decoded);

  // Lazy code-point range over a NUL-terminated string; construction is
  // private, instances come from the decoder() factories below.
  class string_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline string_decoder(const char* str);
    const char* str;
    friend class utf8;
  };
  static inline string_decoder decoder(const char* str);
  static inline string_decoder decoder(const std::string& str);

  // Lazy code-point range over an explicit (possibly NUL-embedding) buffer.
  class buffer_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline buffer_decoder(const char* str, size_t len);
    const char* str;
    size_t len;
    friend class utf8;
  };
  static inline buffer_decoder decoder(const char* str, size_t len);

  // Encode a single code point (appending to a raw buffer or a string).
  static inline void append(char*& str, char32_t chr);
  static inline void append(std::string& str, char32_t chr);
  static void encode(const std::u32string& str, std::string& encoded);

  // Apply f to every code point and re-encode the results into result.
  template<class F> static void map(F f, const char* str, std::string& result);
  template<class F> static void map(F f, const char* str, size_t len, std::string& result);
  template<class F> static void map(F f, const std::string& str, std::string& result);

 private:
  // Substituted for any malformed or out-of-range input.
  static const char REPLACEMENT_CHAR = '?';
};
598 
valid(const std::string & str)599 bool utf8::valid(const std::string& str) {
600   return valid(str.c_str());
601 }
602 
// Decode one UTF-8 code point starting at str, advancing str past the bytes
// consumed. Malformed input yields REPLACEMENT_CHAR; when a continuation
// byte is invalid, str is left pointing at that byte. The caller must stop
// at the terminating NUL (which itself decodes as 0).
char32_t utf8::decode(const char*& str) {
  if (((unsigned char)*str) < 0x80) return (unsigned char)*str++;           // 1-byte (ASCII)
  else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR;    // stray continuation byte
  else if (((unsigned char)*str) < 0xE0) {                                  // 2-byte sequence
    char32_t res = (((unsigned char)*str++) & 0x1F) << 6;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + (((unsigned char)*str++) & 0x3F);
  } else if (((unsigned char)*str) < 0xF0) {                                // 3-byte sequence
    char32_t res = (((unsigned char)*str++) & 0x0F) << 12;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += (((unsigned char)*str++) & 0x3F) << 6;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + (((unsigned char)*str++) & 0x3F);
  } else if (((unsigned char)*str) < 0xF8) {                                // 4-byte sequence
    char32_t res = (((unsigned char)*str++) & 0x07) << 18;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += (((unsigned char)*str++) & 0x3F) << 12;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += (((unsigned char)*str++) & 0x3F) << 6;
    if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + (((unsigned char)*str++) & 0x3F);
  } else return ++str, REPLACEMENT_CHAR;                                    // 0xF8..0xFF: invalid lead byte
}
626 
// Length-aware variant of decode: decodes one code point from the first len
// bytes at str, advancing str and decreasing len by the bytes consumed.
// Returns 0 when len is 0; malformed or truncated input yields
// REPLACEMENT_CHAR with str/len positioned at the offending byte.
char32_t utf8::decode(const char*& str, size_t& len) {
  if (!len) return 0;
  --len;
  if (((unsigned char)*str) < 0x80) return (unsigned char)*str++;           // 1-byte (ASCII)
  else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR;    // stray continuation byte
  else if (((unsigned char)*str) < 0xE0) {                                  // 2-byte sequence
    char32_t res = (((unsigned char)*str++) & 0x1F) << 6;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + ((--len, ((unsigned char)*str++)) & 0x3F);
  } else if (((unsigned char)*str) < 0xF0) {                                // 3-byte sequence
    char32_t res = (((unsigned char)*str++) & 0x0F) << 12;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + ((--len, ((unsigned char)*str++)) & 0x3F);
  } else if (((unsigned char)*str) < 0xF8) {                                // 4-byte sequence
    char32_t res = (((unsigned char)*str++) & 0x07) << 18;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += ((--len, ((unsigned char)*str++)) & 0x3F) << 12;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6;
    if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
    return res + ((--len, ((unsigned char)*str++)) & 0x3F);
  } else return ++str, REPLACEMENT_CHAR;                                    // 0xF8..0xFF: invalid lead byte
}
652 
first(const char * str)653 char32_t utf8::first(const char* str) {
654   return decode(str);
655 }
656 
first(const char * str,size_t len)657 char32_t utf8::first(const char* str, size_t len) {
658   return decode(str, len);
659 }
660 
first(const std::string & str)661 char32_t utf8::first(const std::string& str) {
662   return first(str.c_str());
663 }
664 
decode(const std::string & str,std::u32string & decoded)665 void utf8::decode(const std::string& str, std::u32string& decoded) {
666   decode(str.c_str(), decoded);
667 }
668 
669 class utf8::string_decoder::iterator : public std::iterator<std::input_iterator_tag, char32_t> {
670  public:
iterator(const char * str)671   iterator(const char* str) : codepoint(0), next(str) { operator++(); }
iterator(const iterator & it)672   iterator(const iterator& it) : codepoint(it.codepoint), next(it.next) {}
operator ++()673   iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
operator ++(int)674   iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
operator ==(const iterator & other) const675   bool operator==(const iterator& other) const { return next == other.next; }
operator !=(const iterator & other) const676   bool operator!=(const iterator& other) const { return next != other.next; }
operator *()677   const char32_t& operator*() { return codepoint; }
678  private:
679   char32_t codepoint;
680   const char* next;
681 };
682 
// Remember the start of the NUL-terminated string; decoding happens lazily
// in the iterators produced by begin().
utf8::string_decoder::string_decoder(const char* str) : str(str) {}
684 
begin()685 utf8::string_decoder::iterator utf8::string_decoder::begin() {
686   return iterator(str);
687 }
688 
end()689 utf8::string_decoder::iterator utf8::string_decoder::end() {
690   return iterator(nullptr);
691 }
692 
decoder(const char * str)693 utf8::string_decoder utf8::decoder(const char* str) {
694   return string_decoder(str);
695 }
696 
decoder(const std::string & str)697 utf8::string_decoder utf8::decoder(const std::string& str) {
698   return string_decoder(str.c_str());
699 }
700 
701 class utf8::buffer_decoder::iterator : public std::iterator<std::input_iterator_tag, char32_t> {
702  public:
iterator(const char * str,size_t len)703   iterator(const char* str, size_t len) : codepoint(0), next(str), len(len) { operator++(); }
iterator(const iterator & it)704   iterator(const iterator& it) : codepoint(it.codepoint), next(it.next), len(it.len) {}
operator ++()705   iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; }
operator ++(int)706   iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
operator ==(const iterator & other) const707   bool operator==(const iterator& other) const { return next == other.next; }
operator !=(const iterator & other) const708   bool operator!=(const iterator& other) const { return next != other.next; }
operator *()709   const char32_t& operator*() { return codepoint; }
710  private:
711   char32_t codepoint;
712   const char* next;
713   size_t len;
714 };
715 
// Remember the buffer start and byte count; decoding happens lazily in the
// iterators produced by begin().
utf8::buffer_decoder::buffer_decoder(const char* str, size_t len) : str(str), len(len) {}
717 
begin()718 utf8::buffer_decoder::iterator utf8::buffer_decoder::begin() {
719   return iterator(str, len);
720 }
721 
end()722 utf8::buffer_decoder::iterator utf8::buffer_decoder::end() {
723   return iterator(nullptr, 0);
724 }
725 
decoder(const char * str,size_t len)726 utf8::buffer_decoder utf8::decoder(const char* str, size_t len) {
727   return buffer_decoder(str, len);
728 }
729 
// Encode chr as UTF-8 into the raw buffer at str, advancing str past the
// 1-4 bytes written; code points >= 0x200000 are written as the single-byte
// REPLACEMENT_CHAR. The caller must ensure the buffer has room for 4 bytes.
void utf8::append(char*& str, char32_t chr) {
  if (chr < 0x80) *str++ = chr;                                                                                                                                  // 1 byte
  else if (chr < 0x800) { *str++ = 0xC0 + (chr >> 6); *str++ = 0x80 + (chr & 0x3F); }                                                                            // 2 bytes
  else if (chr < 0x10000) { *str++ = 0xE0 + (chr >> 12); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); }                                    // 3 bytes
  else if (chr < 0x200000) { *str++ = 0xF0 + (chr >> 18); *str++ = 0x80 + ((chr >> 12) & 0x3F); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); } // 4 bytes
  else *str++ = REPLACEMENT_CHAR;                                                                                                                                // out of range
}
737 
// Encode chr as UTF-8 and append the 1-4 resulting bytes to str; code points
// >= 0x200000 are appended as the single-byte REPLACEMENT_CHAR.
void utf8::append(std::string& str, char32_t chr) {
  if (chr < 0x80) str += chr;                                                                                                                  // 1 byte
  else if (chr < 0x800) { str += 0xC0 + (chr >> 6); str += 0x80 + (chr & 0x3F); }                                                              // 2 bytes
  else if (chr < 0x10000) { str += 0xE0 + (chr >> 12); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); }                        // 3 bytes
  else if (chr < 0x200000) { str += 0xF0 + (chr >> 18); str += 0x80 + ((chr >> 12) & 0x3F); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } // 4 bytes
  else str += REPLACEMENT_CHAR;                                                                                                                // out of range
}
745 
map(F f,const char * str,std::string & result)746 template<class F> void utf8::map(F f, const char* str, std::string& result) {
747   result.clear();
748 
749   for (char32_t chr; (chr = decode(str)); )
750     append(result, f(chr));
751 }
752 
map(F f,const char * str,size_t len,std::string & result)753 template<class F> void utf8::map(F f, const char* str, size_t len, std::string& result) {
754   result.clear();
755 
756   while (len)
757     append(result, f(decode(str, len)));
758 }
759 
map(F f,const std::string & str,std::string & result)760 template<class F> void utf8::map(F f, const std::string& str, std::string& result) {
761   map(f, str.c_str(), result);
762 }
763 
764 } // namespace unilib
765 
766 /////////
767 // File: model/evaluator.cpp
768 /////////
769 
770 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
771 //
772 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
773 // Mathematics and Physics, Charles University in Prague, Czech Republic.
774 //
775 // This Source Code Form is subject to the terms of the Mozilla Public
776 // License, v. 2.0. If a copy of the MPL was not distributed with this
777 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
778 
// Component selectors: DEFAULT (empty string) requests the model's default
// settings, NONE disables the component entirely.
const string evaluator::DEFAULT;
const string evaluator::NONE = "none";
781 
// Construct an evaluator for the given model with the tokenizer, tagger and
// parser option strings to be used during evaluation (NONE disables a stage).
evaluator::evaluator(const model* m, const string& tokenizer, const string& tagger, const string& parser) {
  set_model(m);
  set_tokenizer(tokenizer);
  set_tagger(tagger);
  set_parser(parser);
}
788 
// Replace the evaluated model (not owned by the evaluator).
void evaluator::set_model(const model* m) {
  this->m = m;
}
792 
// Set the tokenizer options string (NONE disables tokenization).
void evaluator::set_tokenizer(const string& tokenizer) {
  this->tokenizer = tokenizer;
}
796 
// Set the tagger options string (NONE disables tagging).
void evaluator::set_tagger(const string& tagger) {
  this->tagger = tagger;
}
800 
// Set the parser options string (NONE disables parsing).
void evaluator::set_parser(const string& parser) {
  this->parser = parser;
}
804 
// Evaluate the configured model components on gold CoNLL-U data read from
// `is`, writing a textual report to `os`. Depending on which of
// tokenizer/tagger/parser are enabled (!= NONE), the method evaluates:
//   - tokenizer (+ optional tagger/parser) on detokenized plain text,
//   - tagger (+ optional parser) on gold tokenization,
//   - parser alone on gold tokenization with gold tags.
// Returns false and fills `error` on failure; `error` is empty on success.
bool evaluator::evaluate(istream& is, ostream& os, string& error) const {
  error.clear();

  unique_ptr<input_format> conllu_input(input_format::new_conllu_input_format());
  if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false;

  // Paragraphs of reconstructed plain text (used only when tokenizing) and
  // the count of SpaceAfter=No features observed in the gold data.
  vector<string> plain_text_paragraphs(1); unsigned space_after_nos = 0;
  sentence system, gold;
  evaluation_data gold_data, system_goldtok_data, system_goldtok_goldtags_data, system_plaintext_data;

  string block;
  while (conllu_input->read_block(is, block)) {
    conllu_input->set_text(block);
    while (conllu_input->next_sentence(gold, error)) {
      gold_data.add_sentence(gold);

      // Detokenize the input when tokenizing
      if (tokenizer != NONE) {
        // Start a fresh paragraph on document/paragraph boundaries.
        if (gold.get_new_doc() || gold.get_new_par()) {
          plain_text_paragraphs.back().append("\n\n");
          plain_text_paragraphs.emplace_back();
        }

        for (size_t i = 1, j = 0; i < gold.words.size(); i++) {
          // Use the multiword token surface form when word i starts one;
          // otherwise the word form itself.
          const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i];
          plain_text_paragraphs.back().append(tok.form);
          if (tok.get_space_after())
            plain_text_paragraphs.back().push_back(' ');
          else
            space_after_nos += 1;
          // Skip the words covered by the consumed multiword token.
          if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i))
            i = gold.multiword_tokens[j++].id_last;
        }
      }

      // Goldtok data
      // NOTE(review): the nested `tagger != NONE` check below is redundant
      // given this condition; harmless, but could be simplified.
      if (tokenizer == NONE && tagger != NONE) {
        system.clear();
        for (size_t i = 1; i < gold.words.size(); i++)
          system.add_word(gold.words[i].form);

        if (tagger != NONE) {
          if (!m->tag(system, tagger, error))
            return false;
          if (parser != NONE)
            if (!m->parse(system, parser, error))
              return false;
        }
        system_goldtok_data.add_sentence(system);
      }

      // Goldtok_goldtags data: copy gold tags/lemmas, run only the parser.
      if (tokenizer == NONE && tagger == NONE && parser != NONE) {
        system.clear();
        for (size_t i = 1; i < gold.words.size(); i++) {
          system.add_word(gold.words[i].form);
          system.words[i].upostag = gold.words[i].upostag;
          system.words[i].xpostag = gold.words[i].xpostag;
          system.words[i].feats = gold.words[i].feats;
          system.words[i].lemma = gold.words[i].lemma;
        }
        if (parser != NONE)
          if (!m->parse(system, parser, error))
            return false;
        system_goldtok_goldtags_data.add_sentence(system);
      }
    }
    if (!error.empty()) return false;
  }

  // Tokenize, tag and parse plaintext input
  if (tokenizer != NONE) {
    unique_ptr<input_format> t(m->new_tokenizer(tokenizer));
    if (!t) return error.assign("Cannot allocate new tokenizer!"), false;

    for (auto&& plain_text : plain_text_paragraphs) {
      t->set_text(plain_text);
      while (t->next_sentence(system, error)) {
        if (tagger != NONE) {
          if (!m->tag(system, tagger, error))
            return false;

          if (parser != NONE)
            if (!m->parse(system, parser, error))
              return false;
        }
        system_plaintext_data.add_sentence(system);
      }
      if (!error.empty()) return false;
    }
  }

  // Evaluate from plain text
  if (tokenizer != NONE) {
    // Tokenization may only be compared when both sides saw the same
    // character sequence (spaces stripped); otherwise report and skip.
    if (system_plaintext_data.chars != gold_data.chars) {
      os << "Cannot evaluate tokenizer, it returned different sequence of token characters!" << endl;
    } else {
      word_alignment plaintext_alignment;
      word_alignment::best_alignment(system_plaintext_data, gold_data, plaintext_alignment);

      os << "Number of SpaceAfter=No features in gold data: " << space_after_nos << endl;

      auto tokens = evaluate_f1(system_plaintext_data.tokens, gold_data.tokens);
      auto multiwords = evaluate_f1(system_plaintext_data.multiwords, gold_data.multiwords);
      auto sentences = evaluate_f1(system_plaintext_data.sentences, gold_data.sentences);
      auto words = plaintext_alignment.evaluate_f1([](const word&, const word&) {return true;});
      // NOTE(review): this guard suppresses BOTH the "Tokenizer tokens" and
      // the "Tokenizer multiword tokens" lines when neither side contains
      // multiword tokens -- confirm the tokens line is intentionally tied to
      // the multiword condition.
      if (multiwords.total_gold || multiwords.total_system)
        os << "Tokenizer tokens - system: " << tokens.total_system << ", gold: " << tokens.total_gold
           << ", precision: " << fixed << setprecision(2) << 100. * tokens.precision
           << "%, recall: " << 100. * tokens.recall << "%, f1: " << 100. * tokens.f1 << "%" << endl
           << "Tokenizer multiword tokens - system: " << multiwords.total_system << ", gold: " << multiwords.total_gold
           << ", precision: " << fixed << setprecision(2) << 100. * multiwords.precision
           << "%, recall: " << 100. * multiwords.recall << "%, f1: " << 100. * multiwords.f1 << "%" << endl;
      os << "Tokenizer words - system: " << words.total_system << ", gold: " << words.total_gold
         << ", precision: " << fixed << setprecision(2) << 100. * words.precision
         << "%, recall: " << 100. * words.recall << "%, f1: " << 100. * words.f1 << "%" << endl
         << "Tokenizer sentences - system: " << sentences.total_system << ", gold: " << sentences.total_gold
         << ", precision: " << fixed << setprecision(2) << 100. * sentences.precision
         << "%, recall: " << 100. * sentences.recall << "%, f1: " << 100. * sentences.f1 << "%" << endl;

      if (tagger != NONE) {
        auto upostags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag; });
        auto xpostags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.xpostag == u.xpostag; });
        auto feats = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.feats == u.feats; });
        auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
        auto lemmas = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.lemma == u.lemma; });
        os << "Tagging from plain text (CoNLL17 F1 score) - gold forms: " << upostags.total_gold << ", upostag: "
           << fixed << setprecision(2) << 100. * upostags.f1 << "%, xpostag: "
           << 100. * xpostags.f1 << "%, feats: " << 100. * feats.f1 << "%, alltags: "
           << 100. * alltags.f1 << "%, lemmas: " << 100. * lemmas.f1 << '%' << endl;
      }

      if (tagger != NONE && parser != NONE) {
        auto uas = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; });
        auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
        os << "Parsing from plain text with computed tags (CoNLL17 F1 score) - gold forms: " << uas.total_gold
           << ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl;
      }
    }
  }

  // Evaluate tagger from gold tokenization
  if (tokenizer == NONE && tagger != NONE) {
    word_alignment goldtok_alignment;
    if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment))
      return error.assign("Internal UDPipe error (the words of the gold data do not match)!"), false;

    auto upostags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag; });
    auto xpostags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.xpostag == u.xpostag; });
    auto feats = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.feats == u.feats; });
    auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
    auto lemmas = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.lemma == u.lemma; });
    os << "Tagging from gold tokenization - forms: " << upostags.total_gold << ", upostag: "
       << fixed << setprecision(2) << 100. * upostags.f1 << "%, xpostag: "
       << 100. * xpostags.f1 << "%, feats: " << 100. * feats.f1 << "%, alltags: "
       << 100. * alltags.f1 << "%, lemmas: " << 100. * lemmas.f1 << '%' << endl;

    if (parser != NONE) {
      auto uas = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; });
      auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
      os << "Parsing from gold tokenization with computed tags - forms: " << uas.total_gold
         << ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl;
    }
  }

  // Evaluate parser from gold tokenization
  if (tokenizer == NONE && tagger == NONE && parser != NONE) {
    word_alignment goldtok_goldtags_alignment;
    if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment))
      return error.assign("Internal UDPipe error (the words of the goldtok data do not match)!"), false;

    auto uas = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; });
    auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
    os << "Parsing from gold tokenization with gold tags - forms: " << uas.total_gold
       << ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl;
  }

  return true;
}
984 
985 template <class T>
evaluate_f1(const vector<pair<size_t,T>> & system,const vector<pair<size_t,T>> & gold)986 evaluator::f1_info evaluator::evaluate_f1(const vector<pair<size_t, T>>& system, const vector<pair<size_t, T>>& gold) {
987   size_t both = 0;
988   for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
989     if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first))
990       si++;
991     else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first))
992       gi++;
993     else
994       both += system[si++].second == gold[gi++].second;
995 
996   return {system.size(), gold.size(), system.size() ? both / double(system.size()) : 0.,
997     gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. };
998 }
999 
// Store a word together with its character span [start, end) in the
// concatenated token-character stream, renumbering it with the absolute id.
evaluator::evaluation_data::word_data::word_data(size_t start, size_t end, int id, bool is_multiword, const word& w)
  : start(start), end(end), is_multiword(is_multiword), w(w)
{
  // Use absolute ids for words and heads
  this->w.id = id;
  // Preserve the relative head offset; 0 stays 0 (root).
  this->w.head = w.head ? id + (w.head - w.id) : 0;

  // Forms in MWTs are compared case-insensitively in LCS, therefore
  // we lowercase them here.
  unilib::utf8::map(unilib::unicode::lowercase, w.form, this->w.form);

  // During evaluation, only universal part of DEPREL (up to a colon) is used.
  auto colon = w.deprel.find(':');
  if (colon != string::npos)
    this->w.deprel.erase(colon);
}
1016 
// Append one sentence to the accumulated evaluation data: records the
// character spans of its tokens and the whole sentence (spaces stripped),
// the multiword tokens, and every word renumbered with absolute ids.
void evaluator::evaluation_data::add_sentence(const sentence& s) {
  // Sentence span starts here; its end is filled in at the bottom.
  sentences.emplace_back(chars.size(), chars.size());
  for (size_t i = 1, j = 0; i < s.words.size(); i++) {
    tokens.emplace_back(chars.size(), chars.size());
    // Surface form of the token: multiword token form when word i starts one.
    const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form;
    // Collect the token characters, ignoring spaces inside forms.
    for (auto&& chr : unilib::utf8::decoder(form))
      if (chr != ' ')
        chars.push_back(chr);
    tokens.back().second = chars.size();

    if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) {
      // Record the multiword token keyed by its start offset; its value is
      // the surface form followed by the (lowercased) member word forms.
      multiwords.emplace_back(tokens.back().first, form);
      for (size_t k = i; int(k) <= s.multiword_tokens[j].id_last; k++) {
        words.emplace_back(tokens.back().first, tokens.back().second, words.size() + 1, true, s.words[k]);
        multiwords.back().second.append(" ").append(words.back().w.form);
      }
      // Skip the words covered by this multiword token.
      i = s.multiword_tokens[j++].id_last;
    } else {
      words.emplace_back(tokens.back().first, tokens.back().second, words.size() + 1, false, s.words[i]);
    }
  }
  sentences.back().second = chars.size();
}
1040 
1041 template <class Equals>
evaluate_f1(Equals equals)1042 evaluator::f1_info evaluator::word_alignment::evaluate_f1(Equals equals) {
1043   size_t both = 0;
1044   for (auto&& match : matched)
1045     if (equals(match.system, match.gold))
1046       both++;
1047 
1048   return {total_system, total_gold, total_system ? both / double(total_system) : 0.,
1049     total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
1050 }
1051 
perfect_alignment(const evaluation_data & system,const evaluation_data & gold,word_alignment & alignment)1052 bool evaluator::word_alignment::perfect_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment) {
1053   alignment.total_system = system.words.size();
1054   alignment.total_gold = gold.words.size();
1055   if (alignment.total_system != alignment.total_gold) return false;
1056 
1057   alignment.matched.clear();
1058   alignment.matched.reserve(alignment.total_system);
1059   for (size_t i = 0; i < system.words.size(); i++) {
1060     if (system.words[i].w.form != gold.words[i].w.form)
1061       return false;
1062     alignment.matched.emplace_back(system.words[i].w, gold.words[i].w);
1063   }
1064 
1065   return true;
1066 }
1067 
// Align system and gold words that may have been tokenized differently.
// Outside multiword tokens, words align when their character spans are
// identical; within (possibly overlapping) multiword ranges, the best
// alignment is found with a longest-common-subsequence over the lowercased
// forms. Finally, system HEAD indices are remapped to gold word ids.
void evaluator::word_alignment::best_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment) {
  alignment.total_system = system.words.size();
  alignment.total_gold = gold.words.size();
  alignment.matched.clear();

  for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); )
    if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) &&
        (gold.words[gi].start > system.words[si].start || !gold.words[gi].is_multiword)) {
      // No multiword, align using start+end indices
      if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end)
        alignment.matched.emplace_back(system.words[si++].w, gold.words[gi++].w);
      else if (system.words[si].start <= gold.words[gi].start)
        si++;
      else
        gi++;
    } else {
      // We have a multiword
      size_t ss = si, gs = gi, multiword_range_end = system.words[si].is_multiword ? system.words[si].end : gold.words[gi].end;

      // Find all words in the multiword range
      while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end :
                                           system.words[si].end <= multiword_range_end)) ||
             (gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end :
                                         gold.words[gi].end <= multiword_range_end))) {
        // Extend the multiword range
        if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) {
          if (system.words[si].is_multiword) multiword_range_end = max(multiword_range_end, system.words[si].end);
          si++;
        } else {
          if (gold.words[gi].is_multiword) multiword_range_end = max(multiword_range_end, gold.words[gi].end);
          gi++;
        }
      }

      // LCS on the chosen words
      // lcs[s][g] holds the LCS length of the suffixes starting at s and g;
      // the table is filled bottom-up from the last rows/columns.
      vector<vector<unsigned>> lcs(si - ss);
      for (unsigned s = si - ss; s--; ) {
        lcs[s].resize(gi - gs);
        for (unsigned g = gi - gs; g--; ) {
          lcs[s][g] = max(lcs[s][g], s+1 < lcs.size() ? lcs[s+1][g] : 0);
          lcs[s][g] = max(lcs[s][g], g+1 < lcs[s].size() ? lcs[s][g+1] : 0);
          if (system.words[ss + s].w.form == gold.words[gs + g].w.form)
            lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0));
        }
      }

      // Walk the LCS table and emit the matched pairs.
      for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) {
        if (system.words[ss + s].w.form == gold.words[gs + g].w.form)
          alignment.matched.emplace_back(system.words[ss + s++].w, gold.words[gs + g++].w);
        else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0))
          s++;
        else /* if (lcs[s][g] == (g+1 < lcs[s].size() ? lcs[s][g+1] : 0)) */
          g++;
      }
    }

  // Reindex HEAD pointers in system to use gold indices
  // (unaligned heads become -1 via the gold_aligned default).
  vector<int> gold_aligned(system.words.size(), -1);
  for (auto&& match : alignment.matched)
    gold_aligned[match.system.id - 1] = match.gold.id;
  for (auto&& match : alignment.matched)
    if (match.system.head)
      match.system.head = gold_aligned[match.system.head - 1];
}
1132 
1133 /////////
1134 // File: morphodita/tokenizer/tokenizer.h
1135 /////////
1136 
1137 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
1138 //
1139 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1140 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1141 //
1142 // This Source Code Form is subject to the terms of the Mozilla Public
1143 // License, v. 2.0. If a copy of the MPL was not distributed with this
1144 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1145 
1146 namespace morphodita {
1147 
1148 // Range of a token, measured in Unicode characters, not UTF8 bytes.
struct token_range {
  size_t start;   // index of the first character of the token
  size_t length;  // number of characters in the token

  // Default constructor leaves the members uninitialized.
  token_range() {}
  token_range(size_t start, size_t length) : start(start), length(length) {}
};
1156 
// Abstract tokenizer interface: feed text with set_text, then repeatedly
// call next_sentence to obtain sentences of forms and/or token ranges.
class tokenizer {
 public:
  virtual ~tokenizer() {}

  // Set the text to tokenize. NOTE(review): make_copy presumably controls
  // whether the implementation keeps its own copy of the text (otherwise the
  // caller must keep it alive) -- confirm in the implementations.
  virtual void set_text(string_piece text, bool make_copy = false) = 0;
  // Produce the next sentence into the given output vectors; returns false
  // when no more sentences are available.
  virtual bool next_sentence(vector<string_piece>* forms, vector<token_range>* tokens) = 0;

  // Static factory methods
  static tokenizer* new_vertical_tokenizer();

  static tokenizer* new_czech_tokenizer();
  static tokenizer* new_english_tokenizer();
  static tokenizer* new_generic_tokenizer();
};
1171 
1172 } // namespace morphodita
1173 
1174 /////////
1175 // File: morphodita/tokenizer/tokenizer_factory.h
1176 /////////
1177 
1178 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
1179 //
1180 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
1181 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1182 //
1183 // This Source Code Form is subject to the terms of the Mozilla Public
1184 // License, v. 2.0. If a copy of the MPL was not distributed with this
1185 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1186 
1187 namespace morphodita {
1188 
// Factory producing tokenizer instances from a serialized description.
class tokenizer_factory {
 public:
  virtual ~tokenizer_factory() {}

  // Load a tokenizer factory from a stream or a named file.
  static tokenizer_factory* load(istream& is);
  static tokenizer_factory* load(const char* fname);

  // Construct a new tokenizer instance.
  virtual tokenizer* new_tokenizer() const = 0;
};
1199 
1200 } // namespace morphodita
1201 
1202 /////////
1203 // File: morphodita/derivator/derivator.h
1204 /////////
1205 
1206 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
1207 //
1208 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
1209 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1210 //
1211 // This Source Code Form is subject to the terms of the Mozilla Public
1212 // License, v. 2.0. If a copy of the MPL was not distributed with this
1213 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1214 
1215 namespace morphodita {
1216 
// A single lemma produced by derivation lookup.
struct derivated_lemma {
  string lemma;
};
1220 
// Interface for navigating a lemma derivation graph in both directions.
class derivator {
 public:
  virtual ~derivator() {}

  // For given lemma, return the parent in the derivation graph.
  // The lemma is assumed to be lemma id and any lemma comments are ignored.
  virtual bool parent(string_piece lemma, derivated_lemma& parent) const = 0;

  // For given lemma, return the children in the derivation graph.
  // The lemma is assumed to be lemma id and any lemma comments are ignored.
  virtual bool children(string_piece lemma, vector<derivated_lemma>& children) const = 0;
};
1233 
1234 } // namespace morphodita
1235 
1236 /////////
1237 // File: morphodita/morpho/morpho.h
1238 /////////
1239 
1240 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
1241 //
1242 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1243 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1244 //
1245 // This Source Code Form is subject to the terms of the Mozilla Public
1246 // License, v. 2.0. If a copy of the MPL was not distributed with this
1247 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1248 
1249 namespace morphodita {
1250 
// A word form together with its morphological tag.
struct tagged_form {
  string form;
  string tag;

  tagged_form() {}
  tagged_form(const string& form, const string& tag) : form(form), tag(tag) {}
};
1258 
// A lemma together with its morphological tag.
struct tagged_lemma {
  string lemma;
  string tag;

  tagged_lemma() {}
  tagged_lemma(const string& lemma, const string& tag) : lemma(lemma), tag(tag) {}
};
1266 
// A lemma together with all its (form, tag) pairs.
struct tagged_lemma_forms {
  string lemma;
  vector<tagged_form> forms;

  tagged_lemma_forms() {}
  tagged_lemma_forms(const string& lemma) : lemma(lemma) {}
};
1274 
// Morphological dictionary interface providing analysis (form -> lemma+tag),
// generation (lemma -> forms), lemma structure queries, and optionally an
// associated tokenizer and derivator.
class morpho {
 public:
  virtual ~morpho() {}

  // Load a serialized morphology from a stream or a named file.
  static morpho* load(istream& is);
  static morpho* load(const char* fname);

  enum guesser_mode { NO_GUESSER = 0, GUESSER = 1 };

  // Perform morphologic analysis of a form. The form is given by a pointer and
  // length and therefore does not need to be '\0' terminated.  The guesser
  // parameter specifies whether a guesser can be used if the form is not found
  // in the dictionary. Output is assigned to the lemmas vector.
  //
  // If the form is found in the dictionary, analyses are assigned to lemmas
  // and NO_GUESSER returned. If guesser == GUESSER and the form analyses are
  // found using a guesser, they are assigned to lemmas and GUESSER is
  // returned.  Otherwise <0 is returned and lemmas are filled with one
  // analysis containing given form as lemma and a tag for unknown word.
  virtual int analyze(string_piece form, guesser_mode guesser, vector<tagged_lemma>& lemmas) const = 0;

  // Perform morphologic generation of a lemma. The lemma is given by a pointer
  // and length and therefore does not need to be '\0' terminated. Optionally
  // a tag_wildcard can be specified (or be NULL) and if so, results are
  // filtered using this wildcard. The guesser parameter speficies whether
  // a guesser can be used if the lemma is not found in the dictionary. Output
  // is assigned to the forms vector.
  //
  // Tag_wildcard can be either NULL or a wildcard applied to the results.
  // A ? in the wildcard matches any character, [bytes] matches any of the
  // bytes and [^bytes] matches any byte different from the specified ones.
  // A - has no special meaning inside the bytes and if ] is first in bytes, it
  // does not end the bytes group.
  //
  // If the given lemma is only a raw lemma, all lemma ids with this raw lemma
  // are returned. Otherwise only matching lemma ids are returned, ignoring any
  // lemma comments. For every found lemma, matching forms are filtered using
  // the tag_wildcard. If at least one lemma is found in the dictionary,
  // NO_GUESSER is returned. If guesser == GUESSER and the lemma is found by
  // the guesser, GUESSER is returned. Otherwise, forms are cleared and <0 is
  // returned.
  virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector<tagged_lemma_forms>& forms) const = 0;

  // Rawlemma and lemma id identification
  virtual int raw_lemma_len(string_piece lemma) const = 0;
  virtual int lemma_id_len(string_piece lemma) const = 0;

  // Rawform identification
  virtual int raw_form_len(string_piece form) const = 0;

  // Construct a new tokenizer instance appropriate for this morphology.
  // Can return NULL if no such tokenizer exists.
  virtual tokenizer* new_tokenizer() const = 0;

  // Return a derivator for this morphology, or NULL if it does not exist.
  // The returned instance is owned by the morphology and should not be deleted.
  virtual const derivator* get_derivator() const;

 protected:
  // Optional derivation data backing get_derivator.
  unique_ptr<derivator> derinet;
};
1336 
1337 } // namespace morphodita
1338 
1339 /////////
1340 // File: morphodita/tagger/tagger.h
1341 /////////
1342 
1343 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
1344 //
1345 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1346 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1347 //
1348 // This Source Code Form is subject to the terms of the Mozilla Public
1349 // License, v. 2.0. If a copy of the MPL was not distributed with this
1350 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1351 
1352 namespace morphodita {
1353 
// Morphological tagger interface: disambiguates analyses of tokenized input.
class tagger {
 public:
  virtual ~tagger() {}

  // Load a serialized tagger from a named file or a stream.
  static tagger* load(const char* fname);
  static tagger* load(istream& is);

  // Return morpho associated with the tagger. Do not delete the pointer, it is
  // owned by the tagger instance and deleted in the tagger destructor.
  virtual const morpho* get_morpho() const = 0;

  // Perform morphologic analysis and subsequent disambiguation.
  virtual void tag(const vector<string_piece>& forms, vector<tagged_lemma>& tags, morpho::guesser_mode guesser = morpho::guesser_mode(-1)) const = 0;

  // Perform disambiguation only on given analyses.
  virtual void tag_analyzed(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, vector<int>& tags) const = 0;

  // Construct a new tokenizer instance appropriate for this tagger.
  // Can return NULL if no such tokenizer exists.
  // Is equal to get_morpho()->new_tokenizer.
  tokenizer* new_tokenizer() const;
};
1376 
1377 } // namespace morphodita
1378 
1379 /////////
1380 // File: parsito/tree/node.h
1381 /////////
1382 
1383 // This file is part of Parsito <http://github.com/ufal/parsito/>.
1384 //
1385 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1386 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1387 //
1388 // This Source Code Form is subject to the terms of the Mozilla Public
1389 // License, v. 2.0. If a copy of the MPL was not distributed with this
1390 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1391 
1392 namespace parsito {
1393 
// A single node of a dependency tree.
class node {
 public:
  int id;         // 0 is root, >0 is sentence node, <0 is undefined
  string form;    // form
  string lemma;   // lemma
  string upostag; // universal part-of-speech tag
  string xpostag; // language-specific part-of-speech tag
  string feats;   // list of morphological features
  int head;       // head, 0 is root, <0 is without parent
  string deprel;  // dependency relation to the head
  string deps;    // secondary dependencies
  string misc;    // miscellaneous information

  // Presumably ids of dependent nodes -- maintained by tree::set_head; confirm there.
  vector<int> children;

  // New nodes start without a parent (head == -1).
  node(int id = -1, const string& form = string()) : id(id), form(form), head(-1) {}
};
1411 
1412 } // namespace parsito
1413 
1414 /////////
1415 // File: parsito/tree/tree.h
1416 /////////
1417 
1418 // This file is part of Parsito <http://github.com/ufal/parsito/>.
1419 //
1420 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1421 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1422 //
1423 // This Source Code Form is subject to the terms of the Mozilla Public
1424 // License, v. 2.0. If a copy of the MPL was not distributed with this
1425 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1426 
1427 namespace parsito {
1428 
// A dependency tree: a vector of nodes where nodes[0] is the artificial
// root node (with form root_form) and further nodes are sentence words.
class tree {
 public:
  tree();

  vector<node> nodes;

  // NOTE(review): methods below are defined elsewhere; the descriptions
  // follow from their use in this bundle — confirm with the definitions.
  bool empty();                  // presumably true when only the root is present
  void clear();                  // drop all word nodes
  node& add_node(const string& form);  // append a node; reference is invalidated by further add_node calls (vector growth)
  void set_head(int id, int head, const string& deprel);  // link node `id` under `head` with relation `deprel`
  void unlink_all_nodes();       // remove all head links

  // Form used for the artificial root node.
  static const string root_form;
};
1443 
1444 } // namespace parsito
1445 
1446 /////////
1447 // File: parsito/configuration/configuration.h
1448 /////////
1449 
1450 // This file is part of Parsito <http://github.com/ufal/parsito/>.
1451 //
1452 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1453 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1454 //
1455 // This Source Code Form is subject to the terms of the Mozilla Public
1456 // License, v. 2.0. If a copy of the MPL was not distributed with this
1457 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1458 
1459 namespace parsito {
1460 
// Configuration of a transition-based parser: the stack and buffer of
// node ids over the tree currently being parsed.
class configuration {
 public:
  configuration(bool single_root) : single_root(single_root) {}

  void init(tree* t);  // start parsing tree t
  bool final();        // presumably true when no transitions remain — confirm with definition

  tree* t;             // tree being parsed (raw pointer; not owned here)
  vector<int> stack;   // ids of partially processed nodes
  vector<int> buffer;  // ids of nodes waiting to be processed

  bool single_root;    // whether only a single node may attach to the root
};
1474 
1475 } // namespace parsito
1476 
1477 /////////
1478 // File: utils/binary_decoder.h
1479 /////////
1480 
1481 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
1482 //
1483 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1484 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1485 //
1486 // This Source Code Form is subject to the terms of the Mozilla Public
1487 // License, v. 2.0. If a copy of the MPL was not distributed with this
1488 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1489 
1490 namespace utils {
1491 
1492 //
1493 // Declarations
1494 //
1495 
// Error thrown when a binary_decoder read or seek would run past the end
// of the available data.
class binary_decoder_error : public std::runtime_error {
 public:
  explicit binary_decoder_error(const char* description) : std::runtime_error(description) {}
};

// Sequential decoder of little-endian binary data held in an internal
// buffer. Call fill(len) to obtain a writable buffer of len bytes, load
// the raw data into it, then consume it with the next_* methods. Every
// read is bounds-checked and throws binary_decoder_error on exhaustion.
//
// NOTE: the former dynamic exception specifications (throw(binary_decoder_error))
// were removed — they were deprecated since C++11 and are ill-formed in C++17.
class binary_decoder {
 public:
  // Resize the internal buffer to len bytes, reset the read position to its
  // beginning and return a pointer where the caller loads the raw data.
  inline unsigned char* fill(unsigned len);

  // Consume the next 1/2/4 bytes as an unsigned little-endian value;
  // throws binary_decoder_error when not enough data remains.
  inline unsigned next_1B();
  inline unsigned next_2B();
  inline unsigned next_4B();
  // Consume a length-prefixed string: 1B length, with 255 marking a 4B length.
  inline void next_str(std::string& str);
  // Consume `elements` values of type T, returning a pointer into the buffer.
  template <class T> inline const T* next(unsigned elements);

  inline bool is_end();           // true when all data has been consumed
  inline unsigned tell();         // current offset from the buffer start
  inline void seek(unsigned pos); // set offset; throws when past the end

 private:
  std::vector<unsigned char> buffer;
  const unsigned char* data;      // current read position
  const unsigned char* data_end;  // one past the last valid byte
};

//
// Definitions
//

unsigned char* binary_decoder::fill(unsigned len) {
  buffer.resize(len);
  data = buffer.data();
  data_end = buffer.data() + len;

  return buffer.data();
}

unsigned binary_decoder::next_1B() {
  if (data + 1 > data_end) throw binary_decoder_error("No more data in binary_decoder");
  return *data++;
}

unsigned binary_decoder::next_2B() {
  if (data + sizeof(uint16_t) > data_end) throw binary_decoder_error("No more data in binary_decoder");
  // memcpy avoids the unaligned access and strict-aliasing UB of the
  // previous *(uint16_t*)data read; compilers emit a single load anyway.
  uint16_t result;
  std::memcpy(&result, data, sizeof(uint16_t));
  data += sizeof(uint16_t);
  return result;
}

unsigned binary_decoder::next_4B() {
  if (data + sizeof(uint32_t) > data_end) throw binary_decoder_error("No more data in binary_decoder");
  uint32_t result;
  std::memcpy(&result, data, sizeof(uint32_t));
  data += sizeof(uint32_t);
  return result;
}

void binary_decoder::next_str(std::string& str) {
  unsigned len = next_1B();
  if (len == 255) len = next_4B();
  str.assign(next<char>(len), len);
}

template <class T> const T* binary_decoder::next(unsigned elements) {
  // Compare sizes rather than advancing the pointer first: for huge
  // `elements` the original `data + sizeof(T) * elements` could overflow
  // (undefined behavior) before the check was made.
  if (sizeof(T) * size_t(elements) > size_t(data_end - data)) throw binary_decoder_error("No more data in binary_decoder");
  const T* result = (const T*) data;
  data += sizeof(T) * elements;
  return result;
}

bool binary_decoder::is_end() {
  return data >= data_end;
}

unsigned binary_decoder::tell() {
  return data - buffer.data();
}

void binary_decoder::seek(unsigned pos) {
  if (pos > buffer.size()) throw binary_decoder_error("Cannot seek past end of binary_decoder");
  data = buffer.data() + pos;
}
1577 
1578 } // namespace utils
1579 
1580 /////////
1581 // File: parsito/parser/parser.h
1582 /////////
1583 
1584 // This file is part of Parsito <http://github.com/ufal/parsito/>.
1585 //
1586 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1587 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1588 //
1589 // This Source Code Form is subject to the terms of the Mozilla Public
1590 // License, v. 2.0. If a copy of the MPL was not distributed with this
1591 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1592 
1593 namespace parsito {
1594 
1595 // Parser
// Parser
// Abstract transition-based dependency parser: parse() fills in the heads
// and dependency relations of a tree whose forms are already set.
class parser {
 public:
  virtual ~parser() {};

  // Parse the tree in place. beam_size 0 presumably selects a default beam;
  // when cost is non-NULL it receives the cost of the chosen parse —
  // TODO confirm both with the concrete implementations.
  virtual void parse(tree& t, unsigned beam_size = 0, double* cost = nullptr) const = 0;

  // Extreme values for the `cache` argument of load() (FULL_CACHE == INT_MAX).
  enum { NO_CACHE = 0, FULL_CACHE = 2147483647};
  static parser* load(const char* file, unsigned cache = 1000);
  static parser* load(istream& in, unsigned cache = 1000);

 protected:
  // Load the model data; called by the static load() overloads.
  virtual void load(binary_decoder& data, unsigned cache) = 0;
  // Factory creating an empty parser instance for the given name.
  static parser* create(const string& name);
};
1610 
1611 } // namespace parsito
1612 
1613 /////////
1614 // File: tokenizer/multiword_splitter.h
1615 /////////
1616 
1617 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
1618 //
1619 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
1620 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1621 //
1622 // This Source Code Form is subject to the terms of the Mozilla Public
1623 // License, v. 2.0. If a copy of the MPL was not distributed with this
1624 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1625 
// Splitter of multi-word tokens: append_token adds a token to a sentence,
// expanding it into several words when a full-token or suffix rule matches.
class multiword_splitter {
 public:
  void append_token(string_piece token, string_piece misc, sentence& s) const;

  // Load a splitter from its binary representation; presumably returns
  // nullptr on failure — confirm with the definition.
  static multiword_splitter* load(istream& is);

 private:
  multiword_splitter(unsigned version) : version(version) {}
  unsigned version;                 // format version of the loaded data
  enum { VERSION_LATEST = 2 };
  friend class multiword_splitter_trainer;

  // Words that a matching token (or token suffix) is split into.
  struct suffix_info {
    vector<string> words;
  };
  unordered_map<string, suffix_info> full_rules, suffix_rules;
};
1643 
1644 /////////
1645 // File: utils/parse_int.h
1646 /////////
1647 
1648 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
1649 //
1650 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1651 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1652 //
1653 // This Source Code Form is subject to the terms of the Mozilla Public
1654 // License, v. 2.0. If a copy of the MPL was not distributed with this
1655 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1656 
1657 namespace utils {
1658 
1659 //
1660 // Declarations
1661 //
1662 
1663 // Try to parse an int from given string. If the int cannot be parsed or does
1664 // not fit into int, false is returned and the error string is filled using the
1665 // value_name argument.
1666 inline bool parse_int(string_piece str, const char* value_name, int& value, string& error);
1667 
1668 // Try to parse an int from given string. If the int cannot be parsed or does
1669 // not fit into int, an error is displayed and program exits.
1670 inline int parse_int(string_piece str, const char* value_name);
1671 
1672 //
1673 // Definitions
1674 //
1675 
parse_int(string_piece str,const char * value_name,int & value,string & error)1676 bool parse_int(string_piece str, const char* value_name, int& value, string& error) {
1677   string_piece original = str;
1678 
1679   // Skip spaces
1680   while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
1681     str.str++, str.len--;
1682 
1683   // Allow minus
1684   bool positive = true;
1685   if (str.len && (str.str[0] == '+' || str.str[0] == '-')) {
1686     positive = str.str[0] == '+';
1687     str.str++, str.len--;
1688   }
1689 
1690   // Parse value, checking for overflow/underflow
1691   if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': empty string."), false;
1692   if (!(str.str[0] >= '0' || str.str[0] <= '9')) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false;
1693 
1694   value = 0;
1695   while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
1696     if (positive) {
1697       if (value > (numeric_limits<int>::max() - (str.str[0] - '0')) / 10)
1698         return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': overflow occured."), false;
1699       value = 10 * value + (str.str[0] - '0');
1700     } else {
1701       if (value < (numeric_limits<int>::min() + (str.str[0] - '0')) / 10)
1702         return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': underflow occured."), false;
1703       value = 10 * value - (str.str[0] - '0');
1704     }
1705     str.str++, str.len--;
1706   }
1707 
1708   // Skip spaces
1709   while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
1710     str.str++, str.len--;
1711 
1712   // Check for remaining characters
1713   if (str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false;
1714 
1715   return true;
1716 }
1717 
parse_int(string_piece str,const char * value_name)1718 int parse_int(string_piece str, const char* value_name) {
1719   int result;
1720   string error;
1721 
1722   if (!parse_int(str, value_name, result, error))
1723     runtime_failure(error);
1724 
1725   return result;
1726 }
1727 
1728 } // namespace utils
1729 
1730 /////////
1731 // File: utils/named_values.h
1732 /////////
1733 
1734 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
1735 //
1736 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1737 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1738 //
1739 // This Source Code Form is subject to the terms of the Mozilla Public
1740 // License, v. 2.0. If a copy of the MPL was not distributed with this
1741 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1742 
1743 namespace utils {
1744 
1745 //
1746 // Declarations
1747 //
1748 
// Parser of semicolon-separated "name=value" option strings
// (see parse() for the exact format, including file: and data: values).
class named_values {
 public:
  typedef unordered_map<string, string> map;

  // Parse `values` into parsed_values; returns false and fills error on failure.
  inline static bool parse(const string& values, map& parsed_values, string& error);
};
1755 
1756 //
1757 // Definitions
1758 //
1759 
// Parse a semicolon-separated list of items into parsed_values. Each item
// is either a bare "name" (empty value) or "name=value"; a value may use
// the special forms "file:PATH" (the value becomes the file's content) and
// "data:LENGTH:RAW" (the value is the next LENGTH bytes, which may contain
// semicolons). Returns false and fills error on failure.
bool named_values::parse(const string& values, map& parsed_values, string& error) {
  error.clear();
  parsed_values.clear();

  string name, file;
  for (size_t start = 0; start < values.size(); ) {
    // Skip empty items (consecutive semicolons).
    while (start < values.size() && values[start] == ';') start++;
    if (start >= values.size()) break;

    // The name ends at '=', ';' or the end of the string.
    size_t name_end = values.find_first_of("=;", start);
    name.assign(values, start, name_end - start);
    string& value = parsed_values[name];

    if (name_end == string::npos) {
      // Bare name at the end of the string; value stays empty.
      start = name_end;
    } else if (values[name_end] == ';') {
      // Bare name; value stays empty.
      start = name_end + 1;
    } else /* if (values[name_end] == '=') */ {
      size_t equal_sign = name_end;

      if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) {
        // Value of type file: the value is the whole content of the file.
        size_t file_name = equal_sign + 1 + 5;
        size_t semicolon = min(values.find(';', file_name), values.size());

        file.assign(values, file_name, semicolon - file_name);
        ifstream is(file.c_str());
        if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false;

        // Read in fixed-size chunks; the final partial chunk is appended
        // using gcount() after read() fails.
        char buffer[1024];
        for (value.clear(); is.read(buffer, sizeof(buffer)); )
          value.append(buffer, sizeof(buffer));
        value.append(buffer, is.gcount());

        start = semicolon + 1;
      } else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) {
        // Value of type data: "data:LENGTH:RAW" — RAW may contain semicolons,
        // so its extent is given by the decimal LENGTH, not by a delimiter.
        size_t data_size_start = equal_sign + 1 + 5;
        size_t data_size_end = values.find(':', data_size_start);
        if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false;

        int data_size;
        if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false;

        size_t data_start = data_size_end + 1, data_end = data_start + data_size;
        if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false;
        if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false;

        value.assign(values, data_start, data_end - data_start);
        start = data_end + 1;
      } else {
        // Plain string value, terminated by ';' or end of string.
        size_t semicolon = min(values.find(';', equal_sign), values.size());
        value.assign(values, equal_sign + 1, semicolon - equal_sign - 1);
        start = semicolon + 1;
      }
    }
  }

  return true;
}
1821 
1822 } // namespace utils
1823 
1824 /////////
1825 // File: utils/threadsafe_stack.h
1826 /////////
1827 
1828 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
1829 //
1830 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1831 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1832 //
1833 // This Source Code Form is subject to the terms of the Mozilla Public
1834 // License, v. 2.0. If a copy of the MPL was not distributed with this
1835 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1836 
1837 namespace utils {
1838 
1839 //
1840 // Declarations
1841 //
1842 
// A stack of owned T instances guarded by an atomic-flag spin lock.
// push() transfers ownership of the pointer to the stack; pop() returns
// the most recently pushed element (releasing ownership to the caller)
// or nullptr when the stack is empty.
template <class T>
class threadsafe_stack {
 public:
  inline void push(T* t);
  inline T* pop();

 private:
  std::vector<std::unique_ptr<T>> stack;
  std::atomic_flag lock = ATOMIC_FLAG_INIT;
};

//
// Definitions
//

template <class T>
void threadsafe_stack<T>::push(T* t) {
  // Spin until the flag is acquired.
  while (lock.test_and_set(std::memory_order_acquire))
    ;
  stack.emplace_back(t);
  lock.clear(std::memory_order_release);
}

template <class T>
T* threadsafe_stack<T>::pop() {
  T* result = nullptr;

  // Spin until the flag is acquired.
  while (lock.test_and_set(std::memory_order_acquire))
    ;
  if (!stack.empty()) {
    result = stack.back().release();
    stack.pop_back();
  }
  lock.clear(std::memory_order_release);

  return result;
}
1878 
1879 } // namespace utils
1880 
1881 /////////
1882 // File: model/model_morphodita_parsito.h
1883 /////////
1884 
1885 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
1886 //
1887 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1888 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1889 //
1890 // This Source Code Form is subject to the terms of the Mozilla Public
1891 // License, v. 2.0. If a copy of the MPL was not distributed with this
1892 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1893 
// UDPipe model combining a MorphoDiTa tokenizer and tagger(s) with a
// Parsito dependency parser.
class model_morphodita_parsito : public model {
 public:
  virtual input_format* new_tokenizer(const string& options) const override;
  virtual bool tag(sentence& s, const string& options, string& error) const override;
  virtual bool parse(sentence& s, const string& options, string& error) const override;

  // Load a model from its binary representation.
  static model* load(istream& is);

 private:
  model_morphodita_parsito(unsigned version);
  unsigned version;  // model file version; see the version history in model_morphodita_parsito.cpp
  enum { VERSION_LATEST = 3 };

  unique_ptr<morphodita::tokenizer_factory> tokenizer_factory;  // may be empty (model without a tokenizer)
  unique_ptr<multiword_splitter> splitter;                      // may be empty; passed to the tokenizer wrapper
  // One MorphoDiTa tagger together with flags selecting which word fields it fills.
  struct tagger_model {
    bool upostag; int lemma; bool xpostag, feats;  // lemma == 2 marks absolute lemmas (added in version 2)
    unique_ptr<morphodita::tagger> tagger;

    tagger_model(bool upostag, int lemma, bool xpostag, bool feats, morphodita::tagger* tagger)
        : upostag(upostag), lemma(lemma), xpostag(xpostag), feats(feats), tagger(tagger) {}
  };
  vector<tagger_model> taggers;
  unique_ptr<parsito::parser> parser;

  // Reusable tagging work buffers, pooled in a threadsafe stack so that
  // concurrent tag() calls do not reallocate them.
  struct tagger_cache {
    vector<string> forms_normalized;
    vector<string_piece> forms_string_pieces;
    vector<morphodita::tagged_lemma> lemmas;
  };
  mutable threadsafe_stack<tagger_cache> tagger_caches;

  // Reusable parsing work buffers, pooled like tagger_cache above.
  struct parser_cache {
    parsito::tree tree;
    named_values::map options;
  };
  mutable threadsafe_stack<parser_cache> parser_caches;

  // Parsing worker which can also report the parse cost (used by the
  // joint tokenizer below).
  bool parse(sentence& s, const string& options, string& error, double* cost) const;

  // Tokenizer choosing sentence boundaries jointly with parsing —
  // presumably candidate segmentations are scored with the parser cost and
  // the configured log-probabilities (confirm with the definitions).
  class joint_with_parsing_tokenizer : public input_format {
   public:
    joint_with_parsing_tokenizer(input_format* tokenizer, const model_morphodita_parsito& model,
                                 int max_sentence_len, double change_boundary_logprob, double sentence_logprob)
        : tokenizer(tokenizer), model(model), max_sentence_len(max_sentence_len),
          change_boundary_logprob(change_boundary_logprob), sentence_logprob(sentence_logprob) {}

    virtual bool read_block(istream& is, string& block) const override;
    virtual void reset_document(string_piece id) override;
    virtual void set_text(string_piece text, bool make_copy = false) override;
    virtual bool next_sentence(sentence& s, string& error) override;

   private:
    bool parse_paragraph(vector<sentence>& paragraph, string& error);

    unique_ptr<input_format> tokenizer;  // owned underlying tokenizer
    const model_morphodita_parsito& model;
    int max_sentence_len;
    double change_boundary_logprob;
    double sentence_logprob;

    string_piece text;        // current input text (may point into text_copy)
    string text_copy;
    bool new_document = true;
    string document_id;
    unsigned sentence_id = 1;
    vector<sentence> sentences;   // sentences of the current paragraph
    size_t sentences_index = 0;   // next sentence of `sentences` to return
  };

  // Copy the fields selected by the flags from a MorphoDiTa analysis into a word.
  void fill_word_analysis(const morphodita::tagged_lemma& analysis, bool upostag, int lemma, bool xpostag, bool feats, word& word) const;
  const string& normalize_form(string_piece form, string& output) const;
  const string& normalize_lemma(string_piece lemma, string& output) const;
  friend class trainer_morphodita_parsito;
};
1969 
1970 /////////
1971 // File: model/model.cpp
1972 /////////
1973 
1974 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
1975 //
1976 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1977 // Mathematics and Physics, Charles University in Prague, Czech Republic.
1978 //
1979 // This Source Code Form is subject to the terms of the Mozilla Public
1980 // License, v. 2.0. If a copy of the MPL was not distributed with this
1981 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
1982 
// Definitions of the option-name constants declared in class model.
// DEFAULT is intentionally the empty string.
const string model::DEFAULT;
const string model::TOKENIZER_NORMALIZED_SPACES = "normalized_spaces";
const string model::TOKENIZER_PRESEGMENTED = "presegmented";
const string model::TOKENIZER_RANGES = "ranges";
1987 
load(const char * fname)1988 model* model::load(const char* fname) {
1989   ifstream in(fname, ifstream::in | ifstream::binary);
1990   if (!in.is_open()) return nullptr;
1991   return load(in);
1992 }
1993 
load(istream & is)1994 model* model::load(istream& is) {
1995   char len;
1996   if (!is.get(len)) return nullptr;
1997   string name(len, ' ');
1998   if (!is.read(&name[0], len)) return nullptr;
1999 
2000   if (name == "morphodita_parsito") return model_morphodita_parsito::load(is);
2001 
2002   return nullptr;
2003 }
2004 
2005 /////////
2006 // File: tokenizer/morphodita_tokenizer_wrapper.h
2007 /////////
2008 
2009 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
2010 //
2011 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2012 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2013 //
2014 // This Source Code Form is subject to the terms of the Mozilla Public
2015 // License, v. 2.0. If a copy of the MPL was not distributed with this
2016 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2017 
// input_format adapter around a MorphoDiTa tokenizer, producing UDPipe
// sentences; optionally splits multi-word tokens via `splitter` and fills
// normalized spaces / token ranges information.
class morphodita_tokenizer_wrapper : public input_format {
 public:
  // Takes ownership of tokenizer; splitter is borrowed and may be nullptr
  // (see model_morphodita_parsito::new_tokenizer, which passes splitter.get()).
  morphodita_tokenizer_wrapper(morphodita::tokenizer* tokenizer, const multiword_splitter* splitter, bool normalized_spaces, bool token_ranges);

  virtual bool read_block(istream& is, string& block) const override;
  virtual void reset_document(string_piece id) override;
  virtual void set_text(string_piece text, bool make_copy = false) override;
  virtual bool next_sentence(sentence& s, string& error) override;

 private:
  unique_ptr<morphodita::tokenizer> tokenizer;  // owned wrapped tokenizer
  const multiword_splitter* splitter;           // borrowed; may be nullptr
  bool normalized_spaces, token_ranges;

  bool new_document = true;
  string document_id;
  unsigned preceeding_newlines = 2;
  unsigned sentence_id = 1;

  string_piece text;       // current input (may point into text_copy when a copy was requested)
  string text_copy;
  size_t unicode_offset = 0, text_unicode_length = 0;
  string saved_spaces;
  vector<string_piece> forms;
  vector<morphodita::token_range> tokens;
  token tok;
};
2045 
2046 /////////
2047 // File: utils/getpara.h
2048 /////////
2049 
2050 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
2051 //
2052 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2053 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2054 //
2055 // This Source Code Form is subject to the terms of the Mozilla Public
2056 // License, v. 2.0. If a copy of the MPL was not distributed with this
2057 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2058 
2059 namespace utils {
2060 
2061 //
2062 // Declarations
2063 //
2064 
2065 // Read paragraph until EOF or end line. All encountered \n are stored.
2066 inline istream& getpara(istream& is, string& para);
2067 
2068 //
2069 // Definitions
2070 //
2071 
// Read one paragraph: all lines up to and including the first empty line
// (or EOF). Every line read — including the terminating empty one — is
// appended to para together with its '\n'.
std::istream& getpara(std::istream& is, std::string& para) {
  para.clear();

  std::string line;
  while (std::getline(is, line)) {
    para.append(line).push_back('\n');
    if (line.empty()) break;
  }

  // A paragraph terminated by EOF without a trailing empty line is still a
  // valid paragraph: keep only eofbit so the caller's stream test succeeds.
  if (!para.empty() && is.eof()) is.clear(std::istream::eofbit);
  return is;
}
2085 
2086 } // namespace utils
2087 
2088 /////////
2089 // File: utils/parse_double.h
2090 /////////
2091 
2092 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
2093 //
2094 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2095 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2096 //
2097 // This Source Code Form is subject to the terms of the Mozilla Public
2098 // License, v. 2.0. If a copy of the MPL was not distributed with this
2099 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2100 
2101 namespace utils {
2102 
2103 //
2104 // Declarations
2105 //
2106 
// Try to parse a double from the given string. If the double cannot be parsed
// or does not fit into a double, false is returned and the error string is
// filled using the value_name argument.
2110 inline bool parse_double(string_piece str, const char* value_name, double& value, string& error);
2111 
// Try to parse a double from the given string. If the double cannot be parsed
// or does not fit into a double, an error is displayed and the program exits.
2115 
2116 //
2117 // Definitions
2118 //
2119 
parse_double(string_piece str,const char * value_name,double & value,string & error)2120 bool parse_double(string_piece str, const char* value_name, double& value, string& error) {
2121   string_piece original = str;
2122 
2123   // Skip spaces
2124   while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
2125     str.str++, str.len--;
2126 
2127   // Allow plus/minus
2128   bool negative = false;
2129   if (str.len && (str.str[0] == '+' || str.str[0] == '-')) {
2130     negative = str.str[0] == '-';
2131     str.str++, str.len--;
2132   }
2133 
2134   // Parse value, checking for overflow/underflow
2135   if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': empty string."), false;
2136   if (!(str.str[0] >= '0' || str.str[0] <= '9')) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false;
2137 
2138   value = 0;
2139   while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
2140     value = 10 * value + (str.str[0] - '0');
2141     str.str++, str.len--;
2142   }
2143 
2144   // If there is a decimal point, parse the rest of the
2145   if (str.len && str.str[0] == '.') {
2146     double divider = 1;
2147 
2148     str.str++, str.len--;
2149     while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
2150       value = 10 * value + (str.str[0] - '0');
2151       divider *= 10.;
2152       str.str++, str.len--;
2153     }
2154 
2155     value /= divider;
2156   }
2157   if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false;
2158 
2159   // Optionally parse an exponent
2160   if (str.len && (str.str[0] == 'e' || str.str[0] == 'E')) {
2161     str.str++, str.len--;
2162 
2163     double exponent = 0;
2164     bool exponent_negative = false;
2165     if (str.len && (str.str[0] == '+' || str.str[0] == '-')) {
2166       exponent_negative = str.str[0] == '-';
2167       str.str++, str.len--;
2168     }
2169 
2170     while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
2171       exponent = 10 * exponent + (str.str[0] - '0');
2172       str.str++, str.len--;
2173     }
2174 
2175     exponent = pow(10., exponent_negative ? -exponent : exponent);
2176     if (!isfinite(exponent)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent overflow occured."), false;
2177     if (exponent == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent underflow occured."), false;
2178 
2179     if (value) {
2180       value *= exponent;
2181       if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false;
2182       if (value == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': underflow occured."), false;
2183     }
2184   }
2185 
2186   // Apply initial minus
2187   if (negative) value *= -1;
2188 
2189   // Skip spaces
2190   while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
2191     str.str++, str.len--;
2192 
2193   // Check for remaining characters
2194   if (str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false;
2195 
2196   return true;
2197 }
2198 
parse_double(string_piece str,const char * value_name)2199 double parse_double(string_piece str, const char* value_name) {
2200   double result;
2201   string error;
2202 
2203   if (!parse_double(str, value_name, result, error))
2204     runtime_failure(error);
2205 
2206   return result;
2207 }
2208 
2209 } // namespace utils
2210 
2211 /////////
2212 // File: model/model_morphodita_parsito.cpp
2213 /////////
2214 
2215 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
2216 //
2217 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2218 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2219 //
2220 // This Source Code Form is subject to the terms of the Mozilla Public
2221 // License, v. 2.0. If a copy of the MPL was not distributed with this
2222 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2223 
2224 // Versions:
2225 // 1 - initial version
2226 // 2 - add absolute lemmas (tagger_model::lemma == 2)
2227 //   - use Arabic and space normalization
2228 
// Create a new tokenizer according to the given named-values options.
// Recognized options: normalized_spaces, ranges, presegmented and
// joint_with_parsing (with joint_max_sentence_len,
// joint_change_boundary_logprob and joint_sentence_logprob).
// Returns nullptr when the model has no tokenizer or the options cannot be
// parsed; the caller owns the returned instance.
input_format* model_morphodita_parsito::new_tokenizer(const string& options) const {
  if (!tokenizer_factory)
    return nullptr;

  named_values::map parsed_options;
  string parse_error;
  if (!named_values::parse(options, parsed_options, parse_error))
    return nullptr;

  // Mere presence of the option enables the feature.
  bool normalized_spaces = parsed_options.count("normalized_spaces");
  bool token_ranges = parsed_options.count("ranges");

  unique_ptr<input_format> result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(), splitter.get(), normalized_spaces, token_ranges));

  // Presegmented: wrap the tokenizer in a presegmented-input adapter.
  if (parsed_options.count("presegmented") && result)
    result.reset(input_format::new_presegmented_tokenizer(result.release()));

  // Joint with parsing: wrap the tokenizer so sentence boundaries are
  // chosen together with parsing, using the parameters below.
  if (parsed_options.count("joint_with_parsing") && result) {
    int max_sentence_len = 20;  // default when joint_max_sentence_len is absent
    if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
      return nullptr;

    double change_boundary_logprob = -0.5;  // default when the option is absent
    if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
      return nullptr;

    double sentence_logprob = -0.5;  // default when the option is absent
    if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
      return nullptr;

    result.reset(new joint_with_parsing_tokenizer(result.release(), *this, max_sentence_len, change_boundary_logprob, sentence_logprob));
  }

  return result.release();
}
2266 
tag(sentence & s,const string &,string & error) const2267 bool model_morphodita_parsito::tag(sentence& s, const string& /*options*/, string& error) const {
2268   error.clear();
2269 
2270   if (taggers.empty()) return error.assign("No tagger defined for the UDPipe model!"), false;
2271   if (s.empty()) return true;
2272 
2273   tagger_cache* c = tagger_caches.pop();
2274   if (!c) c = new tagger_cache();
2275 
2276   // Prepare input forms
2277   c->forms_normalized.resize(s.words.size() - 1);
2278   c->forms_string_pieces.resize(s.words.size() - 1);
2279   for (size_t i = 1; i < s.words.size(); i++)
2280     c->forms_string_pieces[i - 1] = normalize_form(s.words[i].form, c->forms_normalized[i - 1]);
2281 
2282   // Clear first
2283   for (size_t i = 1; i < s.words.size(); i++) {
2284     s.words[i].lemma.assign("_");
2285     s.words[i].upostag.clear();
2286     s.words[i].xpostag.clear();
2287     s.words[i].feats.clear();
2288   }
2289 
2290   // Fill information from the tagger models
2291   for (auto&& tagger : taggers) {
2292     if (!tagger.tagger) return error.assign("No tagger defined for the UDPipe model!"), false;
2293 
2294     tagger.tagger->tag(c->forms_string_pieces, c->lemmas);
2295 
2296     for (size_t i = 0; i < c->lemmas.size(); i++)
2297       fill_word_analysis(c->lemmas[i], tagger.upostag, tagger.lemma, tagger.xpostag, tagger.feats, s.words[i+1]);
2298   }
2299 
2300   tagger_caches.push(c);
2301   return true;
2302 }
2303 
parse(sentence & s,const string & options,string & error) const2304 bool model_morphodita_parsito::parse(sentence& s, const string& options, string& error) const {
2305   return parse(s, options, error, nullptr);
2306 }
2307 
parse(sentence & s,const string & options,string & error,double * cost) const2308 bool model_morphodita_parsito::parse(sentence& s, const string& options, string& error, double* cost) const {
2309   error.clear();
2310 
2311   if (!parser) return error.assign("No parser defined for the UDPipe model!"), false;
2312   if (s.empty()) return true;
2313 
2314   parser_cache* c = parser_caches.pop();
2315   if (!c) c = new parser_cache();
2316 
2317   int beam_search = 5;
2318   if (!named_values::parse(options, c->options, error))
2319     return false;
2320   if (c->options.count("beam_search"))
2321     if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error))
2322       return false;
2323 
2324   c->tree.clear();
2325   for (size_t i = 1; i < s.words.size(); i++) {
2326     c->tree.add_node(string());
2327     normalize_form(s.words[i].form, c->tree.nodes.back().form);
2328     normalize_lemma(s.words[i].lemma, c->tree.nodes.back().lemma);
2329     c->tree.nodes.back().upostag.assign(s.words[i].upostag);
2330     c->tree.nodes.back().xpostag.assign(s.words[i].xpostag);
2331     c->tree.nodes.back().feats.assign(s.words[i].feats);
2332     c->tree.nodes.back().deps.assign(s.words[i].deps);
2333     c->tree.nodes.back().misc.assign(s.words[i].misc);
2334   }
2335 
2336   parser->parse(c->tree, beam_search, cost);
2337   for (size_t i = 1; i < s.words.size(); i++)
2338     s.set_head(i, c->tree.nodes[i].head, c->tree.nodes[i].deprel);
2339 
2340   parser_caches.push(c);
2341   return true;
2342 }
2343 
// Deserializes a model from `is`. Returns nullptr on malformed input or an
// unsupported version; otherwise the caller owns the returned model.
model* model_morphodita_parsito::load(istream& is) {
  char version;
  if (!is.get(version)) return nullptr;
  if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr;

  // Because UDPipe 1.0 does not check the model version,
  // a specific sentinel was added since version 2 so that
  // loading of such model fail on UDPipe 1.0
  if (version >= 2) {
    char sentinel;
    if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
    if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
  }

  unique_ptr<model_morphodita_parsito> m(new model_morphodita_parsito((unsigned char)version));
  if (!m) return nullptr;

  // Optional tokenizer: one-byte flag, then (when nonzero) the serialized
  // tokenizer factory followed by the multiword splitter.
  char tokenizer;
  if (!is.get(tokenizer)) return nullptr;
  m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr);
  if (tokenizer && !m->tokenizer_factory) return nullptr;
  m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr);
  if (tokenizer && !m->splitter) return nullptr;

  // Taggers: one-byte count, then per tagger three one-byte settings
  // (lemma mode, xpostag flag, feats flag) and the serialized tagger itself.
  // Only the first tagger gets the `i == 0` flag set.
  m->taggers.clear();
  char taggers; if (!is.get(taggers)) return nullptr;
  for (char i = 0; i < taggers; i++) {
    char lemma; if (!is.get(lemma)) return nullptr;
    char xpostag; if (!is.get(xpostag)) return nullptr;
    char feats; if (!is.get(feats)) return nullptr;
    morphodita::tagger* tagger = morphodita::tagger::load(is);
    if (!tagger) return nullptr;
    m->taggers.emplace_back(i == 0, int(lemma), bool(xpostag), bool(feats), tagger);
  }

  // Optional parser, again guarded by a one-byte flag.
  char parser;
  if (!is.get(parser)) return nullptr;
  m->parser.reset(parser ? parsito::parser::load(is) : nullptr);
  if (parser && !m->parser) return nullptr;

  return m.release();
}
2386 
// Stores the serialized model version; all other members are filled by load().
model_morphodita_parsito::model_morphodita_parsito(unsigned version) : version(version) {}
2388 
read_block(istream & is,string & block) const2389 bool model_morphodita_parsito::joint_with_parsing_tokenizer::read_block(istream& is, string& block) const {
2390   block.clear();
2391 
2392   for (string line; getline(is, line); ) {
2393     block.append(line);
2394     block.push_back('\n');
2395   }
2396 
2397   if (is.eof() && !block.empty()) is.clear(istream::eofbit);
2398   return bool(is);
2399 }
2400 
reset_document(string_piece id)2401 void model_morphodita_parsito::joint_with_parsing_tokenizer::reset_document(string_piece id) {
2402   new_document = true;
2403   document_id.assign(id.str, id.len);
2404   sentence_id = 1;
2405   set_text("");
2406   sentences.clear();
2407   sentences_index = 0;
2408 }
2409 
set_text(string_piece text,bool make_copy)2410 void model_morphodita_parsito::joint_with_parsing_tokenizer::set_text(string_piece text, bool make_copy) {
2411   if (make_copy) {
2412     text_copy.assign(text.str, text.len);
2413     text.str = text_copy.c_str();
2414   }
2415   this->text = text;
2416 }
2417 
// Returns the next sentence of the current text in `s`. On the first call
// after set_text, the whole text is tokenized and every paragraph is
// re-segmented by the joint tokenization+parsing search; the resulting
// sentences are buffered and then served one by one. Returns false with an
// empty `error` when the input is exhausted, false with `error` set on failure.
bool model_morphodita_parsito::joint_with_parsing_tokenizer::next_sentence(sentence& s, string& error) {
  error.clear();

  // Unconsumed text pending: tokenize and re-segment it now.
  if (text.len) {
    sentences.clear();
    sentences_index = 0;

    tokenizer->set_text(text, false);

    sentence input;
    vector<sentence> paragraph;
    while (tokenizer->next_sentence(input, error)) {
      // A sentence starting a new paragraph flushes the previous paragraph.
      if (input.get_new_par() && !paragraph.empty()) {
        if (!parse_paragraph(paragraph, error)) return false;
        for (auto&& sentence : paragraph)
          sentences.push_back(sentence);
        paragraph.clear();
      }
      paragraph.push_back(input);
    }
    if (!error.empty()) return false;

    // Flush the final paragraph.
    if (!paragraph.empty()) {
      if (!parse_paragraph(paragraph, error)) return false;
      for (auto&& sentence : paragraph)
        sentences.push_back(sentence);
    }

    // Mark the buffered text as consumed.
    text.len = 0;
  }

  // Serve the next buffered sentence, if any.
  if (sentences_index < sentences.size()) {
    s = sentences[sentences_index++];
    return true;
  }

  return false;
}
2456 
// Re-segments a tokenizer-produced paragraph: the words of all its sentences
// are concatenated and the best division into sentences of at most
// max_sentence_len words is found by dynamic programming, scoring each
// candidate sentence by the parser cost plus boundary penalties. The
// paragraph is replaced by the resulting sentences. Returns false and sets
// `error` when tagging or parsing of a candidate sentence fails.
bool model_morphodita_parsito::joint_with_parsing_tokenizer::parse_paragraph(vector<sentence>& paragraph, string& error) {
  sentence all_words;
  vector<bool> sentence_boundary(1, true);  // true where the tokenizer ended a sentence
  vector<bool> token_boundary(1, true);     // false inside multiword tokens (no break allowed there)

  // Concatenate all sentences into all_words, rebasing word and
  // multiword-token ids by the running word offset.
  for (auto&& s : paragraph) {
    unsigned offset = all_words.words.size() - 1;
    for (unsigned i = 1; i < s.words.size(); i++) {
      all_words.words.push_back(s.words[i]);
      all_words.words.back().id += offset;
      sentence_boundary.push_back(i+1 == s.words.size());
      token_boundary.push_back(true);
    }

    for (auto&& mwt : s.multiword_tokens) {
      all_words.multiword_tokens.push_back(mwt);
      all_words.multiword_tokens.back().id_first += offset;
      all_words.multiword_tokens.back().id_last += offset;
      // Forbid sentence breaks inside the multiword token.
      for (int i = all_words.multiword_tokens.back().id_first; i < all_words.multiword_tokens.back().id_last; i++)
        token_boundary[i] = false;
    }
  }

  // best_logprob[i]: score of the best segmentation of words 1..i;
  // best_length[i]: length of the last sentence in that best segmentation.
  vector<double> best_logprob(all_words.words.size(), -numeric_limits<double>::infinity()); best_logprob[0] = 0.;
  vector<unsigned> best_length(all_words.words.size(), 0);
  sentence s;

  for (unsigned start = 1; start < all_words.words.size(); start++) {
    if (!token_boundary[start - 1]) continue;
    s.clear();
    // Grow the candidate sentence word by word up to max_sentence_len.
    for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) {
      s.words.push_back(all_words.words[end - 1]);
      s.words.back().id -= start - 1;
      if (!token_boundary[end - 1]) continue;

      // Reset any previous parse of the candidate words.
      for (unsigned i = 1; i < s.words.size(); i++) {
        s.words[i].head = -1;
        s.words[i].children.clear();
      }

      // Score = parser cost + per-sentence penalty + a penalty for each
      // endpoint that was not an original tokenizer sentence boundary.
      double cost;
      if (!model.tag(s, DEFAULT, error)) return false;
      if (!model.parse(s, DEFAULT, error, &cost)) return false;
      cost += sentence_logprob + change_boundary_logprob * (2 - int(sentence_boundary[start - 1]) - int(sentence_boundary[end - 1]));
      if (best_logprob[start - 1] + cost > best_logprob[end - 1]) {
        best_logprob[end - 1] = best_logprob[start - 1] + cost;
        best_length[end - 1] = end - start;
      }
    }
  }

  // Backtrack the chosen sentence lengths from the last word.
  vector<unsigned> sentence_lengths;
  for (unsigned end = all_words.words.size(); end > 1; end -= best_length[end - 1])
    sentence_lengths.push_back(best_length[end - 1]);

  paragraph.clear();

  // Turn the lengths into prefix sums and slice all_words into the new
  // sentences, rebasing word and multiword-token ids to be sentence-relative.
  sentence_lengths.push_back(1);
  reverse(sentence_lengths.begin(), sentence_lengths.end());
  for (unsigned i = 1; i < sentence_lengths.size(); i++) {
    sentence_lengths[i] += sentence_lengths[i - 1];

    paragraph.emplace_back();
    while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) {
      paragraph.back().multiword_tokens.push_back(all_words.multiword_tokens.front());
      paragraph.back().multiword_tokens.back().id_first -= sentence_lengths[i-1] - 1;
      paragraph.back().multiword_tokens.back().id_last -= sentence_lengths[i-1] - 1;
      all_words.multiword_tokens.erase(all_words.multiword_tokens.begin());
    }

    for (unsigned word = sentence_lengths[i - 1]; word < sentence_lengths[i]; word++) {
      paragraph.back().words.push_back(all_words.words[word]);
      paragraph.back().words.back().id -= sentence_lengths[i-1] - 1;
      paragraph.back().words.back().head = -1;
      paragraph.back().words.back().children.clear();
    }
  }

  // Propagate document/paragraph markers to the first produced sentence.
  if (!paragraph.empty()) {
    if (new_document) {
      paragraph.front().set_new_doc(true, document_id);
      new_document = false;
    }

    paragraph.front().set_new_par(true);
  }

  return true;
}
2546 
// Fills one word from a MorphoDiTa analysis. Lemma mode 1 copies the lemma
// verbatim; mode 2 additionally resolves "~replacement~normalized_form"
// lemmas to `replacement` when the word's normalized form matches. The tag
// is interpreted as "UPOSTAG<sep>XPOSTAG<sep>FEATS", where the separator is
// the tag's first character.
void model_morphodita_parsito::fill_word_analysis(const morphodita::tagged_lemma& analysis, bool upostag, int lemma, bool xpostag, bool feats, word& word) const {
  // Lemma
  if (lemma == 1) {
    word.lemma.assign(analysis.lemma);
  } else if (lemma == 2) {
    word.lemma.assign(analysis.lemma);

    // Lemma matching ~replacement~normalized_form is changed to replacement.
    if (analysis.lemma[0] == '~') {
      auto end = analysis.lemma.find('~', 1);
      if (end != string::npos) {
        // word.lemma temporarily holds the normalized form for the comparison.
        normalize_form(word.form, word.lemma);
        if (analysis.lemma.compare(end + 1, string::npos, word.lemma) == 0)
          word.lemma.assign(analysis.lemma, 1, end - 1);
        else
          word.lemma.assign(analysis.lemma);
      }
    }
  }
  // Undo the space encoding applied during normalization (see normalize_form).
  if (version == 2) {
    // Replace '\001' back to spaces
    for (auto && chr : word.lemma)
      if (chr == '\001')
        chr = ' ';
  } else if (version >= 3) {
    // Replace '0xC2 0xA0' back to spaces
    for (size_t i = 0; i + 1 < word.lemma.size(); i++)
      if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0))
        word.lemma.replace(i, 2, 1, ' ');
  }

  if (!upostag && !xpostag && !feats) return;

  // UPOSTag: first separator-delimited field. The min() clamping keeps the
  // indices valid even for short or malformed tags.
  char separator = analysis.tag[0];
  size_t start = min(size_t(1), analysis.tag.size()), end = min(analysis.tag.find(separator, 1), analysis.tag.size());
  if (upostag) word.upostag.assign(analysis.tag, start, end - start);

  if (!xpostag && !feats) return;

  // XPOSTag: second field.
  start = min(end + 1, analysis.tag.size());
  end = min(analysis.tag.find(separator, start), analysis.tag.size());
  if (xpostag) word.xpostag.assign(analysis.tag, start, end - start);

  if (!feats) return;

  // Features: the remainder of the tag.
  start = min(end + 1, analysis.tag.size());
  word.feats.assign(analysis.tag, start, analysis.tag.size() - start);
}
2598 
normalize_form(string_piece form,string & output) const2599 const string& model_morphodita_parsito::normalize_form(string_piece form, string& output) const {
2600   using unilib::utf8;
2601 
2602   // No normalization on version 1
2603   if (version <= 1) return output.assign(form.str, form.len);
2604 
2605   // If requested, replace space by \001 in version 2 and by &nbsp; (\u00a0) since version 3
2606 
2607   // Arabic normalization since version 2, implementation resulted from
2608   // discussion with Otakar Smrz and Nasrin Taghizadeh.
2609   // 1. Remove https://codepoints.net/U+0640 without any reasonable doubt :)
2610   // 2. Remove https://codepoints.net/U+0652
2611   // 3. Remove https://codepoints.net/U+0670
2612   // 4. Remove everything from https://codepoints.net/U+0653 to
2613   //    https://codepoints.net/U+0657 though they are probably very rare in date
2614   // 5. Remove everything from https://codepoints.net/U+064B to
2615   //    https://codepoints.net/U+0650
2616   // 6. Remove https://codepoints.net/U+0651
2617   // 7. Replace https://codepoints.net/U+0671 with https://codepoints.net/U+0627
2618   // 8. Replace https://codepoints.net/U+0622 with https://codepoints.net/U+0627
2619   // 9. Replace https://codepoints.net/U+0623 with https://codepoints.net/U+0627
2620   // 10. Replace https://codepoints.net/U+0625 with https://codepoints.net/U+0627
2621   // 11. Replace https://codepoints.net/U+0624 with https://codepoints.net/U+0648
2622   // 12. Replace https://codepoints.net/U+0626 with https://codepoints.net/U+064A
2623   // One might also consider replacing some Farsi characters that might be typed
2624   // unintentionally (by Iranians writing Arabic language texts):
2625   // 13. Replace https://codepoints.net/U+06CC with https://codepoints.net/U+064A
2626   // 14. Replace https://codepoints.net/U+06A9 with https://codepoints.net/U+0643
2627   // 15. Replace https://codepoints.net/U+06AA with https://codepoints.net/U+0643
2628   //
2629   // Not implemented:
2630   // There is additional challenge with data coming from Egypt (such as printed
2631   // or online newspapers), where the word-final https://codepoints.net/U+064A
2632   // may be switched for https://codepoints.net/U+0649 and visa versa. Also, the
2633   // word-final https://codepoints.net/U+0647 could actually represent https://
2634   // codepoints.net/U+0629. You can experiment with the following replacements,
2635   // but I would rather apply them only after classifying the whole document as
2636   // following such convention:
2637   // 1. Replace https://codepoints.net/U+0629 with https://codepoints.net/U+0647
2638   //    (frequent femine ending markers would appear like a third-person
2639   //    masculine pronoun clitic instead)
2640   // 2. Replace https://codepoints.net/U+0649 with https://codepoints.net/U+064A
2641   //    (some "weak" words would become even more ambiguous or appear as if
2642   //    with a first-person pronoun clitic)
2643 
2644   output.clear();
2645   for (auto&& chr : utf8::decoder(form.str, form.len)) {
2646     // Arabic normalization
2647     if (chr == 0x640 || (chr >= 0x64B && chr <= 0x657) || chr == 0x670) {}
2648     else if (chr == 0x622) utf8::append(output, 0x627);
2649     else if (chr == 0x623) utf8::append(output, 0x627);
2650     else if (chr == 0x624) utf8::append(output, 0x648);
2651     else if (chr == 0x625) utf8::append(output, 0x627);
2652     else if (chr == 0x626) utf8::append(output, 0x64A);
2653     else if (chr == 0x671) utf8::append(output, 0x627);
2654     else if (chr == 0x6A9) utf8::append(output, 0x643);
2655     else if (chr == 0x6AA) utf8::append(output, 0x643);
2656     else if (chr == 0x6CC) utf8::append(output, 0x64A);
2657     // Space normalization
2658     else if (chr == ' ' && version == 2) utf8::append(output, 0x01);
2659     else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0);
2660     // Default
2661     else utf8::append(output, chr);
2662   }
2663 
2664   // Make sure we do not remove everything
2665   if (output.empty() && form.len)
2666     utf8::append(output, utf8::first(form.str, form.len));
2667 
2668   return output;
2669 }
2670 
normalize_lemma(string_piece lemma,string & output) const2671 const string& model_morphodita_parsito::normalize_lemma(string_piece lemma, string& output) const {
2672   using unilib::utf8;
2673 
2674   // No normalization on version 1 and 2
2675   if (version <= 2) return output.assign(lemma.str, lemma.len);
2676 
2677   // Normalize spaces by &nbsp; since version 3
2678   output.clear();
2679   for (size_t i = 0; i < lemma.len; i++) {
2680     // Space normalization
2681     if (lemma.str[i] == ' ') utf8::append(output, 0xA0);
2682     // Default
2683     else output.push_back(lemma.str[i]);
2684   }
2685 
2686   return output;
2687 }
2688 
2689 /////////
2690 // File: model/pipeline.h
2691 /////////
2692 
2693 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
2694 //
2695 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2696 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2697 //
2698 // This Source Code Form is subject to the terms of the Mozilla Public
2699 // License, v. 2.0. If a copy of the MPL was not distributed with this
2700 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2701 
// A complete processing pipeline: input reading (or tokenization), tagging,
// parsing and output writing, configured by format/option strings.
class pipeline {
 public:
  pipeline(const model* m, const string& input, const string& tagger, const string& parser, const string& output);

  // Setters; the model pointer is only stored, never owned by the pipeline.
  void set_model(const model* m);
  void set_input(const string& input);
  void set_tagger(const string& tagger);
  void set_parser(const string& parser);
  void set_output(const string& output);

  // Immediate mode processes input block by block instead of slurping it.
  void set_immediate(bool immediate);
  void set_document_id(const string& document_id);

  // Runs the whole pipeline on `is`, writing to `os`; see pipeline.cpp.
  bool process(istream& is, ostream& os, string& error) const;

  // DEFAULT (empty) keeps a stage's default options; NONE skips the stage.
  static const string DEFAULT;
  static const string NONE;

 private:
  const model* m;
  string input, tokenizer, tagger, parser, output;
  string document_id;
  bool immediate;
};
2726 
2727 /////////
2728 // File: sentence/output_format.h
2729 /////////
2730 
2731 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
2732 //
2733 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2734 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2735 //
2736 // This Source Code Form is subject to the terms of the Mozilla Public
2737 // License, v. 2.0. If a copy of the MPL was not distributed with this
2738 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2739 
// Interface for writing sentences in a particular output format.
class output_format {
 public:
  virtual ~output_format() {}

  virtual void write_sentence(const sentence& s, ostream& os) = 0;
  // Called once after the last sentence; the default implementation is a no-op.
  virtual void finish_document(ostream& /*os*/) {}

  // Static factory methods; may return nullptr (callers such as
  // pipeline::process check for it) and the caller takes ownership otherwise.
  static output_format* new_output_format(const string& name);
  static output_format* new_conllu_output_format(const string& options = string());
  static output_format* new_epe_output_format(const string& options = string());
  static output_format* new_matxin_output_format(const string& options = string());
  static output_format* new_horizontal_output_format(const string& options = string());
  static output_format* new_plaintext_output_format(const string& options = string());
  static output_format* new_vertical_output_format(const string& options = string());

  // Names of predefined output format options.
  static const string CONLLU_V1;
  static const string CONLLU_V2;
  static const string HORIZONTAL_PARAGRAPHS;
  static const string PLAINTEXT_NORMALIZED_SPACES;
  static const string VERTICAL_PARAGRAPHS;
};
2762 
2763 /////////
2764 // File: utils/getwhole.h
2765 /////////
2766 
2767 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
2768 //
2769 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2770 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2771 //
2772 // This Source Code Form is subject to the terms of the Mozilla Public
2773 // License, v. 2.0. If a copy of the MPL was not distributed with this
2774 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2775 
namespace utils {

using namespace std;

//
// Declarations
//

// Read whole content until EOF. All encountered \n are stored.
inline istream& getwhole(istream& is, string& whole);

//
// Definitions
//

istream& getwhole(istream& is, string& whole) {
  whole.clear();

  string line;
  while (getline(is, line)) {
    whole += line;
    whole += '\n';
  }

  // Input without a trailing newline is still a success: keep just eofbit
  // set, so the stream converts to true exactly when something was read.
  if (!whole.empty() && is.eof()) is.clear(istream::eofbit);
  return is;
}

} // namespace utils
2800 
2801 /////////
2802 // File: model/pipeline.cpp
2803 /////////
2804 
2805 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
2806 //
2807 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2808 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2809 //
2810 // This Source Code Form is subject to the terms of the Mozilla Public
2811 // License, v. 2.0. If a copy of the MPL was not distributed with this
2812 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2813 
// DEFAULT (empty string) keeps a stage's default behavior; NONE ("none")
// skips the tagging/parsing stage entirely (see pipeline::process).
const string pipeline::DEFAULT;
const string pipeline::NONE = "none";
2816 
// Constructs a pipeline over model `m`; immediate mode defaults to off and
// every setting is normalized through the corresponding setter.
pipeline::pipeline(const model* m, const string& input, const string& tagger, const string& parser, const string& output) : immediate(false) {
  set_model(m);
  set_input(input);
  set_tagger(tagger);
  set_parser(parser);
  set_output(output);
}
2824 
// Replaces the model; the pointer is only stored, never owned.
void pipeline::set_model(const model* m) {
  this->m = m;
}
2828 
set_input(const string & input)2829 void pipeline::set_input(const string& input) {
2830   tokenizer.clear();
2831 
2832   if (input.empty()) {
2833     this->input = "conllu";
2834   } else if (input == "tokenize" || input == "tokenizer") {
2835     this->input = "tokenizer";
2836   } else if (input.compare(0, 10, "tokenizer=") == 0) {
2837     this->input = "tokenizer";
2838     tokenizer.assign(input, 10, string::npos);
2839   } else {
2840     this->input = input;
2841   }
2842 }
2843 
// Stores the tagger options verbatim (NONE disables tagging in process()).
void pipeline::set_tagger(const string& tagger) {
  this->tagger = tagger;
}
2847 
// Stores the parser options verbatim (NONE disables parsing in process()).
void pipeline::set_parser(const string& parser) {
  this->parser = parser;
}
2851 
set_output(const string & output)2852 void pipeline::set_output(const string& output) {
2853   this->output = output.empty() ? "conllu" : output;
2854 }
2855 
// Toggles immediate mode (block-by-block processing in process()).
void pipeline::set_immediate(bool immediate) {
  this->immediate = immediate;
}
2859 
// Sets the document id passed to the reader's reset_document in process().
void pipeline::set_document_id(const string& document_id) {
  this->document_id = document_id;
}
2863 
// Runs the whole pipeline (read, optionally tag, optionally parse, write) on
// `is`, writing results to `os`. Returns false and sets `error` on failure.
bool pipeline::process(istream& is, ostream& os, string& error) const {
  error.clear();

  sentence s;

  // Input: either the model's tokenizer or a named input format.
  unique_ptr<input_format> reader;
  if (input == "tokenizer") {
    reader.reset(m->new_tokenizer(tokenizer));
    if (!reader) return error.assign("The model does not have a tokenizer!"), false;
  } else {
    reader.reset(input_format::new_input_format(input));
    if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false;
  }
  reader->reset_document(document_id);

  unique_ptr<output_format> writer(output_format::new_output_format(output));
  if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false;

  // In immediate mode process the input block by block (as defined by the
  // reader); otherwise slurp the whole input first via getwhole.
  string block;
  while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) {
    reader->set_text(block);
    while (reader->next_sentence(s, error)) {
      if (tagger != NONE)
        if (!m->tag(s, tagger, error))
          return false;

      if (parser != NONE)
        if (!m->parse(s, parser, error))
          return false;

      writer->write_sentence(s, os);
    }
    // next_sentence returns false both on exhaustion and on error.
    if (!error.empty()) return false;
  }
  writer->finish_document(os);

  return true;
}
2902 
2903 /////////
2904 // File: morphodita/derivator/derivation_formatter.h
2905 /////////
2906 
2907 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
2908 //
2909 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
2910 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2911 //
2912 // This Source Code Form is subject to the terms of the Mozilla Public
2913 // License, v. 2.0. If a copy of the MPL was not distributed with this
2914 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2915 
2916 namespace morphodita {
2917 
// Interface for formatting morphological derivations into the lemma string.
class derivation_formatter {
 public:
  virtual ~derivation_formatter() {}

  // Perform the required derivation and store it directly in the lemma.
  virtual void format_derivation(string& lemma) const = 0;

  // Static factory methods. Except for the "none" formatter a derivator must
  // be supplied (nullptr is returned otherwise); the caller takes ownership
  // of the result.
  static derivation_formatter* new_none_derivation_formatter();
  static derivation_formatter* new_root_derivation_formatter(const derivator* derinet);
  static derivation_formatter* new_path_derivation_formatter(const derivator* derinet);
  static derivation_formatter* new_tree_derivation_formatter(const derivator* derinet);
  // String version of static factory method. Recognizes "none", "root",
  // "path" and "tree"; returns nullptr for any other name.
  static derivation_formatter* new_derivation_formatter(string_piece name, const derivator* derinet);
};
2933 
2934 } // namespace morphodita
2935 
2936 /////////
2937 // File: morphodita/derivator/derivation_formatter.cpp
2938 /////////
2939 
2940 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
2941 //
2942 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
2943 // Mathematics and Physics, Charles University in Prague, Czech Republic.
2944 //
2945 // This Source Code Form is subject to the terms of the Mozilla Public
2946 // License, v. 2.0. If a copy of the MPL was not distributed with this
2947 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
2948 
2949 namespace morphodita {
2950 
// Formatter performing no derivation: leaves the lemma untouched.
class none_derivation_formatter : public derivation_formatter {
  virtual void format_derivation(string& /*lemma*/) const override {}
};
2954 
// Factory for the no-op formatter; the caller takes ownership.
derivation_formatter* derivation_formatter::new_none_derivation_formatter() {
  return new none_derivation_formatter();
}
2958 
2959 class root_derivation_formatter : public derivation_formatter {
2960  public:
root_derivation_formatter(const derivator * derinet)2961   root_derivation_formatter(const derivator* derinet) : derinet(derinet) {}
2962 
format_derivation(string & lemma) const2963   virtual void format_derivation(string& lemma) const override {
2964     for (derivated_lemma parent; derinet->parent(lemma, parent); )
2965       lemma.assign(parent.lemma);
2966   }
2967 
2968  private:
2969   const derivator* derinet;
2970 };
2971 
// Factory for the root formatter; requires a derivator, otherwise nullptr.
derivation_formatter* derivation_formatter::new_root_derivation_formatter(const derivator* derinet) {
  return derinet ? new root_derivation_formatter(derinet) : nullptr;
}
2975 
// Formatter appending the whole derivation path (lemma, its parent, ...,
// up to the root), separated by single spaces.
class path_derivation_formatter : public derivation_formatter {
 public:
  path_derivation_formatter(const derivator* derinet) : derinet(derinet) {}

  virtual void format_derivation(string& lemma) const override {
    // `current` walks up the chain while `lemma` accumulates the path; the
    // swap in the loop header advances `current` without copying.
    string current(lemma);
    for (derivated_lemma parent; derinet->parent(current, parent); current.swap(parent.lemma))
      lemma.append(" ").append(parent.lemma);
  }

 private:
  const derivator* derinet;
};
2989 
// Factory for the path formatter; requires a derivator, otherwise nullptr.
derivation_formatter* derivation_formatter::new_path_derivation_formatter(const derivator* derinet) {
  return derinet ? new path_derivation_formatter(derinet) : nullptr;
}
2993 
// Formatter appending the whole derivation tree to the lemma: the tree is
// serialized by a pre-order walk from the derivation root, with each subtree
// delimited by spaces.
class tree_derivation_formatter : public derivation_formatter {
 public:
  tree_derivation_formatter(const derivator* derinet) : derinet(derinet) {}

  virtual void format_derivation(string& lemma) const override {
    // Climb to the derivation root, then serialize its tree after the lemma.
    string root(lemma);
    for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {}
    format_tree(root, lemma);
  }

  // Appends " root <serialized children>" followed by a space to `tree`.
  void format_tree(const string& root, string& tree) const {
    vector<derivated_lemma> children;

    tree.append(" ").append(root);
    if (derinet->children(root, children))
      for (auto&& child : children)
        format_tree(child.lemma, tree);
    tree.push_back(' ');
  }

 private:
  const derivator* derinet;
};
3017 
new_tree_derivation_formatter(const derivator * derinet)3018 derivation_formatter* derivation_formatter::new_tree_derivation_formatter(const derivator* derinet) {
3019   return derinet ? new tree_derivation_formatter(derinet) : nullptr;
3020 }
3021 
new_derivation_formatter(string_piece name,const derivator * derinet)3022 derivation_formatter* derivation_formatter::new_derivation_formatter(string_piece name, const derivator* derinet) {
3023   if (name == "none") return new_none_derivation_formatter();
3024   if (name == "root") return new_root_derivation_formatter(derinet);
3025   if (name == "path") return new_path_derivation_formatter(derinet);
3026   if (name == "tree") return new_tree_derivation_formatter(derinet);
3027   return nullptr;
3028 }
3029 
3030 } // namespace morphodita
3031 
3032 /////////
3033 // File: morphodita/morpho/small_stringops.h
3034 /////////
3035 
3036 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
3037 //
3038 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3039 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3040 //
3041 // This Source Code Form is subject to the terms of the Mozilla Public
3042 // License, v. 2.0. If a copy of the MPL was not distributed with this
3043 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3044 
3045 namespace morphodita {
3046 
3047 // Declarations
3048 inline bool small_memeq(const void* a, const void* b, size_t len);
3049 inline void small_memcpy(void* dest, const void* src, size_t len);
3050 
3051 // Definitions
// Byte-wise equality of two memory regions; intended for the very short
// lengths used by the morphology hash maps.
bool small_memeq(const void* a_void, const void* b_void, size_t len) {
  const char* a = (const char*)a_void;
  const char* b = (const char*)b_void;

  for (size_t i = 0; i < len; i++)
    if (a[i] != b[i])
      return false;
  return true;
}
3061 
// Byte-wise copy of a memory region; intended for the very short lengths
// used by the morphology hash maps. Regions must not overlap backwards.
void small_memcpy(void* dest_void, const void* src_void, size_t len) {
  char* dest = (char*)dest_void;
  const char* src = (const char*)src_void;

  for (size_t i = 0; i < len; i++)
    dest[i] = src[i];
}
3069 
3070 } // namespace morphodita
3071 
3072 /////////
3073 // File: trainer/training_failure.h
3074 /////////
3075 
3076 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
3077 //
3078 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
3079 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3080 //
3081 // This Source Code Form is subject to the terms of the Mozilla Public
3082 // License, v. 2.0. If a copy of the MPL was not distributed with this
3083 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3084 
3085 namespace utils {
3086 
// Exception signalling that model training cannot continue. The textual
// reason is streamed into message_collector by the training_failure macro
// before the exception object is constructed (the constructor body is not
// visible here -- presumably it consumes the collected message).
class training_error : public runtime_error {
 public:
  training_error();

  // Shared stream the training_failure macro writes into.
  // NOTE(review): static mutable state -- not safe if two trainings fail
  // concurrently; confirm training is single-threaded at failure points.
  static ostringstream message_collector;
};

// Stream `message` into the collector, then throw a training_error.
// Uses the comma operator so the whole thing is a single throw-expression.
#define training_failure(message) throw (training_error::message_collector << message, training_error())
3095 
3096 } // namespace utils
3097 
3098 /////////
3099 // File: utils/binary_encoder.h
3100 /////////
3101 
3102 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
3103 //
3104 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3105 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3106 //
3107 // This Source Code Form is subject to the terms of the Mozilla Public
3108 // License, v. 2.0. If a copy of the MPL was not distributed with this
3109 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3110 
3111 namespace utils {
3112 
3113 //
3114 // Declarations
3115 //
3116 
// Accumulates a binary serialization into an in-memory byte buffer.
// Multi-byte integers are written in host byte order -- little-endian,
// enforced by the static_assert in utils/common.h.
class binary_encoder {
 public:
  inline binary_encoder();

  inline void add_1B(unsigned val);        // append value as 1 byte (range-checked)
  inline void add_2B(unsigned val);        // append value as 2 bytes (range-checked)
  inline void add_4B(unsigned val);        // append value as 4 bytes (range-checked)
  inline void add_float(double val);       // append a 4-byte floating point value
  inline void add_double(double val);      // append an 8-byte floating point value
  inline void add_str(string_piece str);   // append a length-prefixed string
  inline void add_data(string_piece data); // append raw bytes, no length prefix
  template <class T> inline void add_data(const vector<T>& data);
  template <class T> inline void add_data(const T* data, size_t elements);

  // The serialized bytes produced so far (public by design; consumed by
  // compressor::save).
  vector<unsigned char> data;
};
3133 
3134 //
3135 // Definitions
3136 //
3137 
// Reserve a small initial capacity so tiny encodings avoid reallocation.
binary_encoder::binary_encoder() {
  data.reserve(16);
}
3141 
add_1B(unsigned val)3142 void binary_encoder::add_1B(unsigned val) {
3143   if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!");
3144   data.push_back(val);
3145 }
3146 
add_2B(unsigned val)3147 void binary_encoder::add_2B(unsigned val) {
3148   if (uint16_t(val) != val) training_failure("Should encode value " << val << " in one byte!");
3149   data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(uint16_t));
3150 }
3151 
add_4B(unsigned val)3152 void binary_encoder::add_4B(unsigned val) {
3153   if (uint32_t(val) != val) training_failure("Should encode value " << val << " in one byte!");
3154   data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(uint32_t));
3155 }
3156 
add_float(double val)3157 void binary_encoder::add_float(double val) {
3158   data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(float));
3159 }
3160 
add_double(double val)3161 void binary_encoder::add_double(double val) {
3162   data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(double));
3163 }
3164 
add_str(string_piece str)3165 void binary_encoder::add_str(string_piece str) {
3166   add_1B(str.len < 255 ? str.len : 255);
3167   if (!(str.len < 255)) add_4B(str.len);
3168   add_data(str);
3169 }
3170 
add_data(string_piece data)3171 void binary_encoder::add_data(string_piece data) {
3172   this->data.insert(this->data.end(), (const unsigned char*) data.str, (const unsigned char*) (data.str + data.len));
3173 }
3174 
3175 template <class T>
add_data(const vector<T> & data)3176 void binary_encoder::add_data(const vector<T>& data) {
3177   this->data.insert(this->data.end(), (const unsigned char*) data.data(), (const unsigned char*) (data.data() + data.size()));
3178 }
3179 
3180 template <class T>
add_data(const T * data,size_t elements)3181 void binary_encoder::add_data(const T* data, size_t elements) {
3182   this->data.insert(this->data.end(), (const unsigned char*) data, (const unsigned char*) (data + elements));
3183 }
3184 
3185 } // namespace utils
3186 
3187 /////////
3188 // File: utils/pointer_decoder.h
3189 /////////
3190 
3191 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
3192 //
3193 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3194 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3195 //
3196 // This Source Code Form is subject to the terms of the Mozilla Public
3197 // License, v. 2.0. If a copy of the MPL was not distributed with this
3198 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3199 
3200 namespace utils {
3201 
3202 //
3203 // Declarations
3204 //
3205 
// Sequential reader over a raw byte buffer. Holds a *reference* to the
// caller's pointer and advances it in place as values are consumed, so the
// caller always sees the current decoding position. No bounds checking is
// performed -- callers must guarantee the buffer is long enough.
class pointer_decoder {
 public:
  inline pointer_decoder(const unsigned char*& data);
  inline unsigned next_1B();          // consume one byte
  inline unsigned next_2B();          // consume two bytes (host little-endian)
  inline unsigned next_4B();          // consume four bytes (host little-endian)
  inline void next_str(string& str);  // consume a length-prefixed string
  template <class T> inline const T* next(unsigned elements);  // consume raw array, return its start

 private:
  const unsigned char*& data;  // alias of the caller's cursor
};
3218 
3219 //
3220 // Definitions
3221 //
3222 
// Bind the decoder to the caller's pointer; every next_* call advances it.
pointer_decoder::pointer_decoder(const unsigned char*& data) : data(data) {}
3224 
next_1B()3225 unsigned pointer_decoder::next_1B() {
3226   return *data++;
3227 }
3228 
next_2B()3229 unsigned pointer_decoder::next_2B() {
3230   unsigned result = *(uint16_t*)data;
3231   data += sizeof(uint16_t);
3232   return result;
3233 }
3234 
next_4B()3235 unsigned pointer_decoder::next_4B() {
3236   unsigned result = *(uint32_t*)data;
3237   data += sizeof(uint32_t);
3238   return result;
3239 }
3240 
next_str(string & str)3241 void pointer_decoder::next_str(string& str) {
3242   unsigned len = next_1B();
3243   if (len == 255) len = next_4B();
3244   str.assign(next<char>(len), len);
3245 }
3246 
next(unsigned elements)3247 template <class T> const T* pointer_decoder::next(unsigned elements) {
3248   const T* result = (const T*) data;
3249   data += sizeof(T) * elements;
3250   return result;
3251 }
3252 
3253 } // namespace utils
3254 
3255 /////////
3256 // File: morphodita/morpho/persistent_unordered_map.h
3257 /////////
3258 
3259 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
3260 //
3261 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3262 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3263 //
3264 // This Source Code Form is subject to the terms of the Mozilla Public
3265 // License, v. 2.0. If a copy of the MPL was not distributed with this
3266 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3267 
3268 namespace morphodita {
3269 
3270 // Declarations
// Declarations
// Read-only string -> byte-payload hash map with a compact, serializable
// representation. Keys are partitioned by length: hashes[len] holds every
// key of exactly len bytes (lengths 1 and 2 are addressed directly by the
// key's byte value(s), longer keys via hashing -- see fnv_hash::index).
// Built either from an unordered_map, or manually with the two-pass
// protocol: resize() per length, add() every entry, done_adding(), fill()
// every entry, done_filling().
class persistent_unordered_map {
 public:
  // Accessing function
  // Return a pointer just past the stored key (i.e. at the entry's payload)
  // or nullptr when absent; entry_size must consume one payload from a
  // pointer_decoder so non-matching entries can be skipped.
  template <class EntrySize>
  inline const unsigned char* at(const char* str, int len, EntrySize entry_size) const;

  // Variant of at() for fixed-size payloads of type T.
  template <class T>
  inline const T* at_typed(const char* str, int len) const;

  // Invoke entry_process(key_ptr, decoder) for every entry in the bucket
  // of str/len (may include hash collisions).
  template <class EntryProcess>
  inline void iter(const char* str, int len, EntryProcess entry_process) const;

  // Invoke entry_process(key_ptr, len, decoder) for every stored entry.
  template <class EntryProcess>
  inline void iter_all(EntryProcess entry_process) const;

  // Two helper functions accessing some internals
  inline int max_length() const;                        // number of length buckets
  inline const unsigned char* data_start(int len) const; // raw data of one bucket

  // Creation functions
  persistent_unordered_map() {}
  template <class Entry, class EntryEncode>
  persistent_unordered_map(const unordered_map<string, Entry>& map, double load_factor, EntryEncode entry_encode);
  template <class Entry, class EntryEncode>
  persistent_unordered_map(const unordered_map<string, Entry>& map, double load_factor, bool add_prefixes, bool add_suffixes, EntryEncode entry_encode);

  // Manual creation functions
  inline void resize(unsigned elems);                               // append next length bucket
  inline void add(const char* str, int str_len, int data_len);      // pass 1: account sizes
  inline void done_adding();                                        // sizes -> offsets
  inline unsigned char* fill(const char* str, int str_len, int data_len);  // pass 2: store key
  inline void done_filling();                                       // restore start offsets

  // Serialization
  inline void load(binary_decoder& data);
  inline void save(binary_encoder& enc);

 private:
  struct fnv_hash;
  vector<fnv_hash> hashes;  // one bucket per key length

  template <class Entry, class EntryEncode>
  void construct(const map<string, Entry>& map, double load_factor, EntryEncode entry_encode);
};
3315 
3316 // Definitions
// Definitions
// One length-bucket of the map. hash[i]..hash[i+1] delimit, in `data`, the
// concatenated (key bytes, payload) records of all keys falling into slot i;
// the extra trailing slot makes hash[index+1] always a valid end offset.
struct persistent_unordered_map::fnv_hash {
  // Empty table able to hold `num` elements: slot count is the next power
  // of two, mask its modulus mask; one extra end-sentinel slot is added.
  fnv_hash(unsigned num) {
    mask = 1;
    while (mask < num)
      mask <<= 1;
    hash.resize(mask + 1);
    mask--;
  }
  // Deserialize a bucket: 4B slot-array size, the slot offsets, 4B data
  // size, the raw data. mask = size - 2 restores slot_count - 1.
  fnv_hash(binary_decoder& data) {
    uint32_t size = data.next_4B();
    mask = size - 2;
    hash.resize(size);
    memcpy(hash.data(), data.next<uint32_t>(size), size * sizeof(uint32_t));

    size = data.next_4B();
    this->data.resize(size);
    memcpy(this->data.data(), data.next<char>(size), size);
  }

  // Slot index of a key. Keys of length 1 and 2 index their table directly
  // by byte value (their tables are sized 2^8 and 2^16, see resize());
  // longer keys use 32-bit FNV-1a (offset basis 2166136261, prime 16777619)
  // masked to the table size.
  inline uint32_t index(const char* data, int len) const {
    if (len <= 0) return 0;
    if (len == 1) return *(const uint8_t*)data;
    if (len == 2) return *(const uint16_t*)data;

    uint32_t hash = 2166136261U;
    while (len--)
      hash = (hash ^ unsigned(*data++)) * 16777619U;
    return hash & mask;
  }

  inline void save(binary_encoder& enc);

  unsigned mask;               // slot_count - 1 (slot_count is a power of two)
  vector<uint32_t> hash;       // per-slot start offsets into data + end sentinel
  vector<unsigned char> data;  // concatenated key+payload records
};
3353 
// Look up the key str/len; return a pointer to its payload (the bytes just
// past the stored key) or nullptr when absent. entry_size must consume
// exactly one entry's payload from the decoder -- it is how the scan skips
// over non-matching entries of unknown payload size.
template <class EntrySize>
const unsigned char* persistent_unordered_map::at(const char* str, int len, EntrySize entry_size) const {
  if (unsigned(len) >= hashes.size()) return nullptr;

  unsigned index = hashes[len].index(str, len);
  const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index];
  const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1];

  // Keys of length <= 2 index their slot directly, so a non-empty slot can
  // only hold this exact key -- no comparison needed.
  if (len <= 2)
    return data != end ? data + len : nullptr;

  // Collision bucket: linear scan of (key, payload) records.
  while (data < end) {
    if (small_memeq(str, data, len)) return data + len;
    data += len;
    pointer_decoder decoder(data);  // entry_size advances `data` past the payload
    entry_size(decoder);
  }

  return nullptr;
}
3374 
// Like at(), but for entries whose payload is a single fixed-size T; returns
// a typed pointer to the payload, or nullptr when the key is absent.
template <class T>
const T* persistent_unordered_map::at_typed(const char* str, int len) const {
  if (unsigned(len) >= hashes.size()) return nullptr;

  unsigned index = hashes[len].index(str, len);
  const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index];
  const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1];

  // Keys of length <= 2 index their slot directly -- at most one entry.
  if (len <= 2)
    return data != end ? (const T*)(data + len) : nullptr;

  // Fixed record size len + sizeof(T) makes skipping trivial.
  while (data < end) {
    if (small_memeq(str, data, len)) return (const T*)(data + len);
    data += len + sizeof(T);
  }

  return nullptr;
}
3393 
3394 template <class EntryProcess>
iter(const char * str,int len,EntryProcess entry_process) const3395 void persistent_unordered_map::iter(const char* str, int len, EntryProcess entry_process) const {
3396   if (unsigned(len) >= hashes.size()) return;
3397 
3398   unsigned index = hashes[len].index(str, len);
3399   const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index];
3400   const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1];
3401 
3402   while (data < end) {
3403     auto start = (const char*) data;
3404     data += len;
3405     pointer_decoder decoder(data);
3406     entry_process(start, decoder);
3407   }
3408 }
3409 
3410 template <class EntryProcess>
iter_all(EntryProcess entry_process) const3411 void persistent_unordered_map::iter_all(EntryProcess entry_process) const {
3412   for (unsigned len = 0; len < hashes.size(); len++) {
3413     const unsigned char* data = hashes[len].data.data();
3414     const unsigned char* end = data + hashes[len].data.size();
3415 
3416     while (data < end) {
3417       auto start = (const char*) data;
3418       data += len;
3419       pointer_decoder decoder(data);
3420       entry_process(start, len, decoder);
3421     }
3422   }
3423 }
3424 
max_length() const3425 int persistent_unordered_map::max_length() const {
3426   return hashes.size();
3427 }
3428 
data_start(int len) const3429 const unsigned char* persistent_unordered_map::data_start(int len) const {
3430   return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr;
3431 }
3432 
resize(unsigned elems)3433 void persistent_unordered_map::resize(unsigned elems) {
3434   if (hashes.size() == 0) hashes.emplace_back(1);
3435   else if (hashes.size() == 1) hashes.emplace_back(1<<8);
3436   else if (hashes.size() == 2) hashes.emplace_back(1<<16);
3437   else hashes.emplace_back(elems);
3438 }
3439 
add(const char * str,int str_len,int data_len)3440 void persistent_unordered_map::add(const char* str, int str_len, int data_len) {
3441   if (unsigned(str_len) < hashes.size())
3442     hashes[str_len].hash[hashes[str_len].index(str, str_len)] += str_len + data_len;
3443 }
3444 
done_adding()3445 void persistent_unordered_map::done_adding() {
3446   for (auto&& hash : hashes) {
3447     int total = 0;
3448     for (auto&& len : hash.hash) total += len, len = total - len;
3449     hash.data.resize(total);
3450   }
3451 }
3452 
fill(const char * str,int str_len,int data_len)3453 unsigned char* persistent_unordered_map::fill(const char* str, int str_len, int data_len) {
3454   if (unsigned(str_len) < hashes.size()) {
3455     unsigned index = hashes[str_len].index(str, str_len);
3456     unsigned offset = hashes[str_len].hash[index];
3457     small_memcpy(hashes[str_len].data.data() + offset, str, str_len);
3458     hashes[str_len].hash[index] += str_len + data_len;
3459     return hashes[str_len].data.data() + offset + str_len;
3460   }
3461   return nullptr;
3462 }
3463 
done_filling()3464 void persistent_unordered_map::done_filling() {
3465   for (auto&& hash : hashes)
3466     for (int i = hash.hash.size() - 1; i >= 0; i--)
3467       hash.hash[i] = i > 0 ? hash.hash[i-1] : 0;
3468 }
3469 
load(binary_decoder & data)3470 void persistent_unordered_map::load(binary_decoder& data) {
3471   unsigned sizes = data.next_1B();
3472 
3473   hashes.clear();
3474   for (unsigned i = 0; i < sizes; i++)
3475     hashes.emplace_back(data);
3476 }
3477 
3478 } // namespace morphodita
3479 
3480 /////////
3481 // File: morphodita/derivator/derivator_dictionary.h
3482 /////////
3483 
3484 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
3485 //
3486 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
3487 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3488 //
3489 // This Source Code Form is subject to the terms of the Mozilla Public
3490 // License, v. 2.0. If a copy of the MPL was not distributed with this
3491 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3492 
3493 namespace morphodita {
3494 
// Derivator backed by a serialized derivation dictionary: a
// persistent_unordered_map from lemma to (comment, parent reference,
// children references) records -- see derivator_dictionary.cpp for the
// exact byte layout.
class derivator_dictionary : public derivator {
 public:
  virtual bool parent(string_piece lemma, derivated_lemma& parent) const override;
  virtual bool children(string_piece lemma, vector<derivated_lemma>& children) const override;

  // Deserialize the dictionary from a compressed stream; returns success.
  bool load(istream& is);

 private:
  friend class morpho;
  // Optional owning morphological dictionary; when set, lookups use only
  // the lemma id prefix of the given lemma (see lemma_id_len).
  const morpho* dictionary;
  persistent_unordered_map derinet;  // lemma -> derivation record
};
3507 
3508 } // namespace morphodita
3509 
3510 /////////
3511 // File: utils/compressor.h
3512 /////////
3513 
3514 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
3515 //
3516 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3517 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3518 //
3519 // This Source Code Form is subject to the terms of the Mozilla Public
3520 // License, v. 2.0. If a copy of the MPL was not distributed with this
3521 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3522 
3523 namespace utils {
3524 
3525 class binary_decoder;
3526 class binary_encoder;
3527 
3528 class compressor {
3529  public:
3530   static bool load(istream& is, binary_decoder& data);
3531   static bool save(ostream& os, const binary_encoder& enc);
3532 };
3533 
3534 } // namespace utils
3535 
3536 /////////
3537 // File: morphodita/derivator/derivator_dictionary.cpp
3538 /////////
3539 
3540 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
3541 //
3542 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
3543 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3544 //
3545 // This Source Code Form is subject to the terms of the Mozilla Public
3546 // License, v. 2.0. If a copy of the MPL was not distributed with this
3547 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3548 
3549 namespace morphodita {
3550 
// Find the derivational parent of `lemma`. Returns false (with parent.lemma
// cleared) when the lemma is unknown or is itself a derivation root.
bool derivator_dictionary::parent(string_piece lemma, derivated_lemma& parent) const {
  // Restrict the lookup key to the lemma id when a dictionary is attached.
  if (dictionary) lemma.len = dictionary->lemma_id_len(lemma);

  // Record layout: 1B comment length, comment bytes, 4B encoded parent
  // reference, 2B children count, 4B per child reference.
  auto lemma_data = derinet.at(lemma.str, lemma.len, [](pointer_decoder& data) {
    data.next<char>(data.next_1B());
    data.next_4B();
    data.next<uint32_t>(data.next_2B());
  });
  if (lemma_data) {
    auto parent_encoded = *(uint32_t*)(lemma_data + 1 + *lemma_data);
    if (parent_encoded) {  // zero means "no parent" (root)
      // Reference encoding (see load()): low 8 bits = parent length,
      // high 24 bits = offset into the bucket of that key length.
      unsigned parent_len = parent_encoded & 0xFF;
      auto parent_data = derinet.data_start(parent_len) + (parent_encoded >> 8);
      parent.lemma.assign((const char*) parent_data, parent_len);
      // Append the parent's stored comment (1B length + bytes after its key).
      if (parent_data[parent_len])
        parent.lemma.append((const char*) parent_data + parent_len + 1, parent_data[parent_len]);
      return true;
    }
  }
  parent.lemma.clear();
  return false;
}
3573 
// Collect all direct derivational children of `lemma`. Returns false (with
// `children` cleared) when the lemma is unknown or has no children.
bool derivator_dictionary::children(string_piece lemma, vector<derivated_lemma>& children) const {
  // Restrict the lookup key to the lemma id when a dictionary is attached.
  if (dictionary) lemma.len = dictionary->lemma_id_len(lemma);

  // Record layout: 1B comment length, comment bytes, 4B parent reference,
  // 2B children count, 4B per child reference.
  auto lemma_data = derinet.at(lemma.str, lemma.len, [](pointer_decoder& data) {
    data.next<char>(data.next_1B());
    data.next_4B();
    data.next<uint32_t>(data.next_2B());
  });
  if (lemma_data) {
    auto children_len = *(uint16_t*)(lemma_data + 1 + *lemma_data + 4);
    auto children_encoded = (uint32_t*)(lemma_data + 1 + *lemma_data + 4 + 2);
    if (children_len) {
      children.resize(children_len);
      for (unsigned i = 0; i < children_len; i++) {
        // Child reference: low 8 bits = length, high 24 bits = offset into
        // the bucket of that key length (same encoding as the parent ref).
        unsigned child_len = children_encoded[i] & 0xFF;
        auto child_data = derinet.data_start(child_len) + (children_encoded[i] >> 8);
        children[i].lemma.assign((const char*) child_data, child_len);
        // Append the child's stored comment (1B length + bytes after its key).
        if (child_data[child_len])
          children[i].lemma.append((const char*) child_data + child_len + 1, child_data[child_len]);
      }
      return true;
    }
  }
  children.clear();
  return false;
}
3600 
// Deserialize the derivation dictionary. The stream holds the bucket sizes
// followed by one record per lemma (front-coded lemma strings plus the
// parent encoded as an edit script against the lemma); the records are
// decoded three times:
//   pass 1 -- account the size of every entry (derinet.add),
//   pass 2 -- store keys and zero-initialized payloads (derinet.fill),
//   pass 3 -- cross-link parents and children inside the payloads.
bool derivator_dictionary::load(istream& is) {
  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  try {
    // Number of length buckets, then the element count of each.
    for (int i = data.next_1B(); i > 0; i--)
      derinet.resize(data.next_4B());

    unsigned data_position = data.tell();  // records start here; re-read per pass
    vector<char> lemma, parent;
    for (int pass = 1; pass <= 3; pass++) {
      if (pass > 1) data.seek(data_position);

      lemma.clear();
      for (int i = data.next_4B(); i > 0; i--) {
        // Front coding: drop N trailing bytes of the previous lemma, then
        // append M new ones. (The inner `i` deliberately shadows the outer.)
        lemma.resize(lemma.size() - data.next_1B());
        for (int i = data.next_1B(); i > 0; i--)
          lemma.push_back(data.next_1B());

        unsigned char lemma_comment_len = data.next_1B();
        const char* lemma_comment = lemma_comment_len ? data.next<char>(lemma_comment_len) : nullptr;

        unsigned children = data.next_2B();

        // Decode the parent as an edit script on the current lemma:
        // optionally remove bytes at the start/end, optionally add bytes
        // at the start/end. The parent string is only materialized in pass 3.
        if (pass == 3) parent.clear();
        enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 };
        int operations = data.next_1B();  // 0 means "no parent" (root lemma)
        if (operations) {
          int remove_start = operations & REMOVE_START ? data.next_1B() : 0;
          int remove_end = operations & REMOVE_END ? data.next_1B() : 0;
          if (operations & ADD_START) {
            int add_start = data.next_1B();
            const char* str = data.next<char>(add_start);
            if (pass == 3) parent.assign(str, str + add_start);
          }
          if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end);
          if (operations & ADD_END) {
            int add_end = data.next_1B();
            const char* str = data.next<char>(add_end);
            if (pass == 3) parent.insert(parent.end(), str, str + add_end);
          }
        }

        if (pass == 1) {
          // Payload size: 1B comment len + comment + 4B parent ref +
          // 2B child count + 4B per child ref.
          derinet.add(lemma.data(), lemma.size(), 1 + lemma_comment_len + 4 + 2 + 4 * children);
        } else if (pass == 2) {
          // Store the comment, zero the parent reference; the *last* child
          // slot temporarily holds the fill cursor used in pass 3.
          unsigned char* lemma_data = derinet.fill(lemma.data(), lemma.size(), 1 + lemma_comment_len + 4 + 2 + 4 * children);
          *lemma_data++ = lemma_comment_len;
          while (lemma_comment_len--) *lemma_data++ = *lemma_comment++;
          *(uint32_t*)(lemma_data) = 0; lemma_data += sizeof(uint32_t);
          *(uint16_t*)(lemma_data) = children; lemma_data += sizeof(uint16_t);
          if (children) ((uint32_t*)lemma_data)[children - 1] = 0;
        } else if (pass == 3 && !parent.empty()) {
          // Locate both records and cross-link them with
          // (offset << 8 | length) references -- the decoding counterpart
          // lives in parent() and children().
          auto lemma_data = derinet.at(lemma.data(), lemma.size(), [](pointer_decoder& data) {
            data.next<char>(data.next_1B());
            data.next_4B();
            data.next<uint32_t>(data.next_2B());
          });
          auto parent_data = derinet.at(parent.data(), parent.size(), [](pointer_decoder& data) {
            data.next<char>(data.next_1B());
            data.next_4B();
            data.next<uint32_t>(data.next_2B());
          });
          assert(lemma_data && parent_data);

          unsigned parent_offset = parent_data - parent.size() - derinet.data_start(parent.size());
          assert(parent.size() < (1<<8) && parent_offset < (1<<24));
          *(uint32_t*)(lemma_data + 1 + *lemma_data) = (parent_offset << 8) | parent.size();

          unsigned lemma_offset = lemma_data - lemma.size() - derinet.data_start(lemma.size());
          assert(lemma.size() < (1<<8) && lemma_offset < (1<<24));
          // Append this lemma to its parent's child list: the last slot
          // stores the index of the next free slot until the list is full.
          auto children_len = *(uint16_t*)(parent_data + 1 + *parent_data + 4);
          auto children = (uint32_t*)(parent_data + 1 + *parent_data + 4 + 2);
          auto child_index = children[children_len-1];
          children[child_index] = (lemma_offset << 8) | lemma.size();
          if (child_index+1 < children_len) children[children_len-1]++;
        }
      }

      if (pass == 1)
        derinet.done_adding();
      if (pass == 2)
        derinet.done_filling();
    }
  } catch (binary_decoder_error&) {
    return false;
  }
  return true;
}
3690 
3691 } // namespace morphodita
3692 
3693 /////////
3694 // File: morphodita/morpho/casing_variants.h
3695 /////////
3696 
3697 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
3698 //
3699 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3700 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3701 //
3702 // This Source Code Form is subject to the terms of the Mozilla Public
3703 // License, v. 2.0. If a copy of the MPL was not distributed with this
3704 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3705 
3706 namespace morphodita {
3707 
// Produce lowercased lookup variants of `form`:
// - form_lc:   every letter lowercased (filled whenever form contains any
//              uppercase/titlecase character);
// - form_uclc: first character kept, the rest lowercased (filled only when
//              both the first and some later character are upper/titlecase,
//              i.e. only when it differs from both form and form_lc).
// The output strings are appended to; callers are expected to pass them empty.
inline void generate_casing_variants(string_piece form, string& form_uclc, string& form_lc) {
  using namespace unilib;

  // Detect uppercase+titlecase characters.
  bool first_Lut = false; // first character is uppercase or titlecase
  bool rest_has_Lut = false; // any character but first is uppercase or titlecase
  {
    string_piece form_tmp = form;
    first_Lut = unicode::category(utf8::decode(form_tmp.str, form_tmp.len)) & unicode::Lut;
    while (form_tmp.len && !rest_has_Lut)
      rest_has_Lut = unicode::category(utf8::decode(form_tmp.str, form_tmp.len)) & unicode::Lut;
  }

  // Generate all casing variants if needed (they are different than given form).
  // We only replace letters with their lowercase variants.
  // - form_uclc: first uppercase, rest lowercase
  // - form_lc: all lowercase

  if (first_Lut && !rest_has_Lut) { // common case allowing fast execution
    // Only the first character changes; copy the remainder verbatim.
    form_lc.reserve(form.len);
    string_piece form_tmp = form;
    utf8::append(form_lc, unicode::lowercase(utf8::decode(form_tmp.str, form_tmp.len)));
    form_lc.append(form_tmp.str, form_tmp.len);
  } else if (!first_Lut && rest_has_Lut) {
    // First character is not upper/titlecase, so form_uclc would coincide
    // with form_lc; only form_lc is produced.
    form_lc.reserve(form.len);
    utf8::map(unicode::lowercase, form.str, form.len, form_lc);
  } else if (first_Lut && rest_has_Lut) {
    // Both variants differ from form: build them in a single pass.
    form_lc.reserve(form.len);
    form_uclc.reserve(form.len);
    string_piece form_tmp = form;
    char32_t first = utf8::decode(form_tmp.str, form_tmp.len);
    utf8::append(form_lc, unicode::lowercase(first));
    utf8::append(form_uclc, first);
    while (form_tmp.len) {
      char32_t lowercase = unicode::lowercase(utf8::decode(form_tmp.str, form_tmp.len));
      utf8::append(form_lc, lowercase);
      utf8::append(form_uclc, lowercase);
    }
  }
}
3748 
3749 } // namespace morphodita
3750 
3751 /////////
3752 // File: morphodita/morpho/czech_lemma_addinfo.h
3753 /////////
3754 
3755 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
3756 //
3757 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3758 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3759 //
3760 // This Source Code Form is subject to the terms of the Mozilla Public
3761 // License, v. 2.0. If a copy of the MPL was not distributed with this
3762 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3763 
3764 namespace morphodita {
3765 
3766 // Declarations
// Declarations
// Parsing and formatting of the technical suffixes of Czech-style lemmas:
// an optional "-<number>" homonym id followed by additional info introduced
// by '`' or '_' (see raw_lemma_len/lemma_id_len for the exact boundaries).
struct czech_lemma_addinfo {
  inline static int raw_lemma_len(string_piece lemma);  // length without number and info
  inline static int lemma_id_len(string_piece lemma);   // length including the number
  inline static string format(const unsigned char* addinfo, int addinfo_len);  // back to text
  inline static bool generatable(const unsigned char* addinfo, int addinfo_len);

  // Parse lemma's suffix into `data`; see the definition for the semantics
  // of the return value and die_on_failure.
  inline int parse(string_piece lemma, bool die_on_failure = false);
  inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len);

  // Encoded addinfo: data[0] is the homonym number (255 = none), the
  // remaining bytes are the additional info text.
  vector<unsigned char> data;
};
3778 
3779 // Definitions
raw_lemma_len(string_piece lemma)3780 int czech_lemma_addinfo::raw_lemma_len(string_piece lemma) {
3781   // Lemma ends by a '-[0-9]', '`' or '_' on non-first position.
3782   for (unsigned len = 1; len < lemma.len; len++)
3783     if (lemma.str[len] == '`' || lemma.str[len] == '_' ||
3784         (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9'))
3785       return len;
3786   return lemma.len;
3787 }
3788 
lemma_id_len(string_piece lemma)3789 int czech_lemma_addinfo::lemma_id_len(string_piece lemma) {
3790   // Lemma ends by a '-[0-9]', '`' or '_' on non-first position.
3791   for (unsigned len = 1; len < lemma.len; len++) {
3792     if (lemma.str[len] == '`' || lemma.str[len] == '_')
3793       return len;
3794     if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') {
3795       len += 2;
3796       while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++;
3797       return len;
3798     }
3799   }
3800   return lemma.len;
3801 }
3802 
format(const unsigned char * addinfo,int addinfo_len)3803 string czech_lemma_addinfo::format(const unsigned char* addinfo, int addinfo_len) {
3804   string res;
3805 
3806   if (addinfo_len) {
3807     res.reserve(addinfo_len + 4);
3808     if (addinfo[0] != 255) {
3809       char num[5];
3810       sprintf(num, "-%u", addinfo[0]);
3811       res += num;
3812     }
3813     for (int i = 1; i < addinfo_len; i++)
3814       res += addinfo[i];
3815   }
3816 
3817   return res;
3818 }
3819 
generatable(const unsigned char * addinfo,int addinfo_len)3820 bool czech_lemma_addinfo::generatable(const unsigned char* addinfo, int addinfo_len) {
3821   for (int i = 1; i + 2 < addinfo_len; i++)
3822     if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x')
3823       return false;
3824 
3825   return true;
3826 }
3827 
// Parse the addinfo part of `lemma` into `data` and return the raw lemma
// length. data[0] receives the lemma id (255 when absent or invalid); the
// remaining bytes receive the rest of the lemma text verbatim.
// NOTE(review): strtol reads past lemma.len — assumes lemma.str is部 part of a
// NUL-terminated (or at least delimiter-terminated) buffer; TODO confirm with callers.
int czech_lemma_addinfo::parse(string_piece lemma, bool die_on_failure) {
  data.clear();

  const char* lemma_info = lemma.str + raw_lemma_len(lemma);
  if (lemma_info < lemma.str + lemma.len) {
    // 255 denotes "no explicit lemma id".
    int lemma_num = 255;
    const char* lemma_additional_info = lemma_info;

    if (*lemma_info == '-') {
      lemma_num = strtol(lemma_info + 1, (char**) &lemma_additional_info, 10);

      // Reject ids with no digits, ids not followed by end/'`'/'_', and ids
      // outside [0, 255).
      if (lemma_additional_info == lemma_info + 1 || (*lemma_additional_info != '\0' && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num < 0 || lemma_num >= 255) {
        if (die_on_failure)
          training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!");
        else
          lemma_num = 255;
      }
    }
    data.emplace_back(lemma_num);
    // Copy the rest of the lemma (the comment) byte by byte.
    while (lemma_additional_info < lemma.str + lemma.len)
      data.push_back(*(unsigned char*)lemma_additional_info++);

    // The serialized format stores the addinfo length in one byte.
    if (data.size() > 255) {
      if (die_on_failure)
        training_failure("Too long lemma info " << lemma_info << " in lemma " << lemma << '!');
      else
        data.resize(255);
    }
  }

  return lemma_info - lemma.str;
}
3860 
match_lemma_id(const unsigned char * other_addinfo,int other_addinfo_len)3861 bool czech_lemma_addinfo::match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len) {
3862   if (data.empty()) return true;
3863   if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false;
3864   return true;
3865 }
3866 
3867 } // namespace morphodita
3868 
3869 /////////
3870 // File: morphodita/morpho/tag_filter.h
3871 /////////
3872 
3873 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
3874 //
3875 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3876 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3877 //
3878 // This Source Code Form is subject to the terms of the Mozilla Public
3879 // License, v. 2.0. If a copy of the MPL was not distributed with this
3880 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3881 
3882 namespace morphodita {
3883 
3884 // Declarations
// Tag filter constructed from a wildcard pattern: individual tag positions
// may be restricted to a set of characters, optionally negated. An empty
// filter (no per-position constraints) matches every tag.
class tag_filter {
 public:
  tag_filter(const char* filter = nullptr);

  // Whether the given zero-terminated tag matches this filter.
  inline bool matches(const char* tag) const;

 private:
  // One per-position constraint: the tag character at `pos` must (or, with
  // negate set, must not) equal one of the `len` characters at `chars`.
  struct char_filter {
    char_filter(int pos, bool negate, const char* chars, int len) : pos(pos), negate(negate), chars(chars), len(len) {}

    int pos;
    bool negate;
    const char* chars;
    int len;
  };

  // NOTE(review): `chars` presumably points into `wildcard`, which owns the
  // pattern text — confirm in the constructor (defined elsewhere).
  string wildcard;
  std::vector<char_filter> filters;
};
3904 
3905 // Definitions
matches(const char * tag) const3906 inline bool tag_filter::matches(const char* tag) const {
3907   if (filters.empty()) return true;
3908 
3909   int tag_pos = 0;
3910   for (auto&& filter : filters) {
3911     while (tag_pos < filter.pos)
3912       if (!tag[tag_pos++])
3913         return true;
3914 
3915     // We assume filter.len >= 1.
3916     bool matched = (*filter.chars == tag[tag_pos]) ^ filter.negate;
3917     for (int i = 1; i < filter.len && !matched; i++)
3918       matched = (filter.chars[i] == tag[tag_pos]) ^ filter.negate;
3919     if (!matched) return false;
3920   }
3921   return true;
3922 }
3923 
3924 } // namespace morphodita
3925 
3926 /////////
3927 // File: morphodita/morpho/morpho_dictionary.h
3928 /////////
3929 
3930 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
3931 //
3932 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3933 // Mathematics and Physics, Charles University in Prague, Czech Republic.
3934 //
3935 // This Source Code Form is subject to the terms of the Mozilla Public
3936 // License, v. 2.0. If a copy of the MPL was not distributed with this
3937 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
3938 
3939 namespace morphodita {
3940 
3941 // Declarations
// Morphological dictionary template, parametrized by the lemma addinfo
// implementation (e.g. czech_lemma_addinfo). Lemmas, roots and suffixes are
// stored in persistent hash maps; analysis pairs a root with a compatible
// suffix, generation enumerates the forms of a lemma's inflection classes.
template <class LemmaAddinfo>
class morpho_dictionary {
 public:
  // Load the dictionary from its binary serialization.
  void load(binary_decoder& data);
  // Append all dictionary analyses of `form` to `lemmas`.
  void analyze(string_piece form, vector<tagged_lemma>& lemmas) const;
  // Generate all forms of `lemma` whose tag matches `filter`; returns
  // whether the lemma was found at all.
  bool generate(string_piece lemma, const tag_filter& filter, vector<tagged_lemma_forms>& lemmas_forms) const;
 private:
  persistent_unordered_map lemmas, roots, suffixes;

  // All known tags, indexed by the ids stored in the suffix records.
  vector<string> tags;
  // For every inflection class id, its (suffix, tag ids) pairs, filled from
  // the suffixes map during load().
  vector<vector<pair<string, vector<uint16_t>>>> classes;
};
3954 
3955 // Definitions
// Load the dictionary from its binary serialization: hash sizing headers,
// delta-encoded lemmas with their roots, the tag list, and the suffix map
// (from which the `classes` table is reconstructed).
template <class LemmaAddinfo>
void morpho_dictionary<LemmaAddinfo>::load(binary_decoder& data) {
  // Prepare lemmas and roots hashes: one byte gives the number of key
  // lengths, then a 4B entry count is read for each length.
  for (int i = data.next_1B(); i > 0; i--)
    lemmas.resize(data.next_4B());
  for (int i = data.next_1B(); i > 0; i--)
    roots.resize(data.next_4B());

  // Perform two pass over the lemmas and roots data, filling the hashes.
  // Pass 1 only sizes each record (add); pass 2 rewinds and writes the
  // payloads (fill), when the final offsets are known.

  vector<char> lemma(max(lemmas.max_length(), roots.max_length()));
  vector<char> root(max(lemmas.max_length(), roots.max_length()));
  unsigned data_position = data.tell();
  for (int pass = 1; pass <= 2; pass++) {
    if (pass > 1) data.seek(data_position);

    int lemma_len = 0;
    int root_len = 0;

    for (int i = data.next_4B(); i > 0; i--) {
      // Lemmas are delta-encoded against the previous lemma: drop N trailing
      // characters, then append M new ones.
      lemma_len -= data.next_1B();
      for (int i = data.next_1B(); i > 0; i--)
        lemma[lemma_len++] = data.next_1B();
      unsigned char lemma_info_len = data.next_1B();
      const char* lemma_info = lemma_info_len ? data.next<char>(lemma_info_len) : nullptr;
      unsigned lemma_roots = data.next_1B();

      unsigned char* lemma_data /* to keep compiler happy */ = nullptr;
      unsigned lemma_offset /* to keep compiler happy */ = 0;

      if (pass == 1) {
        // Record layout: info_len byte + info + roots count byte + one
        // (root_offset:4B, root_len:1B, class:2B) triple per root.
        lemmas.add(lemma.data(), lemma_len, 1 + lemma_info_len + 1 + lemma_roots * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t)));
      } else /*if (pass == 2)*/ {
        lemma_data = lemmas.fill(lemma.data(), lemma_len, 1 + lemma_info_len + 1 + lemma_roots * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t)));
        // Offset of this record within the data area of its key length;
        // stored in root records so analysis can find the lemma back.
        lemma_offset = lemma_data - lemma_len - lemmas.data_start(lemma_len);

        *lemma_data++ = lemma_info_len;
        if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len;
        *lemma_data++ = lemma_roots;
      }

      // Each root is derived from the lemma by edit operations on either end.
      small_memcpy(root.data(), lemma.data(), lemma_len); root_len = lemma_len;
      for (unsigned i = 0; i < lemma_roots; i++) {
        enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 };
        int operations = data.next_1B();
        if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
        if (operations & REMOVE_END) root_len -= data.next_1B();
        if (operations & ADD_START) {
          // Shift the root right by the prefix length, then read the prefix.
          int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to;
          for (int i = 0; i < to; i++) root[i] = data.next_1B();
        }
        if (operations & ADD_END)
          for (int len = data.next_1B(); len > 0; len--)
            root[root_len++] = data.next_1B();
        uint16_t clas = data.next_2B();

        if (pass == 1) { // for each root
          roots.add(root.data(), root_len, sizeof(uint16_t) + sizeof(uint32_t) + sizeof(uint8_t));
        } else /*if (pass == 2)*/ {
          unsigned char* root_data = roots.fill(root.data(), root_len, sizeof(uint16_t) + sizeof(uint32_t) + sizeof(uint8_t));
          unsigned root_offset = root_data - root_len - roots.data_start(root_len);

          // Root record: class id + back-reference to its lemma.
          *(uint16_t*)(root_data) = clas; root_data += sizeof(uint16_t);
          *(uint32_t*)(root_data) = lemma_offset; root_data += sizeof(uint32_t);
          *(uint8_t*)(root_data) = lemma_len; root_data += sizeof(uint8_t);
          assert(uint8_t(lemma_len) == lemma_len);

          // Lemma record: forward-reference to each of its roots.
          *(uint32_t*)(lemma_data) = root_offset; lemma_data += sizeof(uint32_t);
          *(uint8_t*)(lemma_data) = root_len; lemma_data += sizeof(uint8_t);
          *(uint16_t*)(lemma_data) = clas; lemma_data += sizeof(uint16_t);
          assert(uint8_t(root_len) == root_len);
        }
      }
    }

    if (pass == 1) { // after the whole pass
      lemmas.done_adding();
      roots.done_adding();
    } else /*if (pass == 2)*/ {
      lemmas.done_filling();
      roots.done_filling();
    }
  }

  // Load tags (2B count, then per tag a 1B length and the bytes).
  tags.resize(data.next_2B());
  for (auto&& tag : tags) {
    tag.resize(data.next_1B());
    for (unsigned i = 0; i < tag.size(); i++)
      tag[i] = data.next_1B();
  }

  // Load suffixes
  suffixes.load(data);

  // Fill classes from suffixes: each suffix record lists class ids, tag-index
  // boundaries, and the tag ids; invert it into per-class (suffix, tags).
  suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable {
    unsigned classes_len = data.next_2B();
    const uint16_t* classes_ptr = data.next<uint16_t>(classes_len);
    // Following volatile is needed to overcome vectorizer bug in g++ 6.3.0 (among other versions).
    volatile const uint16_t* indices_ptr = data.next<uint16_t>(classes_len + 1);
    // Total tag count = first index + sum of per-class (modular) deltas.
    uint32_t tags_len = indices_ptr[0];
    for (unsigned i = 0; i < classes_len; i++)
      tags_len += uint16_t(indices_ptr[i + 1] - indices_ptr[i]);
    const uint16_t* tags_ptr = data.next<uint16_t>(tags_len);

    string suffix_str(suffix, len);
    uint32_t index = indices_ptr[0], prev_index = 0;
    for (unsigned i = 0; i < classes_len; i++) {
      if (classes_ptr[i] >= classes.size()) classes.resize(classes_ptr[i] + 1);
      prev_index = index;
      index += uint16_t(indices_ptr[i + 1] - indices_ptr[i]);
      classes[classes_ptr[i]].emplace_back(suffix_str, vector<uint16_t>(tags_ptr + prev_index, tags_ptr + index));
    }
  });
}
4072 
// Analyze `form` by splitting it into every root+suffix combination present
// in the dictionary; append one tagged_lemma per (lemma, tag) found.
template <class LemmaAddinfo>
void morpho_dictionary<LemmaAddinfo>::analyze(string_piece form, vector<tagged_lemma>& lemmas) const {
  int max_suffix_len = suffixes.max_length();

  // Collect pointers to the suffix records of all suffixes of `form`,
  // starting from the empty suffix. Use a stack buffer for the common case.
  uint16_t* suff_stack[16]; vector<uint16_t*> suff_heap;
  uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data());
  int suff_len = 0;
  for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) {
    suff[suff_len] = (uint16_t*) suffixes.at(form.str + i, suff_len, [](pointer_decoder& data) {
      // Advance over the whole record: class ids + all-but-last tag indices
      // (2 * count entries), then the tags (the last index read is their count).
      data.next<uint16_t>(2 * data.next_2B());
      data.next<uint16_t>(data.next_2B());
    });
    if (!suff[suff_len]) break;
  }

  // Pair each stored suffix (longest first bound dropped via --suff_len) with
  // roots equal to the corresponding prefix of the form.
  for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
    if (*suff[suff_len]) {
      unsigned suff_classes = *suff[suff_len];
      uint16_t* suff_data = suff[suff_len] + 1;

      roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) {
        unsigned root_class = root_data.next_2B();
        unsigned lemma_offset = root_data.next_4B();
        unsigned lemma_len = root_data.next_1B();

        if (small_memeq(form.str, root, root_len)) {
          // The suffix's class ids are sorted; binary-search for this root's class.
          uint16_t* suffix_class_ptr = lower_bound(suff_data, suff_data + suff_classes, root_class);
          if (suffix_class_ptr < suff_data + suff_classes && *suffix_class_ptr == root_class) {
            // Reconstruct the lemma text; append formatted addinfo if present.
            const unsigned char* lemma_data = this->lemmas.data_start(lemma_len) + lemma_offset;
            string lemma((const char*)lemma_data, lemma_len);
            if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]);

            // Emit every tag of the matched class for this suffix.
            uint16_t* suff_tag_indices = suff_data + suff_classes;
            uint16_t* suff_tags = suff_tag_indices + suff_classes + 1;
            for (unsigned i = suff_tag_indices[suffix_class_ptr - suff_data]; i < suff_tag_indices[suffix_class_ptr - suff_data + 1]; i++)
              lemmas.emplace_back(lemma, tags[suff_tags[i]]);
          }
        }
      });
    }
}
4114 
// Generate forms of `lemma` whose tag passes `filter`. Returns true when the
// lemma exists in the dictionary (its addinfo matches and is generatable),
// even if no tag passed the filter.
template <class LemmaAddinfo>
bool morpho_dictionary<LemmaAddinfo>::generate(string_piece lemma, const tag_filter& filter, vector<tagged_lemma_forms>& lemmas_forms) const {
  LemmaAddinfo addinfo;
  int raw_lemma_len = addinfo.parse(lemma);
  bool matched_lemma = false;

  lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) {
    // Decode the lemma record written by load(): info, then per-root triples.
    unsigned lemma_info_len = data.next_1B();
    const auto* lemma_info = data.next<unsigned char>(lemma_info_len);
    unsigned lemma_roots_len = data.next_1B();
    auto* lemma_roots_ptr = data.next<unsigned char>(lemma_roots_len * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t)));

    if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
      matched_lemma = true;

      // The output entry is created lazily, on the first tag passing the filter.
      vector<tagged_form>* forms = nullptr;
      pointer_decoder lemma_roots(lemma_roots_ptr);
      for (unsigned i = 0; i < lemma_roots_len; i++) {
        unsigned root_offset = lemma_roots.next_4B();
        unsigned root_len = lemma_roots.next_1B();
        unsigned clas = lemma_roots.next_2B();

        // Enumerate every (suffix, tags) pair of this root's inflection class.
        const unsigned char* root_data = roots.data_start(root_len) + root_offset;
        for (auto&& suffix : classes[clas]) {
          string root_with_suffix;
          for (auto&& tag : suffix.second)
            if (filter.matches(tags[tag].c_str())) {
              if (!forms) {
                lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len));
                forms = &lemmas_forms.back().forms;
              }

              // Build root+suffix once per suffix, only when actually needed.
              if (root_with_suffix.empty() && root_len + suffix.first.size()) {
                root_with_suffix.reserve(root_len + suffix.first.size());
                root_with_suffix.assign((const char*)root_data, root_len);
                root_with_suffix.append(suffix.first);
              }

              forms->emplace_back(root_with_suffix, tags[tag]);
            }
        }
      }
    }
  });

  return matched_lemma;
}
4162 
4163 } // namespace morphodita
4164 
4165 /////////
4166 // File: morphodita/morpho/morpho_prefix_guesser.h
4167 /////////
4168 
4169 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4170 //
4171 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4172 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4173 //
4174 // This Source Code Form is subject to the terms of the Mozilla Public
4175 // License, v. 2.0. If a copy of the MPL was not distributed with this
4176 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4177 
4178 namespace morphodita {
4179 
4180 // Declarations
// Prefix guesser: analyzes forms composed of known prefixes attached to
// words found in the wrapped dictionary, constraining the resulting tags
// with per-prefix tag filters.
template <class MorphoDictionary>
class morpho_prefix_guesser {
 public:
  morpho_prefix_guesser(const MorphoDictionary& dictionary) : dictionary(dictionary) {}

  // Load tag filters and prefix maps from the binary serialization.
  void load(binary_decoder& data);
  // Append analyses of prefixed variants of `form` to `lemmas`.
  void analyze(string_piece form, vector<tagged_lemma>& lemmas);
  // Form generation is not implemented; always returns false.
  bool generate(string_piece lemma, const tag_filter& filter, vector<tagged_lemma_forms>& lemmas_forms);

 private:
  const MorphoDictionary& dictionary;
  vector<tag_filter> tag_filters;
  // Values are bitmasks over tag_filters indices: which filters permit the prefix.
  persistent_unordered_map prefixes_initial, prefixes_middle;
};
4195 
4196 // Definitions
4197 template <class MorphoDictionary>
load(binary_decoder & data)4198 void morpho_prefix_guesser<MorphoDictionary>::load(binary_decoder& data) {
4199   // Load and construct tag filters
4200   for (unsigned tag_filters_len = data.next_1B(); tag_filters_len; tag_filters_len--) {
4201     unsigned tag_filter_len = data.next_1B();
4202     string tag_filter(data.next<char>(tag_filter_len), tag_filter_len);
4203 
4204     tag_filters.emplace_back(tag_filter.c_str());
4205   }
4206 
4207   // Load prefixes
4208   prefixes_initial.load(data);
4209   prefixes_middle.load(data);
4210 }
4211 
// Analyze can return non-unique lemma-tag pairs.
// Strips an initial prefix plus any chain of middle prefixes off `form`,
// analyzes the remainder with the dictionary, and keeps only analyses whose
// tag passes at least one filter enabled by the traversed prefixes.
template <class MorphoDictionary>
void morpho_prefix_guesser<MorphoDictionary>::analyze(string_piece form, vector<tagged_lemma>& lemmas) {
  if (!form.len) return;

  vector<char> form_tmp;
  // middle_masks[i]: filter bitmask valid when prefixes cover form[0..i).
  vector<unsigned> middle_masks;
  middle_masks.reserve(form.len);

  for (unsigned initial = 0; initial < form.len; initial++) {
    // Match the initial prefix.
    unsigned initial_mask = (1<<tag_filters.size()) - 1; // full mask for empty initial prefix
    if (initial) {
      auto found = prefixes_initial.at_typed<uint32_t>(form.str, initial);
      if (!found) break;
      initial_mask = *found;
    }

    // If we have found an initial prefix (including the empty one), match middle prefixes.
    if (initial_mask) {
      middle_masks.resize(initial);
      middle_masks.emplace_back(initial_mask);
      for (unsigned middle = initial; middle < middle_masks.size(); middle++) {
        if (!middle_masks[middle]) continue;
        // Try matching middle prefixes from current index.
        for (unsigned i = middle + 1; i < form.len; i++) {
          auto found = prefixes_middle.at_typed<uint32_t>(form.str + middle, i - middle);
          if (!found) break;
          if (*found) {
            // Propagate the intersection of masks along the prefix chain.
            if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1);
            middle_masks[i] |= middle_masks[middle] & *found;
          }
        }

        // Try matching word forms if at least one middle prefix was found.
        if (middle > initial && middle < form.len ) {
          if (initial) {
            // Build form with the middle prefixes removed but the initial
            // prefix kept, by copying it just before the remainder.
            if (form_tmp.empty()) form_tmp.assign(form.str, form.str + form.len);
            small_memcpy(form_tmp.data() + middle - initial, form.str, initial);
          }
          unsigned lemmas_ori_size = lemmas.size();
          dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas);
          // Compact the new analyses in place, keeping only those passing an
          // enabled filter, and re-attach the stripped middle prefix text.
          unsigned lemmas_new_size = lemmas_ori_size;
          for (unsigned i = lemmas_ori_size; i < lemmas.size(); i++) {
            for (unsigned filter = 0; filter < tag_filters.size(); filter++)
              if ((middle_masks[middle] & (1<<filter)) && tag_filters[filter].matches(lemmas[i].tag.c_str())) {
                if (i == lemmas_new_size) {
                  lemmas[lemmas_new_size].lemma.insert(0, form.str + initial, middle - initial);
                } else {
                  lemmas[lemmas_new_size].lemma.reserve(lemmas[i].lemma.size() + middle - initial);
                  lemmas[lemmas_new_size].lemma.assign(form.str + initial, middle - initial);
                  lemmas[lemmas_new_size].lemma.append(lemmas[i].lemma);
                  lemmas[lemmas_new_size].tag = lemmas[i].tag;
                }
                lemmas_new_size++;
                break;
              }
          }
          if (lemmas_new_size < lemmas.size()) lemmas.erase(lemmas.begin() + lemmas_new_size, lemmas.end());
        }
      }
    }
  }
}
4276 
// Form generation through the prefix guesser is intentionally unsupported:
// callers get false and must rely on the dictionary alone.
template <class MorphoDictionary>
bool morpho_prefix_guesser<MorphoDictionary>::generate(string_piece /*lemma*/, const tag_filter& /*filter*/, vector<tagged_lemma_forms>& /*lemmas_forms*/) {
  // Not implemented yet. Is it actually needed?
  return false;
}
4282 } // namespace morphodita
4283 
4284 /////////
4285 // File: morphodita/morpho/morpho_statistical_guesser.h
4286 /////////
4287 
4288 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4289 //
4290 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4291 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4292 //
4293 // This Source Code Form is subject to the terms of the Mozilla Public
4294 // License, v. 2.0. If a copy of the MPL was not distributed with this
4295 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4296 
4297 namespace morphodita {
4298 
// Statistical guesser for forms not covered by the dictionary. Produces
// tagged lemmas from a trained rule table; the rule semantics are in the
// implementation file, not visible in this header.
class morpho_statistical_guesser {
 public:
  // Load the guesser model from its binary serialization.
  void load(binary_decoder& data);
  // Identifiers of rules applied during an analysis, when the caller asks
  // for them via the `used` parameter.
  typedef vector<string> used_rules;
  // Append guessed tagged lemmas for `form`; when `used` is non-null,
  // applied rules are recorded there (see the implementation for details).
  void analyze(string_piece form, vector<tagged_lemma>& lemmas, used_rules* used);

 private:
  vector<string> tags;
  // NOTE(review): presumably an index into `tags` used as a fallback —
  // confirm in the implementation file.
  unsigned default_tag;
  persistent_unordered_map rules;
};
4310 
4311 } // namespace morphodita
4312 
4313 /////////
4314 // File: morphodita/tokenizer/unicode_tokenizer.h
4315 /////////
4316 
4317 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4318 //
4319 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4320 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4321 //
4322 // This Source Code Form is subject to the terms of the Mozilla Public
4323 // License, v. 2.0. If a copy of the MPL was not distributed with this
4324 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4325 
4326 namespace morphodita {
4327 
// Base class for tokenizers operating on decoded Unicode characters. Keeps
// the whole input as a vector of char_info (code point, category, UTF-8
// position) and provides URL/e-mail recognition and sentence-break helpers.
class unicode_tokenizer : public tokenizer {
 public:
  // Latest supported version of the URL/e-mail recognizer.
  enum { URL_EMAIL_LATEST = 2 };
  unicode_tokenizer(unsigned url_email_tokenizer);

  virtual void set_text(string_piece text, bool make_copy = false) override;
  virtual bool next_sentence(vector<string_piece>* forms, vector<token_range>* tokens) override;

  // Implemented by subclasses: produce the tokens of the next sentence.
  virtual bool next_sentence(vector<token_range>& tokens) = 0;

 protected:
  // One decoded input character: code point, its Unicode category, and a
  // pointer to its first byte in the original UTF-8 text.
  struct char_info {
    char32_t chr;
    unilib::unicode::category_t cat;
    const char* str;

    char_info(char32_t chr, const char* str) : chr(chr), cat(unilib::unicode::category(chr)), str(str) {}
  };
  vector<char_info> chars;
  // Index into `chars` of the current tokenization position.
  size_t current;

  bool tokenize_url_email(vector<token_range>& tokens);
  bool emergency_sentence_split(const vector<token_range>& tokens);
  bool is_eos(const vector<token_range>& tokens, char32_t eos_chr, const unordered_set<string>* abbreviations);

 private:
  unsigned url_email_tokenizer;
  // Holds a copy of the input when set_text(make_copy=true) is used.
  string text_buffer;
  vector<token_range> tokens_buffer;
  string eos_buffer;
};
4359 
4360 } // namespace morphodita
4361 
4362 /////////
4363 // File: morphodita/tokenizer/ragel_tokenizer.h
4364 /////////
4365 
4366 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4367 //
4368 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4369 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4370 //
4371 // This Source Code Form is subject to the terms of the Mozilla Public
4372 // License, v. 2.0. If a copy of the MPL was not distributed with this
4373 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4374 
4375 namespace morphodita {
4376 
// Base class for tokenizers implemented as Ragel-generated automata. Maps
// Unicode characters onto a compact 8-bit alphabet the automata run over.
class ragel_tokenizer : public unicode_tokenizer {
 public:
  ragel_tokenizer(unsigned url_email_tokenizer);

 protected:
  // 8-bit automaton symbol for the given character (see definition below).
  static inline uint8_t ragel_char(const char_info& chr);

 private:
  // Lazily built character→symbol table, guarded by ragel_map_flag.
  static void initialize_ragel_map();
  static vector<uint8_t> ragel_map;
  static atomic_flag ragel_map_flag;
  static void ragel_map_add(char32_t chr, uint8_t mapping);

  friend class unicode_tokenizer;
  // Ragel-generated URL/e-mail recognizer shared by subclasses.
  static bool ragel_url_email(unsigned version, const vector<char_info>& chars, size_t& current_char, vector<token_range>& tokens);
};
4393 
ragel_char(const char_info & chr)4394 uint8_t ragel_tokenizer::ragel_char(const char_info& chr) {
4395   return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
4396 }
4397 
4398 } // namespace morphodita
4399 
4400 /////////
4401 // File: morphodita/tokenizer/czech_tokenizer.h
4402 /////////
4403 
4404 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4405 //
4406 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4407 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4408 //
4409 // This Source Code Form is subject to the terms of the Mozilla Public
4410 // License, v. 2.0. If a copy of the MPL was not distributed with this
4411 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4412 
4413 namespace morphodita {
4414 
// Ragel-based tokenizer for Czech and Slovak, with language-specific
// abbreviation tables and optional morphology-assisted hyphen merging.
class czech_tokenizer : public ragel_tokenizer {
 public:
  enum tokenizer_language { CZECH = 0, SLOVAK = 1 };
  // Latest supported tokenizer version.
  enum { LATEST = 2 };
  // The morphology `m`, when given, is consulted by merge_hyphenated.
  czech_tokenizer(tokenizer_language language, unsigned version, const morpho* m = nullptr);

  virtual bool next_sentence(vector<token_range>& tokens) override;

 private:
  const morpho* m;
  // Points at the abbreviation set of the selected language.
  const unordered_set<string>* abbreviations;
  // Scratch buffer reused across merge_hyphenated calls.
  vector<tagged_lemma> lemmas;

  void merge_hyphenated(vector<token_range>& tokens);

  static const unordered_set<string> abbreviations_czech;
  static const unordered_set<string> abbreviations_slovak;
};
4433 
4434 } // namespace morphodita
4435 
4436 /////////
4437 // File: morphodita/morpho/czech_morpho.h
4438 /////////
4439 
4440 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4441 //
4442 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4443 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4444 //
4445 // This Source Code Form is subject to the terms of the Mozilla Public
4446 // License, v. 2.0. If a copy of the MPL was not distributed with this
4447 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4448 
4449 namespace morphodita {
4450 
// Czech (and Slovak) morphology: a dictionary plus optional prefix and
// statistical guessers, with special handling of numbers and punctuation.
class czech_morpho : public morpho {
 public:
  using morpho_language = czech_tokenizer::tokenizer_language;

  czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {}

  virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector<tagged_lemma>& lemmas) const override;
  virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector<tagged_lemma_forms>& forms) const override;
  virtual int raw_lemma_len(string_piece lemma) const override;
  virtual int lemma_id_len(string_piece lemma) const override;
  virtual int raw_form_len(string_piece form) const override;
  virtual tokenizer* new_tokenizer() const override;

  // Load the model from a stream; returns false on malformed input.
  bool load(istream& is);
 private:
  // Handles numbers and punctuation, which are not in the dictionary.
  inline void analyze_special(string_piece form, vector<tagged_lemma>& lemmas) const;

  morpho_language language;
  unsigned version;
  morpho_dictionary<czech_lemma_addinfo> dictionary;
  // Present only when the serialized model contains the respective guesser.
  unique_ptr<morpho_prefix_guesser<decltype(dictionary)>> prefix_guesser;
  unique_ptr<morpho_statistical_guesser> statistical_guesser;

  // Default tags, truncated to the model's tag length during load().
  string unknown_tag = "X@-------------";
  string number_tag = "C=-------------";
  string punctuation_tag = "Z:-------------";
};
4478 
4479 } // namespace morphodita
4480 
4481 /////////
4482 // File: morphodita/morpho/czech_morpho.cpp
4483 /////////
4484 
4485 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4486 //
4487 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4488 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4489 //
4490 // This Source Code Form is subject to the terms of the Mozilla Public
4491 // License, v. 2.0. If a copy of the MPL was not distributed with this
4492 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4493 
4494 namespace morphodita {
4495 
load(istream & is)4496 bool czech_morpho::load(istream& is) {
4497   binary_decoder data;
4498   if (!compressor::load(is, data)) return false;
4499 
4500   try {
4501     // Load tag length
4502     unsigned tag_length = data.next_1B();
4503     if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length);
4504     if (tag_length < number_tag.size()) number_tag.erase(tag_length);
4505     if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length);
4506 
4507     // Load dictionary
4508     dictionary.load(data);
4509 
4510     // Optionally prefix guesser if present
4511     prefix_guesser.reset();
4512     if (data.next_1B()) {
4513       prefix_guesser.reset(new morpho_prefix_guesser<decltype(dictionary)>(dictionary));
4514       prefix_guesser->load(data);
4515     }
4516 
4517     // Optionally statistical guesser if present
4518     statistical_guesser.reset();
4519     if (data.next_1B()) {
4520       statistical_guesser.reset(new morpho_statistical_guesser());
4521       statistical_guesser->load(data);
4522     }
4523   } catch (binary_decoder_error&) {
4524     return false;
4525   }
4526 
4527   return data.is_end();
4528 }
4529 
// Analyzes the given form, filling lemmas with tagged lemma candidates.
// Returns NO_GUESSER when the dictionary or the number/punctuation analysis
// produced the result, GUESSER when a guesser did, and -1 for unknown forms
// (in which case a single lemma with unknown_tag is emitted).
int czech_morpho::analyze(string_piece form, guesser_mode guesser, vector<tagged_lemma>& lemmas) const {
  lemmas.clear();

  if (form.len) {
    // Generate all casing variants if needed (they are different than given form).
    string form_uclc; // first uppercase, rest lowercase
    string form_lc;   // all lowercase
    generate_casing_variants(form, form_uclc, form_lc);

    // Start by analysing using the dictionary and all casing variants.
    dictionary.analyze(form, lemmas);
    if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
    if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
    if (!lemmas.empty()) return NO_GUESSER;

    // Then call analyze_special to handle numbers and punctuation.
    analyze_special(form, lemmas);
    if (!lemmas.empty()) return NO_GUESSER;

    // For the prefix guesser, use only form_lc.
    if (guesser == GUESSER && prefix_guesser)
      prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas);
    bool prefix_guesser_guesses = !lemmas.empty();

    // For the statistical guesser, use all casing variants.
    if (guesser == GUESSER && statistical_guesser) {
      if (form_uclc.empty() && form_lc.empty())
        statistical_guesser->analyze(form, lemmas, nullptr);
      else {
        // The same used_rules instance is passed to all casing variants —
        // presumably to suppress applying one rule to several variants of
        // the same form; confirm against morpho_statistical_guesser.
        morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3);
        statistical_guesser->analyze(form, lemmas, &used_rules);
        if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules);
        if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules);
      }
    }

    // Make sure results are unique lemma-tag pairs. Statistical guesser produces
    // unique lemma-tag pairs, but prefix guesser does not.
    if (prefix_guesser_guesses) {
      sort(lemmas.begin(), lemmas.end(), [](const tagged_lemma& a, const tagged_lemma& b) {
        int lemma_compare = a.lemma.compare(b.lemma);
        return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag);
      });
      auto lemmas_end = unique(lemmas.begin(), lemmas.end(), [](const tagged_lemma& a, const tagged_lemma& b) {
        return a.lemma == b.lemma && a.tag == b.tag;
      });
      if (lemmas_end != lemmas.end()) lemmas.erase(lemmas_end, lemmas.end());
    }

    if (!lemmas.empty()) return GUESSER;
  }

  // Nothing matched: report the form itself with the unknown tag.
  lemmas.emplace_back(string(form.str, form.len), unknown_tag);
  return -1;
}
4585 
generate(string_piece lemma,const char * tag_wildcard,morpho::guesser_mode guesser,vector<tagged_lemma_forms> & forms) const4586 int czech_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode guesser, vector<tagged_lemma_forms>& forms) const {
4587   forms.clear();
4588 
4589   tag_filter filter(tag_wildcard);
4590 
4591   if (lemma.len) {
4592     if (dictionary.generate(lemma, filter, forms))
4593       return NO_GUESSER;
4594 
4595     if (guesser == GUESSER && prefix_guesser)
4596       if (prefix_guesser->generate(lemma, filter, forms))
4597         return GUESSER;
4598   }
4599 
4600   return -1;
4601 }
4602 
// Returns the length of the raw lemma, i.e. without the lemma addinfo
// (delegates to czech_lemma_addinfo).
int czech_morpho::raw_lemma_len(string_piece lemma) const {
  return czech_lemma_addinfo::raw_lemma_len(lemma);
}
4606 
// Returns the length of the lemma id prefix of the lemma
// (delegates to czech_lemma_addinfo).
int czech_morpho::lemma_id_len(string_piece lemma) const {
  return czech_lemma_addinfo::lemma_id_len(lemma);
}
4610 
// The whole form is significant; no suffix is ever stripped.
int czech_morpho::raw_form_len(string_piece form) const {
  return form.len;
}
4614 
// Creates a Czech tokenizer for this model's language and version.
// The caller takes ownership of the returned tokenizer.
tokenizer* czech_morpho::new_tokenizer() const {
  return new czech_tokenizer(language, version, this);
}
4618 
// What characters are considered punctuation except for the ones in unicode Punctuation category.
// Indexed directly by Unicode codepoint (one bool per codepoint); the use
// site in analyze_special bounds-checks the index with sizeof, which works
// because each element is one byte. The last marked entry is the caron.
static bool punctuation_additional[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*$*/,
  0,0,0,0,0,0,1/*+*/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*<*/,1/*=*/,1/*>*/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,1/*^*/,0,1/*`*/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*|*/,0,1/*~*/,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*caron*/};

// What characters of unicode Punctuation category are not considered punctuation.
// Also indexed directly by codepoint; the only marked exception is the
// paragraph (section) sign.
static bool punctuation_exceptions[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,1/*paragraph*/};
4639 
void czech_morpho::analyze_special(string_piece form, vector<tagged_lemma>& lemmas) const {
  using namespace unilib;

  // Analyzer for numbers and punctuation.
  // Number is anything matching [+-]? is_Pn* ([.,] is_Pn*)? ([Ee] [+-]? is_Pn+)? for at least one is_Pn* nonempty.
  // Punctuation is any form beginning with either unicode punctuation or punctuation_exceptions character.
  // Beware that numbers takes precedence, so - is punctuation, -3 is number, -. is punctuation, -.3 is number.
  if (!form.len) return;

  // Keep the original form; utf8::decode below consumes from the local copy,
  // advancing form.str and decreasing form.len in place.
  string_piece form_ori = form;
  char32_t first = utf8::decode(form.str, form.len);

  // Try matching a number.
  char32_t codepoint = first;
  bool any_digit = false;
  if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len);
  while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len);
  // Decimal separator: '.' only when further input follows, ',' always.
  if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len);
  while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len);
  if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
    // Exponent part: digits must follow for the match to stand (any_digit
    // is reset and only re-established by exponent digits).
    codepoint = utf8::decode(form.str, form.len);
    if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len);
    any_digit = false;
    while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len);
  }

  if (any_digit && !form.len && (!codepoint || codepoint == '.')) {
    // The whole form matched as a number; a trailing '.' is not part of the lemma.
    lemmas.emplace_back(string(form_ori.str, form_ori.len - (codepoint == '.')), number_tag);
  } else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) ||
             ((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first])))
    lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag);
}
4672 
4673 } // namespace morphodita
4674 
4675 /////////
4676 // File: morphodita/morpho/english_lemma_addinfo.h
4677 /////////
4678 
4679 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4680 //
4681 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4682 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4683 //
4684 // This Source Code Form is subject to the terms of the Mozilla Public
4685 // License, v. 2.0. If a copy of the MPL was not distributed with this
4686 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4687 
4688 namespace morphodita {
4689 
4690 // Declarations
// Additional lemma information ("addinfo") for English lemmas. A raw lemma
// may be followed by a '^' or '+' marker (see raw_lemma_len below); the
// bytes after the raw lemma are stored verbatim in data.
struct english_lemma_addinfo {
  inline static int raw_lemma_len(string_piece lemma);
  inline static int lemma_id_len(string_piece lemma);
  inline static string format(const unsigned char* addinfo, int addinfo_len);
  inline static bool generatable(const unsigned char* addinfo, int addinfo_len);

  // Parses the addinfo from lemma into data; returns the raw lemma length.
  inline int parse(string_piece lemma, bool die_on_failure = false);
  // Checks whether this addinfo is compatible with other_addinfo.
  inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len);

  // Raw bytes of the addinfo (everything after the raw lemma).
  vector<unsigned char> data;
};
4702 
4703 // Definitions
raw_lemma_len(string_piece lemma)4704 int english_lemma_addinfo::raw_lemma_len(string_piece lemma) {
4705   // Lemma ends either by
4706   // - '^' on non-first position followed by nothing or [A-Za-z][-A-Za-z]*
4707   // - '+' on non-first position followed by nothing
4708   for (unsigned len = 1; len < lemma.len; len++) {
4709     if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+'))
4710       return len;
4711     if (len + 1 < lemma.len && lemma.str[len] == '^') {
4712       bool ok = true;
4713       for (unsigned i = len + 1; ok && i < lemma.len; i++)
4714         ok &= (lemma.str[i] >= 'A' && lemma.str[i] <= 'Z') ||
4715             (lemma.str[i] >= 'a' && lemma.str[i] <= 'z') ||
4716             (i > len + 1 && lemma.str[i] == '-');
4717       if (ok) return len;
4718     }
4719   }
4720   return lemma.len;
4721 }
4722 
// English lemmas carry no lemma comments, so the lemma id spans the whole lemma.
int english_lemma_addinfo::lemma_id_len(string_piece lemma) {
  // No lemma comments.
  return lemma.len;
}
4727 
format(const unsigned char * addinfo,int addinfo_len)4728 string english_lemma_addinfo::format(const unsigned char* addinfo, int addinfo_len) {
4729   return string((const char*) addinfo, addinfo_len);
4730 }
4731 
// Every English lemma can be used for generation regardless of its addinfo.
bool english_lemma_addinfo::generatable(const unsigned char* /*addinfo*/, int /*addinfo_len*/) {
  return true;
}
4735 
parse(string_piece lemma,bool)4736 int english_lemma_addinfo::parse(string_piece lemma, bool /*die_on_failure*/) {
4737   data.clear();
4738 
4739   size_t len = raw_lemma_len(lemma);
4740   for (size_t i = len; i < lemma.len; i++)
4741     data.push_back(lemma.str[i]);
4742 
4743   return len;
4744 }
4745 
// Checks whether this addinfo matches another lemma's addinfo:
// - empty addinfo matches anything,
// - a sole '^' matches any addinfo that also starts with '^',
// - a sole '+' matches only an empty addinfo,
// - otherwise the addinfos must compare byte-wise equal.
bool english_lemma_addinfo::match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len) {
  if (data.empty()) return true;
  if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^';
  if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0;
  return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len);
}
4752 
4753 } // namespace morphodita
4754 
4755 /////////
4756 // File: morphodita/morpho/english_morpho_guesser.h
4757 /////////
4758 
4759 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4760 //
4761 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4762 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4763 //
4764 // This Source Code Form is subject to the terms of the Mozilla Public
4765 // License, v. 2.0. If a copy of the MPL was not distributed with this
4766 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4767 
4768 namespace morphodita {
4769 
// Guesser of English morphology for forms not found in the dictionary.
// Combines an exceptions map (form -> lemma/tag list), a negation-prefix
// map, and rule-based suffix analysis (driven by the _tag_guesser_* state
// tables defined in the implementation file).
class english_morpho_guesser {
 public:
  // Deserializes exception tags, the exceptions map and the negations map.
  void load(binary_decoder& data);
  // Appends guessed lemma-tag analyses of form; form_lc is its lowercased variant.
  void analyze(string_piece form, string_piece form_lc, vector<tagged_lemma>& lemmas) const;
  // Appends proper-name analyses; returns true when the guesser contributed
  // (callers then report the result as GUESSER, see english_morpho::analyze).
  bool analyze_proper_names(string_piece form, string_piece form_lc, vector<tagged_lemma>& lemmas) const;

 private:
  // Helpers appending a lemma with the given tag(s); the negation_len
  // variants record a negation prefix of that many bytes.
  inline void add(const string& tag, const string& form, vector<tagged_lemma>& lemmas) const;
  inline void add(const string& tag, const string& tag2, const string& form, vector<tagged_lemma>& lemmas) const;
  inline void add(const string& tag, const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const;
  inline void add(const string& tag, const string& tag2, const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const;
  // Tag-specific analyses named after the PTB tags they produce.
  void add_NNS(const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const;
  void add_NNPS(const string& form, vector<tagged_lemma>& lemmas) const;
  void add_VBG(const string& form, vector<tagged_lemma>& lemmas) const;
  void add_VBD_VBN(const string& form, vector<tagged_lemma>& lemmas) const;
  void add_VBZ(const string& form, vector<tagged_lemma>& lemmas) const;
  void add_JJR_RBR(const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const;
  void add_JJS_RBS(const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const;

  // Indices into a negations-map record (used when probing `negations`).
  enum { NEGATION_LEN = 0, TO_FOLLOW = 1, TOTAL = 2 };
  vector<string> exceptions_tags;
  persistent_unordered_map exceptions;
  persistent_unordered_map negations;
  // Interned PTB tag names shared by the add_* helpers.
  string CD = "CD", FW = "FW", JJ = "JJ", JJR = "JJR", JJS = "JJS",
         NN = "NN", NNP = "NNP", NNPS = "NNPS", NNS = "NNS", RB = "RB",
         RBR = "RBR", RBS = "RBS", SYM = "SYM", VB = "VB", VBD = "VBD",
         VBG = "VBG", VBN = "VBN", VBP = "VBP", VBZ = "VBZ";
};
4798 
4799 } // namespace morphodita
4800 
4801 /////////
4802 // File: morphodita/morpho/english_morpho.h
4803 /////////
4804 
4805 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4806 //
4807 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4808 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4809 //
4810 // This Source Code Form is subject to the terms of the Mozilla Public
4811 // License, v. 2.0. If a copy of the MPL was not distributed with this
4812 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4813 
4814 namespace morphodita {
4815 
// English morphology: dictionary-based analysis with english_lemma_addinfo,
// a rule-based guesser, and PTB-style tags for numbers, punctuation and
// symbols.
class english_morpho : public morpho {
 public:
  // version affects number handling ("1990s" plurals, see analyze_special)
  // and the tokenizer version chosen by new_tokenizer.
  english_morpho(unsigned version) : version(version) {}

  virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector<tagged_lemma>& lemmas) const override;
  virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector<tagged_lemma_forms>& forms) const override;
  virtual int raw_lemma_len(string_piece lemma) const override;
  virtual int lemma_id_len(string_piece lemma) const override;
  virtual int raw_form_len(string_piece form) const override;
  virtual tokenizer* new_tokenizer() const override;

  // Deserializes the model from a stream; returns true on success.
  bool load(istream& is);
 private:
  // Handles numbers, one-letter punctuation, quotations, parentheses and symbols.
  inline void analyze_special(string_piece form, vector<tagged_lemma>& lemmas) const;

  unsigned version;
  morpho_dictionary<english_lemma_addinfo> dictionary;
  english_morpho_guesser morpho_guesser;

  // PTB-style tags emitted by analyze and analyze_special.
  string unknown_tag = "UNK";
  string number_tag = "CD", nnp_tag = "NNP", ls_tag = "LS";
  string open_quotation_tag = "``", close_quotation_tag = "''";
  string open_parenthesis_tag = "(", close_parenthesis_tag = ")";
  string comma_tag = ",", dot_tag = ".", punctuation_tag = ":", hash_tag = "#", dollar_tag = "$";
  string sym_tag = "SYM", jj_tag = "JJ", nn_tag = "NN", nns_tag = "NNS", cc_tag = "CC", pos_tag = "POS", in_tag = "IN";
};
4842 
4843 } // namespace morphodita
4844 
4845 /////////
4846 // File: morphodita/tokenizer/english_tokenizer.h
4847 /////////
4848 
4849 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4850 //
4851 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4852 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4853 //
4854 // This Source Code Form is subject to the terms of the Mozilla Public
4855 // License, v. 2.0. If a copy of the MPL was not distributed with this
4856 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4857 
4858 namespace morphodita {
4859 
// English tokenizer built on the Ragel-based tokenizer infrastructure.
class english_tokenizer : public ragel_tokenizer {
 public:
  // Latest tokenizer version; the constructor also accepts older versions.
  enum { LATEST = 2 };
  english_tokenizer(unsigned version);

  virtual bool next_sentence(vector<token_range>& tokens) override;

 private:
  // Post-processes the recognized tokens, splitting a token when needed
  // (implementation not in this chunk).
  void split_token(vector<token_range>& tokens);

  // Known abbreviations consulted during tokenization.
  static const unordered_set<string> abbreviations;
};
4872 
4873 } // namespace morphodita
4874 
4875 /////////
4876 // File: morphodita/morpho/english_morpho.cpp
4877 /////////
4878 
4879 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
4880 //
4881 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4882 // Mathematics and Physics, Charles University in Prague, Czech Republic.
4883 //
4884 // This Source Code Form is subject to the terms of the Mozilla Public
4885 // License, v. 2.0. If a copy of the MPL was not distributed with this
4886 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4887 
4888 namespace morphodita {
4889 
// Deserializes the English morphology model (dictionary followed by the
// guesser) from a stream. Returns true only when the payload decompressed,
// decoded without error and was fully consumed.
bool english_morpho::load(istream& is) {
  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  try {
    dictionary.load(data);
    morpho_guesser.load(data);
  } catch (binary_decoder_error&) {
    return false;
  }

  return data.is_end();
}
4903 
// Analyzes the given form; mirrors czech_morpho::analyze. Returns NO_GUESSER
// when the dictionary or special analysis produced the result, GUESSER when
// the guesser (or proper-name analysis) contributed, and -1 for unknown
// forms (in which case a single lemma with unknown_tag is emitted).
int english_morpho::analyze(string_piece form, guesser_mode guesser, vector<tagged_lemma>& lemmas) const {
  lemmas.clear();

  if (form.len) {
    // Generate all casing variants if needed (they are different than given form).
    string form_uclc; // first uppercase, rest lowercase
    string form_lc;   // all lowercase
    generate_casing_variants(form, form_uclc, form_lc);

    // Start by analysing using the dictionary and all casing variants.
    dictionary.analyze(form, lemmas);
    if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
    if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
    if (!lemmas.empty())
      // Dictionary hit; when guessing is allowed, also add proper-name
      // analyses — the result is GUESSER only if any were added.
      return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER;

    // Then call analyze_special to handle numbers, punctuation and symbols.
    analyze_special(form, lemmas);
    if (!lemmas.empty()) return NO_GUESSER;

    // Use English guesser on form_lc if allowed.
    if (guesser == GUESSER)
      morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas);
    if (!lemmas.empty()) return GUESSER;
  }

  // Nothing matched: report the form itself with the unknown tag.
  lemmas.emplace_back(string(form.str, form.len), unknown_tag);
  return -1;
}
4933 
generate(string_piece lemma,const char * tag_wildcard,morpho::guesser_mode,vector<tagged_lemma_forms> & forms) const4934 int english_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /*guesser*/, vector<tagged_lemma_forms>& forms) const {
4935   forms.clear();
4936 
4937   tag_filter filter(tag_wildcard);
4938 
4939   if (lemma.len) {
4940     if (dictionary.generate(lemma, filter, forms))
4941       return NO_GUESSER;
4942   }
4943 
4944   return -1;
4945 }
4946 
// Returns the length of the raw lemma, i.e. without the lemma addinfo
// (delegates to english_lemma_addinfo).
int english_morpho::raw_lemma_len(string_piece lemma) const {
  return english_lemma_addinfo::raw_lemma_len(lemma);
}
4950 
// Returns the length of the lemma id prefix of the lemma
// (delegates to english_lemma_addinfo).
int english_morpho::lemma_id_len(string_piece lemma) const {
  return english_lemma_addinfo::lemma_id_len(lemma);
}
4954 
// The whole form is significant; no suffix is ever stripped.
int english_morpho::raw_form_len(string_piece form) const {
  return form.len;
}
4958 
// Creates an English tokenizer; morpho model versions <= 2 use tokenizer
// version 1, newer models use version 2. The caller takes ownership.
tokenizer* english_morpho::new_tokenizer() const {
  return new english_tokenizer(version <= 2 ? 1 : 2);
}
4962 
analyze_special(string_piece form,vector<tagged_lemma> & lemmas) const4963 void english_morpho::analyze_special(string_piece form, vector<tagged_lemma>& lemmas) const {
4964   using namespace unilib;
4965 
4966   // Analyzer for numbers and punctuation.
4967   if (!form.len) return;
4968 
4969   // One-letter punctuation exceptions.
4970   if (form.len == 1)
4971     switch(*form.str) {
4972       case '.':
4973       case '!':
4974       case '?': lemmas.emplace_back(string(form.str, form.len), dot_tag); return;
4975       case ',': lemmas.emplace_back(string(form.str, form.len), comma_tag); return;
4976       case '#': lemmas.emplace_back(string(form.str, form.len), hash_tag); return;
4977       case '$': lemmas.emplace_back(string(form.str, form.len), dollar_tag); return;
4978       case '[': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
4979       case ']': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
4980       case '%': lemmas.emplace_back(string(form.str, form.len), jj_tag);
4981                 lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
4982       case '&': lemmas.emplace_back(string(form.str, form.len), cc_tag);
4983                 lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
4984       case '*': lemmas.emplace_back(string(form.str, form.len), sym_tag);
4985                 lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
4986       case '@': lemmas.emplace_back(string(form.str, form.len), sym_tag);
4987                 lemmas.emplace_back(string(form.str, form.len), in_tag); return;
4988       case '\'': lemmas.emplace_back(string(form.str, form.len), close_quotation_tag);
4989                  lemmas.emplace_back(string(form.str, form.len), pos_tag); return;
4990     }
4991 
4992   // Try matching a number: [+-]? is_Pn* (, is_Pn{3})? (. is_Pn*)? (s | [Ee] [+-]? is_Pn+)? with at least one digit
4993   string_piece number = form;
4994   char32_t codepoint = utf8::decode(number.str, number.len);
4995   bool any_digit = false;
4996   if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
4997   while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
4998   while (codepoint == ',') {
4999     string_piece group = number;
5000     if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break;
5001     if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break;
5002     if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break;
5003     any_digit = true;
5004     number = group;
5005     codepoint = utf8::decode(number.str, number.len);
5006   }
5007   if (codepoint == '.' && number.len) {
5008     codepoint = utf8::decode(number.str, number.len);
5009     while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
5010   }
5011   if (version >= 2 && any_digit && codepoint == 's' && !number.len) {
5012     lemmas.emplace_back(string(form.str, form.len), number_tag);
5013     lemmas.emplace_back(string(form.str, form.len - 1), nns_tag);
5014     return;
5015   }
5016   if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
5017     codepoint = utf8::decode(number.str, number.len);
5018     if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
5019     any_digit = false;
5020     while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
5021   }
5022   if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
5023     lemmas.emplace_back(string(form.str, form.len - (codepoint == '.')), number_tag);
5024     lemmas.emplace_back(string(form.str, form.len - (codepoint == '.')), nnp_tag);
5025     if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9')
5026       lemmas.emplace_back(string(form.str, form.len - (codepoint == '.')), ls_tag);
5027     return;
5028   }
5029 
5030   // Open quotation, end quotation, open parentheses, end parentheses, symbol, or other
5031   string_piece punctuation = form;
5032   bool open_quotation = true, close_quotation = true, open_parenthesis = true, close_parenthesis = true, any_punctuation = true, symbol = true;
5033   while ((symbol || any_punctuation) && punctuation.len) {
5034     codepoint = utf8::decode(punctuation.str, punctuation.len);
5035     if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi;
5036     if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf;
5037     if (open_parenthesis) open_parenthesis = unicode::category(codepoint) & unicode::Ps;
5038     if (close_parenthesis) close_parenthesis = unicode::category(codepoint) & unicode::Pe;
5039     if (any_punctuation) any_punctuation = unicode::category(codepoint) & unicode::P;
5040     if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S;
5041   }
5042   if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; }
5043   if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; }
5044   if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; }
5045   if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; }
5046   if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; }
5047   if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; }
5048 }
5049 
5050 } // namespace morphodita
5051 
5052 /////////
5053 // File: morphodita/morpho/english_morpho_guesser.cpp
5054 /////////
5055 
5056 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
5057 //
5058 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
5059 // Mathematics and Physics, Charles University in Prague, Czech Republic.
5060 //
5061 // This Source Code Form is subject to the terms of the Mozilla Public
5062 // License, v. 2.0. If a copy of the MPL was not distributed with this
5063 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
5064 
5065 // This code is a reimplementation of morphologic analyzer Morphium
5066 // by Johanka Spoustova (Treex::Tool::EnglishMorpho::Analysis Perl module)
5067 // and reimplementation of morphologic lemmatizer by Martin Popel
5068 // (Treex::Tool::EnglishMorpho::Lemmatizer Perl module). The latter is based
5069 // on morpha:
5070 //   Minnen, G., J. Carroll and D. Pearce (2001). Applied morphological
5071 //   processing of English, Natural Language Engineering, 7(3). 207-223.
5072 // Morpha has been released under LGPL as a part of RASP system
5073 //   http://ilexir.co.uk/applications/rasp/.
5074 
5075 namespace morphodita {
5076 
load(binary_decoder & data)5077 void english_morpho_guesser::load(binary_decoder& data) {
5078   unsigned tags = data.next_2B();
5079   exceptions_tags.clear();
5080   exceptions_tags.reserve(tags);
5081   while (tags--) {
5082     unsigned len = data.next_1B();
5083     exceptions_tags.emplace_back(string(data.next<char>(len), len));
5084   }
5085 
5086   exceptions.load(data);
5087   negations.load(data);
5088 }
5089 
// The tables below drive the tag-guesser finite-state machine used by
// english_morpho_guesser. They appear to be machine-generated (Ragel-style
// state tables — note the _trans_keys/_trans_targs/_eof_actions naming);
// do not edit them by hand.
static const char _tag_guesser_actions[] = {
	0, 1, 0, 1, 1, 1, 2, 1,
	3, 1, 4, 1, 5, 1, 6, 1,
	7, 2, 2, 6, 2, 2, 7, 2,
	4, 6, 2, 4, 7, 2, 5, 6,
	2, 5, 7, 2, 6, 7, 3, 2,
	6, 7, 3, 4, 6, 7, 3, 5,
	6, 7
};

static const unsigned char _tag_guesser_key_offsets[] = {
	0, 19, 26, 34, 42, 50, 58, 66,
	74, 82, 90, 100, 108, 116, 124, 132,
	145, 153, 161, 168, 179, 195, 212, 220,
	228, 236
};

static const char _tag_guesser_trans_keys[] = {
	45, 46, 99, 100, 103, 105, 109, 110,
	114, 115, 116, 118, 120, 48, 57, 65,
	90, 97, 122, 45, 48, 57, 65, 90,
	97, 122, 45, 114, 48, 57, 65, 90,
	97, 122, 45, 111, 48, 57, 65, 90,
	97, 122, 45, 109, 48, 57, 65, 90,
	97, 122, 45, 101, 48, 57, 65, 90,
	97, 122, 45, 115, 48, 57, 65, 90,
	97, 122, 45, 101, 48, 57, 65, 90,
	97, 122, 45, 108, 48, 57, 65, 90,
	97, 122, 45, 115, 48, 57, 65, 90,
	97, 122, 45, 97, 101, 111, 48, 57,
	65, 90, 98, 122, 45, 101, 48, 57,
	65, 90, 97, 122, 45, 108, 48, 57,
	65, 90, 97, 122, 45, 109, 48, 57,
	65, 90, 97, 122, 45, 105, 48, 57,
	65, 90, 97, 122, 45, 97, 101, 105,
	111, 117, 121, 48, 57, 65, 90, 98,
	122, 45, 115, 48, 57, 65, 90, 97,
	122, 45, 101, 48, 57, 65, 90, 97,
	122, 45, 48, 57, 65, 90, 97, 122,
	45, 101, 114, 115, 116, 48, 57, 65,
	90, 97, 122, 45, 46, 105, 109, 118,
	120, 48, 57, 65, 90, 97, 98, 99,
	100, 101, 122, 45, 46, 101, 105, 109,
	118, 120, 48, 57, 65, 90, 97, 98,
	99, 100, 102, 122, 45, 110, 48, 57,
	65, 90, 97, 122, 45, 105, 48, 57,
	65, 90, 97, 122, 45, 101, 48, 57,
	65, 90, 97, 122, 45, 115, 48, 57,
	65, 90, 97, 122, 0
};

static const char _tag_guesser_single_lengths[] = {
	13, 1, 2, 2, 2, 2, 2, 2,
	2, 2, 4, 2, 2, 2, 2, 7,
	2, 2, 1, 5, 6, 7, 2, 2,
	2, 2
};

static const char _tag_guesser_range_lengths[] = {
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 5, 5, 3, 3,
	3, 3
};

static const unsigned char _tag_guesser_index_offsets[] = {
	0, 17, 22, 28, 34, 40, 46, 52,
	58, 64, 70, 78, 84, 90, 96, 102,
	113, 119, 125, 130, 139, 151, 164, 170,
	176, 182
};

static const char _tag_guesser_indicies[] = {
	1, 2, 5, 6, 7, 5, 5, 8,
	9, 10, 11, 5, 5, 3, 4, 4,
	0, 13, 14, 15, 15, 12, 13, 16,
	14, 15, 15, 12, 13, 17, 14, 15,
	15, 12, 13, 18, 14, 15, 15, 12,
	13, 18, 14, 15, 15, 12, 13, 19,
	14, 15, 15, 12, 13, 20, 14, 15,
	15, 12, 13, 18, 14, 15, 15, 12,
	13, 21, 14, 15, 15, 12, 13, 22,
	23, 24, 14, 15, 15, 12, 13, 25,
	14, 15, 15, 12, 13, 23, 14, 15,
	15, 12, 13, 23, 14, 15, 15, 12,
	13, 26, 14, 15, 15, 12, 28, 15,
	15, 15, 15, 15, 15, 29, 26, 26,
	27, 31, 4, 32, 33, 33, 30, 13,
	23, 14, 15, 15, 12, 13, 14, 15,
	15, 12, 13, 34, 35, 36, 37, 14,
	15, 15, 12, 13, 38, 39, 39, 39,
	39, 14, 15, 15, 39, 15, 12, 13,
	38, 40, 39, 39, 39, 39, 14, 15,
	15, 39, 15, 12, 13, 41, 14, 15,
	15, 12, 13, 42, 14, 15, 15, 12,
	13, 18, 14, 15, 15, 12, 13, 43,
	14, 15, 15, 12, 0
};

static const char _tag_guesser_trans_targs[] = {
	18, 19, 20, 18, 18, 20, 21, 22,
	23, 24, 16, 25, 18, 19, 18, 1,
	3, 4, 18, 7, 8, 10, 11, 18,
	13, 12, 18, 18, 19, 18, 18, 19,
	18, 18, 2, 5, 6, 9, 20, 20,
	18, 14, 15, 17
};

static const char _tag_guesser_trans_actions[] = {
	29, 46, 29, 32, 11, 11, 11, 11,
	11, 11, 0, 11, 13, 35, 15, 0,
	0, 0, 1, 0, 0, 0, 0, 3,
	0, 0, 5, 17, 38, 20, 23, 42,
	26, 9, 0, 0, 0, 0, 13, 0,
	7, 0, 0, 0
};

static const char _tag_guesser_eof_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 15, 15, 0, 0,
	0, 0
};

// Initial state of the tag-guesser FSM.
static const int tag_guesser_start = 0;
5215 
// Produces all morphological analyses (lemma + tag pairs) of `form` into
// `lemmas`. First the precompiled exceptions list is consulted; otherwise a
// possible negative prefix is located, default open-class tags are added, and
// a machine-generated automaton matching the lowercased form from its last
// character backwards adds further suffix-specific tags. Finally, proper-name
// analyses are considered via analyze_proper_names().
analyze(string_piece form,string_piece form_lc,vector<tagged_lemma> & lemmas) const5216 void english_morpho_guesser::analyze(string_piece form, string_piece form_lc, vector<tagged_lemma>& lemmas) const {
5217   // Try exceptions list
5218   auto* exception = exceptions.at(form_lc.str, form_lc.len, [](pointer_decoder& data){
    // Skip one exception entry: a count byte, then per lemma a length-prefixed
    // character string followed by a length-prefixed array of 2B tag indices.
5219     for (unsigned len = data.next_1B(); len; len--) {
5220       data.next<char>(data.next_1B());
5221       data.next<uint16_t>(data.next_1B());
5222     }
5223   });
5224 
5225   if (exception) {
5226     // Found in exceptions list
5227     pointer_decoder data(exception);
5228     for (unsigned len = data.next_1B(); len; len--) {
5229       unsigned lemma_len = data.next_1B();
5230       string lemma(data.next<char>(lemma_len), lemma_len);
5231       for (unsigned tags = data.next_1B(); tags; tags--)
5232         lemmas.emplace_back(lemma, exceptions_tags[data.next_2B()]);
5233     }
5234   } else {
5235     // Try stripping negative prefix and use rule guesser
5236     string lemma_lc(form_lc.str, form_lc.len);
5237     // Try finding negative prefix
5238     unsigned negation_len = 0;
    // Scan ever-longer prefixes through the `negations` table; a prefix is
    // accepted only when at least found[TO_FOLLOW] characters remain after it.
5239     for (unsigned prefix = 1; prefix <= form_lc.len; prefix++) {
5240       auto found = negations.at(form_lc.str, prefix, [](pointer_decoder& data){ data.next<unsigned char>(TOTAL); });
5241       if (!found) break;
5242       if (found[NEGATION_LEN]) {
5243         if (form_lc.len - prefix >= found[TO_FOLLOW]) negation_len = found[NEGATION_LEN];
5244       }
5245     }
5246 
5247     // Add default tags
5248     add(FW, lemma_lc, lemmas);
5249     add(JJ, lemma_lc, negation_len, lemmas);
5250     add(RB, lemma_lc, negation_len, lemmas);
5251     add(NN, lemma_lc, negation_len, lemmas);
5252     add_NNS(lemma_lc, negation_len, lemmas);
5253 
5254     // Add specialized tags
    // Machine-generated (Ragel-style) scanner follows. Note the reversed
    // indexing below: characters are read from the END of form_lc while `p`
    // merely counts how many have been consumed.
5255     const char* p = form_lc.str; int cs;
5256     bool added_JJR_RBR = false, added_JJS_RBS = false, added_SYM = false, added_CD = false;
5257 
5258 	{
5259 	cs = tag_guesser_start;
5260 	}
5261 
5262 	{
5263 	int _klen;
5264 	unsigned int _trans;
5265 	const char *_acts;
5266 	unsigned int _nacts;
5267 	const char *_keys;
5268 
5269 	if ( p == ( (form_lc.str + form_lc.len)) )
5270 		goto _test_eof;
5271 _resume:
5272 	_keys = _tag_guesser_trans_keys + _tag_guesser_key_offsets[cs];
5273 	_trans = _tag_guesser_index_offsets[cs];
5274 
	// Binary search among the current state's single-character keys.
5275 	_klen = _tag_guesser_single_lengths[cs];
5276 	if ( _klen > 0 ) {
5277 		const char *_lower = _keys;
5278 		const char *_mid;
5279 		const char *_upper = _keys + _klen - 1;
5280 		while (1) {
5281 			if ( _upper < _lower )
5282 				break;
5283 
5284 			_mid = _lower + ((_upper-_lower) >> 1);
5285 			if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < *_mid )
5286 				_upper = _mid - 1;
5287 			else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > *_mid )
5288 				_lower = _mid + 1;
5289 			else {
5290 				_trans += (unsigned int)(_mid - _keys);
5291 				goto _match;
5292 			}
5293 		}
5294 		_keys += _klen;
5295 		_trans += _klen;
5296 	}
5297 
	// Binary search among the current state's key ranges (pairs of bounds).
5298 	_klen = _tag_guesser_range_lengths[cs];
5299 	if ( _klen > 0 ) {
5300 		const char *_lower = _keys;
5301 		const char *_mid;
5302 		const char *_upper = _keys + (_klen<<1) - 2;
5303 		while (1) {
5304 			if ( _upper < _lower )
5305 				break;
5306 
5307 			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
5308 			if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < _mid[0] )
5309 				_upper = _mid - 2;
5310 			else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > _mid[1] )
5311 				_lower = _mid + 2;
5312 			else {
5313 				_trans += (unsigned int)((_mid - _keys)>>1);
5314 				goto _match;
5315 			}
5316 		}
5317 		_trans += _klen;
5318 	}
5319 
5320 _match:
	// Map the matched key slot to a transition, take it, and run its actions.
5321 	_trans = _tag_guesser_indicies[_trans];
5322 	cs = _tag_guesser_trans_targs[_trans];
5323 
5324 	if ( _tag_guesser_trans_actions[_trans] == 0 )
5325 		goto _again;
5326 
5327 	_acts = _tag_guesser_actions + _tag_guesser_trans_actions[_trans];
5328 	_nacts = (unsigned int) *_acts++;
5329 	while ( _nacts-- > 0 )
5330 	{
5331 		switch ( *_acts++ )
5332 		{
5333 	case 0:
5334 	{ if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); }
5335 	break;
5336 	case 1:
5337 	{ if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); }
5338 	break;
5339 	case 2:
5340 	{ add_VBG(lemma_lc, lemmas); }
5341 	break;
5342 	case 3:
5343 	{ add_VBD_VBN(lemma_lc, lemmas); }
5344 	break;
5345 	case 4:
5346 	{ add_VBZ(lemma_lc, lemmas); }
5347 	break;
5348 	case 5:
5349 	{ add(VB, lemma_lc, lemmas); add(VBP, lemma_lc, lemmas); }
5350 	break;
5351 	case 6:
5352 	{ if (!added_SYM) added_SYM = true, add(SYM, lemma_lc, lemmas); }
5353 	break;
5354 	case 7:
5355 	{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); }
5356 	break;
5357 		}
5358 	}
5359 
5360 _again:
5361 	if ( ++p != ( (form_lc.str + form_lc.len)) )
5362 		goto _resume;
5363 	_test_eof: {}
	// Actions fired once the whole form has been consumed (only CD here).
5364 	if ( p == ( (form_lc.str + form_lc.len)) )
5365 	{
5366 	const char *__acts = _tag_guesser_actions + _tag_guesser_eof_actions[cs];
5367 	unsigned int __nacts = (unsigned int) *__acts++;
5368 	while ( __nacts-- > 0 ) {
5369 		switch ( *__acts++ ) {
5370 	case 7:
5371 	{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); }
5372 	break;
5373 		}
5374 	}
5375 	}
5376 
5377 	}
5378 
5379   }
5380 
5381   // Add proper names
5382   analyze_proper_names(form, form_lc, lemmas);
5383 }
5384 
analyze_proper_names(string_piece form,string_piece form_lc,vector<tagged_lemma> & lemmas) const5385 bool english_morpho_guesser::analyze_proper_names(string_piece form, string_piece form_lc, vector<tagged_lemma>& lemmas) const {
5386   // NNP if form_lc != form or form.str[0] =~ /[0-9']/, NNPS if form_lc != form
5387   bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9')));
5388   bool is_NNPS = form.str != form_lc.str;
5389   if (!is_NNP && !is_NNPS) return false;
5390 
5391   bool was_NNP = false, was_NNPS = false;
5392   for (auto&& lemma : lemmas) {
5393     was_NNP |= lemma.tag == NNP;
5394     was_NNPS |= lemma.tag == NNPS;
5395   }
5396   if (!((is_NNP && !was_NNP) || (is_NNPS && !was_NNPS))) return false;
5397 
5398   string lemma(form.str, form.len);
5399   if (is_NNP && !was_NNP) add(NNP, lemma, lemmas);
5400   if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas);
5401   return true;
5402 }
5403 
add(const string & tag,const string & form,vector<tagged_lemma> & lemmas) const5404 inline void english_morpho_guesser::add(const string& tag, const string& form, vector<tagged_lemma>& lemmas) const {
5405   lemmas.emplace_back(form, tag);
5406 }
5407 
add(const string & tag,const string & tag2,const string & form,vector<tagged_lemma> & lemmas) const5408 inline void english_morpho_guesser::add(const string& tag, const string& tag2, const string& form, vector<tagged_lemma>& lemmas) const {
5409   add(tag, form, lemmas);
5410   add(tag2, form, lemmas);
5411 }
5412 
add(const string & tag,const string & form,unsigned negation_len,vector<tagged_lemma> & lemmas) const5413 inline void english_morpho_guesser::add(const string& tag, const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const {
5414   lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
5415 }
5416 
add(const string & tag,const string & tag2,const string & form,unsigned negation_len,vector<tagged_lemma> & lemmas) const5417 inline void english_morpho_guesser::add(const string& tag, const string& tag2, const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const {
5418   add(tag, form, negation_len, lemmas);
5419   add(tag2, form, negation_len, lemmas);
5420 }
5421 
// REM(str, len): `str` with its final `len` characters removed.
// REM_ADD(str, len, add): `str` with its final `len` characters removed and
// the string `add` appended. ("Written backwards" refers to the suffix
// machines below reading the form from its last character.)
5422 // Common definitions (written backwards)
5423 #define REM(str, len) (str.substr(0, str.size() - len))
5424 #define REM_ADD(str, len, add) (str.substr(0, str.size() - len).append(add))
5425 
// Machine-generated (Ragel-style) DFA tables for the NNS suffix machine used
// by english_morpho_guesser::add_NNS() below. Layout: per-state key counts
// (single/range), key and index offsets, matched-slot-to-transition mapping,
// transition targets, and action offsets. Do not edit by hand.
5426 static const char _NNS_actions[] = {
5427 	0, 1, 0, 1, 1, 1, 2, 1,
5428 	3, 1, 4, 1, 5, 1, 6, 1,
5429 	7, 1, 8, 1, 9, 1, 10, 1,
5430 	11, 1, 12, 1, 13
5431 };
5432 
5433 static const char _NNS_key_offsets[] = {
5434 	0, 0, 2, 3, 4, 5, 7, 17,
5435 	17, 29, 30, 35, 35, 36, 37, 37,
5436 	37, 44, 45, 53, 63, 72
5437 };
5438 
5439 static const char _NNS_trans_keys[] = {
5440 	110, 115, 101, 109, 101, 99, 115, 98,
5441 	100, 102, 104, 106, 110, 112, 116, 118,
5442 	122, 104, 122, 98, 100, 102, 103, 106,
5443 	110, 112, 116, 118, 120, 111, 97, 101,
5444 	105, 111, 117, 105, 119, 104, 105, 111,
5445 	115, 118, 120, 122, 115, 97, 101, 105,
5446 	110, 111, 114, 115, 117, 98, 100, 102,
5447 	104, 106, 110, 112, 116, 118, 122, 97,
5448 	101, 105, 111, 117, 121, 122, 98, 120,
5449 	0
5450 };
5451 
5452 static const char _NNS_single_lengths[] = {
5453 	0, 2, 1, 1, 1, 2, 0, 0,
5454 	2, 1, 5, 0, 1, 1, 0, 0,
5455 	7, 1, 8, 0, 7, 0
5456 };
5457 
5458 static const char _NNS_range_lengths[] = {
5459 	0, 0, 0, 0, 0, 0, 5, 0,
5460 	5, 0, 0, 0, 0, 0, 0, 0,
5461 	0, 0, 0, 5, 1, 0
5462 };
5463 
5464 static const char _NNS_index_offsets[] = {
5465 	0, 0, 3, 5, 7, 9, 12, 18,
5466 	19, 27, 29, 35, 36, 38, 40, 41,
5467 	42, 50, 52, 61, 67, 76
5468 };
5469 
5470 static const char _NNS_indicies[] = {
5471 	0, 2, 1, 3, 1, 4, 1, 6,
5472 	5, 7, 7, 1, 8, 8, 8, 8,
5473 	8, 1, 9, 11, 10, 10, 10, 10,
5474 	10, 10, 1, 12, 1, 13, 13, 13,
5475 	13, 13, 1, 14, 15, 1, 16, 1,
5476 	17, 1, 18, 19, 20, 21, 22, 7,
5477 	23, 1, 24, 1, 25, 25, 25, 26,
5478 	25, 27, 28, 29, 1, 30, 30, 30,
5479 	30, 30, 1, 31, 31, 31, 31, 31,
5480 	31, 33, 32, 1, 17, 0
5481 };
5482 
5483 static const char _NNS_trans_targs[] = {
5484 	2, 0, 4, 3, 15, 15, 16, 15,
5485 	7, 15, 15, 17, 15, 11, 15, 13,
5486 	15, 15, 5, 6, 8, 18, 12, 20,
5487 	15, 15, 9, 10, 15, 19, 15, 15,
5488 	14, 21
5489 };
5490 
5491 static const char _NNS_trans_actions[] = {
5492 	0, 0, 0, 0, 1, 27, 27, 21,
5493 	0, 23, 25, 25, 19, 0, 17, 0,
5494 	5, 11, 0, 0, 0, 21, 0, 21,
5495 	3, 9, 0, 0, 15, 9, 7, 13,
5496 	0, 15
5497 };
5498 
// Initial state of the NNS machine.
5499 static const int NNS_start = 1;
5500 
// Adds an NNS (plural noun) analysis of `form` to `lemmas`. A machine-generated
// DFA reads the form from its LAST character backwards (`p` only counts how
// many characters past the negative prefix were consumed; the index arithmetic
// maps that to positions from the end). Each matched suffix rule proposes a
// rewrite (strip `remove` characters, then append `append`); rules with a
// lower `best` letter take priority. The winning rewrite yields the singular
// lemma.
add_NNS(const string & form,unsigned negation_len,vector<tagged_lemma> & lemmas) const5501 void english_morpho_guesser::add_NNS(const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const {
5502   const char* p = form.c_str() + negation_len; int cs;
  // 'z' = no rule selected yet; actions only overwrite with a smaller letter.
5503   char best = 'z'; unsigned remove = 0; const char* append = nullptr;
5504 
5505 	{
5506 	cs = NNS_start;
5507 	}
5508 
5509 	{
5510 	int _klen;
5511 	unsigned int _trans;
5512 	const char *_acts;
5513 	unsigned int _nacts;
5514 	const char *_keys;
5515 
5516 	if ( p == ( (form.c_str() + form.size())) )
5517 		goto _test_eof;
5518 	if ( cs == 0 )
5519 		goto _out;
5520 _resume:
5521 	_keys = _NNS_trans_keys + _NNS_key_offsets[cs];
5522 	_trans = _NNS_index_offsets[cs];
5523 
	// Binary search among the current state's single-character keys.
5524 	_klen = _NNS_single_lengths[cs];
5525 	if ( _klen > 0 ) {
5526 		const char *_lower = _keys;
5527 		const char *_mid;
5528 		const char *_upper = _keys + _klen - 1;
5529 		while (1) {
5530 			if ( _upper < _lower )
5531 				break;
5532 
5533 			_mid = _lower + ((_upper-_lower) >> 1);
5534 			if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid )
5535 				_upper = _mid - 1;
5536 			else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid )
5537 				_lower = _mid + 1;
5538 			else {
5539 				_trans += (unsigned int)(_mid - _keys);
5540 				goto _match;
5541 			}
5542 		}
5543 		_keys += _klen;
5544 		_trans += _klen;
5545 	}
5546 
	// Binary search among the current state's key ranges (pairs of bounds).
5547 	_klen = _NNS_range_lengths[cs];
5548 	if ( _klen > 0 ) {
5549 		const char *_lower = _keys;
5550 		const char *_mid;
5551 		const char *_upper = _keys + (_klen<<1) - 2;
5552 		while (1) {
5553 			if ( _upper < _lower )
5554 				break;
5555 
5556 			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
5557 			if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] )
5558 				_upper = _mid - 2;
5559 			else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] )
5560 				_lower = _mid + 2;
5561 			else {
5562 				_trans += (unsigned int)((_mid - _keys)>>1);
5563 				goto _match;
5564 			}
5565 		}
5566 		_trans += _klen;
5567 	}
5568 
5569 _match:
5570 	_trans = _NNS_indicies[_trans];
5571 	cs = _NNS_trans_targs[_trans];
5572 
5573 	if ( _NNS_trans_actions[_trans] == 0 )
5574 		goto _again;
5575 
	// Run the suffix-rule actions of the taken transition.
5576 	_acts = _NNS_actions + _NNS_trans_actions[_trans];
5577 	_nacts = (unsigned int) *_acts++;
5578 	while ( _nacts-- > 0 )
5579 	{
5580 		switch ( *_acts++ )
5581 		{
5582 	case 0:
5583 	{ if (best > 'a') best = 'a', remove = 2, append = "an";    }
5584 	break;
5585 	case 1:
5586 	{ if (best > 'b') best = 'b', remove = 1, append = nullptr; }
5587 	break;
5588 	case 2:
5589 	{ if (best > 'c') best = 'c', remove = 3, append = "fe";    }
5590 	break;
5591 	case 3:
5592 	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
5593 	break;
5594 	case 4:
5595 	{ if (best > 'e') best = 'e', remove = 1, append = nullptr; }
5596 	break;
5597 	case 5:
5598 	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
5599 	break;
5600 	case 6:
5601 	{ if (best > 'g') best = 'g', remove = 1, append = nullptr; }
5602 	break;
5603 	case 7:
5604 	{ if (best > 'h') best = 'h', remove = 2, append = nullptr; }
5605 	break;
5606 	case 8:
5607 	{ if (best > 'i') best = 'i', remove = 1, append = nullptr; }
5608 	break;
5609 	case 9:
5610 	{ if (best > 'j') best = 'j', remove = 1, append = nullptr; }
5611 	break;
5612 	case 10:
5613 	{ if (best > 'k') best = 'k', remove = 2, append = nullptr; }
5614 	break;
5615 	case 11:
5616 	{ if (best > 'l') best = 'l', remove = 3, append = "y";     }
5617 	break;
5618 	case 12:
5619 	{ if (best > 'm') best = 'm', remove = 2, append = nullptr; }
5620 	break;
5621 	case 13:
5622 	{ if (best > 'n') best = 'n', remove = 1, append = nullptr; }
5623 	break;
5624 		}
5625 	}
5626 
5627 _again:
5628 	if ( cs == 0 )
5629 		goto _out;
5630 	if ( ++p != ( (form.c_str() + form.size())) )
5631 		goto _resume;
5632 	_test_eof: {}
5633 	_out: {}
5634 	}
5635 
  // Apply the winning rewrite and record the analysis with tag NNS.
5636   add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
5637 }
5638 
// Machine-generated (Ragel-style) DFA tables for the NNPS suffix machine used
// by english_morpho_guesser::add_NNPS() below. Same layout as the NNS tables;
// the keys cover both uppercase and lowercase letters because proper-name
// forms are not lowercased. Do not edit by hand.
5639 static const char _NNPS_actions[] = {
5640 	0, 1, 1, 1, 2, 1, 4, 1,
5641 	5, 1, 6, 1, 7, 1, 8, 1,
5642 	9, 1, 10, 1, 11, 1, 12, 1,
5643 	14, 1, 15, 1, 16, 2, 0, 1,
5644 	2, 3, 4, 2, 13, 14
5645 };
5646 
5647 static const unsigned char _NNPS_key_offsets[] = {
5648 	0, 0, 4, 6, 8, 10, 12, 16,
5649 	36, 36, 60, 62, 72, 72, 74, 76,
5650 	78, 78, 98, 98, 100, 102, 104, 104,
5651 	118, 120, 136, 156, 174, 174
5652 };
5653 
5654 static const char _NNPS_trans_keys[] = {
5655 	78, 83, 110, 115, 69, 101, 77, 109,
5656 	77, 109, 69, 101, 67, 83, 99, 115,
5657 	66, 68, 70, 72, 74, 78, 80, 84,
5658 	86, 90, 98, 100, 102, 104, 106, 110,
5659 	112, 116, 118, 122, 72, 90, 104, 122,
5660 	66, 68, 70, 71, 74, 78, 80, 84,
5661 	86, 88, 98, 100, 102, 103, 106, 110,
5662 	112, 116, 118, 120, 79, 111, 65, 69,
5663 	73, 79, 85, 97, 101, 105, 111, 117,
5664 	73, 105, 87, 119, 87, 119, 66, 68,
5665 	70, 72, 74, 78, 80, 84, 86, 90,
5666 	98, 100, 102, 104, 106, 110, 112, 116,
5667 	118, 122, 73, 105, 69, 101, 69, 101,
5668 	72, 73, 79, 83, 86, 88, 90, 104,
5669 	105, 111, 115, 118, 120, 122, 83, 115,
5670 	65, 69, 73, 78, 79, 82, 83, 85,
5671 	97, 101, 105, 110, 111, 114, 115, 117,
5672 	66, 68, 70, 72, 74, 78, 80, 84,
5673 	86, 90, 98, 100, 102, 104, 106, 110,
5674 	112, 116, 118, 122, 65, 69, 73, 79,
5675 	85, 89, 90, 97, 101, 105, 111, 117,
5676 	121, 122, 66, 88, 98, 120, 72, 73,
5677 	79, 83, 86, 88, 90, 104, 105, 111,
5678 	115, 118, 120, 122, 0
5679 };
5680 
5681 static const char _NNPS_single_lengths[] = {
5682 	0, 4, 2, 2, 2, 2, 4, 0,
5683 	0, 4, 2, 10, 0, 2, 2, 2,
5684 	0, 0, 0, 2, 2, 2, 0, 14,
5685 	2, 16, 0, 14, 0, 14
5686 };
5687 
5688 static const char _NNPS_range_lengths[] = {
5689 	0, 0, 0, 0, 0, 0, 0, 10,
5690 	0, 10, 0, 0, 0, 0, 0, 0,
5691 	0, 10, 0, 0, 0, 0, 0, 0,
5692 	0, 0, 10, 2, 0, 0
5693 };
5694 
5695 static const unsigned char _NNPS_index_offsets[] = {
5696 	0, 0, 5, 8, 11, 14, 17, 22,
5697 	33, 34, 49, 52, 63, 64, 67, 70,
5698 	73, 74, 85, 86, 89, 92, 95, 96,
5699 	111, 114, 131, 142, 159, 160
5700 };
5701 
5702 static const char _NNPS_indicies[] = {
5703 	0, 2, 3, 4, 1, 5, 6, 1,
5704 	7, 8, 1, 8, 8, 1, 10, 11,
5705 	9, 12, 12, 12, 12, 1, 13, 13,
5706 	13, 13, 13, 13, 13, 13, 13, 13,
5707 	1, 14, 16, 15, 16, 15, 15, 15,
5708 	15, 15, 15, 15, 15, 15, 15, 15,
5709 	1, 17, 17, 1, 18, 18, 18, 18,
5710 	18, 18, 18, 18, 18, 18, 1, 19,
5711 	20, 21, 1, 22, 23, 1, 23, 23,
5712 	1, 24, 25, 25, 25, 25, 25, 25,
5713 	25, 25, 25, 25, 1, 26, 21, 21,
5714 	1, 6, 6, 1, 11, 11, 9, 1,
5715 	27, 28, 29, 30, 31, 12, 32, 27,
5716 	33, 29, 30, 34, 12, 32, 1, 35,
5717 	35, 1, 36, 36, 36, 37, 36, 38,
5718 	39, 40, 36, 36, 36, 37, 36, 38,
5719 	39, 40, 1, 41, 41, 41, 41, 41,
5720 	41, 41, 41, 41, 41, 1, 42, 42,
5721 	42, 42, 42, 42, 44, 42, 42, 42,
5722 	42, 42, 42, 44, 43, 43, 1, 24,
5723 	27, 33, 29, 30, 34, 12, 32, 27,
5724 	33, 29, 30, 34, 12, 32, 1, 0
5725 };
5726 
5727 static const char _NNPS_trans_targs[] = {
5728 	2, 0, 5, 20, 21, 3, 4, 22,
5729 	22, 22, 23, 29, 22, 8, 22, 22,
5730 	24, 22, 12, 22, 14, 15, 22, 22,
5731 	22, 18, 22, 6, 7, 9, 25, 13,
5732 	27, 17, 19, 22, 22, 10, 11, 22,
5733 	26, 22, 22, 16, 28
5734 };
5735 
5736 static const char _NNPS_trans_actions[] = {
5737 	0, 0, 0, 0, 0, 0, 0, 29,
5738 	1, 27, 27, 27, 21, 0, 35, 25,
5739 	25, 19, 0, 17, 0, 0, 32, 5,
5740 	11, 0, 23, 0, 0, 0, 21, 0,
5741 	21, 0, 0, 3, 9, 0, 0, 15,
5742 	9, 7, 13, 0, 15
5743 };
5744 
// Initial state of the NNPS machine.
5745 static const int NNPS_start = 1;
5746 
// Adds an NNPS (plural proper noun) analysis of `form` to `lemmas`. Works like
// add_NNS() — a machine-generated DFA scans the form from its last character
// backwards and selects the best-priority suffix rewrite — but without
// negative-prefix handling, and with both uppercase and lowercase variants of
// the appended suffixes ("AN"/"an", "FE"/"fe", "Y"/"y"), since proper-name
// forms keep their original casing.
add_NNPS(const string & form,vector<tagged_lemma> & lemmas) const5747 void english_morpho_guesser::add_NNPS(const string& form, vector<tagged_lemma>& lemmas) const {
5748   const char* p = form.c_str(); int cs;
  // 'z' = no rule selected yet; actions only overwrite with a smaller letter.
5749   char best = 'z'; unsigned remove = 0; const char* append = nullptr;
5750 
5751 	{
5752 	cs = NNPS_start;
5753 	}
5754 
5755 	{
5756 	int _klen;
5757 	unsigned int _trans;
5758 	const char *_acts;
5759 	unsigned int _nacts;
5760 	const char *_keys;
5761 
5762 	if ( p == ( (form.c_str() + form.size())) )
5763 		goto _test_eof;
5764 	if ( cs == 0 )
5765 		goto _out;
5766 _resume:
5767 	_keys = _NNPS_trans_keys + _NNPS_key_offsets[cs];
5768 	_trans = _NNPS_index_offsets[cs];
5769 
	// Binary search among the current state's single-character keys.
5770 	_klen = _NNPS_single_lengths[cs];
5771 	if ( _klen > 0 ) {
5772 		const char *_lower = _keys;
5773 		const char *_mid;
5774 		const char *_upper = _keys + _klen - 1;
5775 		while (1) {
5776 			if ( _upper < _lower )
5777 				break;
5778 
5779 			_mid = _lower + ((_upper-_lower) >> 1);
5780 			if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
5781 				_upper = _mid - 1;
5782 			else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
5783 				_lower = _mid + 1;
5784 			else {
5785 				_trans += (unsigned int)(_mid - _keys);
5786 				goto _match;
5787 			}
5788 		}
5789 		_keys += _klen;
5790 		_trans += _klen;
5791 	}
5792 
	// Binary search among the current state's key ranges (pairs of bounds).
5793 	_klen = _NNPS_range_lengths[cs];
5794 	if ( _klen > 0 ) {
5795 		const char *_lower = _keys;
5796 		const char *_mid;
5797 		const char *_upper = _keys + (_klen<<1) - 2;
5798 		while (1) {
5799 			if ( _upper < _lower )
5800 				break;
5801 
5802 			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
5803 			if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
5804 				_upper = _mid - 2;
5805 			else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
5806 				_lower = _mid + 2;
5807 			else {
5808 				_trans += (unsigned int)((_mid - _keys)>>1);
5809 				goto _match;
5810 			}
5811 		}
5812 		_trans += _klen;
5813 	}
5814 
5815 _match:
5816 	_trans = _NNPS_indicies[_trans];
5817 	cs = _NNPS_trans_targs[_trans];
5818 
5819 	if ( _NNPS_trans_actions[_trans] == 0 )
5820 		goto _again;
5821 
	// Run the suffix-rule actions of the taken transition.
5822 	_acts = _NNPS_actions + _NNPS_trans_actions[_trans];
5823 	_nacts = (unsigned int) *_acts++;
5824 	while ( _nacts-- > 0 )
5825 	{
5826 		switch ( *_acts++ )
5827 		{
5828 	case 0:
5829 	{ if (best > 'a') best = 'a', remove = 2, append = "AN";    }
5830 	break;
5831 	case 1:
5832 	{ if (best > 'b') best = 'b', remove = 2, append = "an";    }
5833 	break;
5834 	case 2:
5835 	{ if (best > 'c') best = 'c', remove = 1, append = nullptr; }
5836 	break;
5837 	case 3:
5838 	{ if (best > 'd') best = 'd', remove = 3, append = "FE";    }
5839 	break;
5840 	case 4:
5841 	{ if (best > 'e') best = 'e', remove = 3, append = "fe";    }
5842 	break;
5843 	case 5:
5844 	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
5845 	break;
5846 	case 6:
5847 	{ if (best > 'g') best = 'g', remove = 1, append = nullptr; }
5848 	break;
5849 	case 7:
5850 	{ if (best > 'h') best = 'h', remove = 2, append = nullptr; }
5851 	break;
5852 	case 8:
5853 	{ if (best > 'i') best = 'i', remove = 1, append = nullptr; }
5854 	break;
5855 	case 9:
5856 	{ if (best > 'j') best = 'j', remove = 2, append = nullptr; }
5857 	break;
5858 	case 10:
5859 	{ if (best > 'k') best = 'k', remove = 1, append = nullptr; }
5860 	break;
5861 	case 11:
5862 	{ if (best > 'l') best = 'l', remove = 1, append = nullptr; }
5863 	break;
5864 	case 12:
5865 	{ if (best > 'm') best = 'm', remove = 2, append = nullptr; }
5866 	break;
5867 	case 13:
5868 	{ if (best > 'n') best = 'n', remove = 3, append = "Y";     }
5869 	break;
5870 	case 14:
5871 	{ if (best > 'o') best = 'o', remove = 3, append = "y";     }
5872 	break;
5873 	case 15:
5874 	{ if (best > 'p') best = 'p', remove = 2, append = nullptr; }
5875 	break;
5876 	case 16:
5877 	{ if (best > 'q') best = 'q', remove = 1, append = nullptr; }
5878 	break;
5879 		}
5880 	}
5881 
5882 _again:
5883 	if ( cs == 0 )
5884 		goto _out;
5885 	if ( ++p != ( (form.c_str() + form.size())) )
5886 		goto _resume;
5887 	_test_eof: {}
5888 	_out: {}
5889 	}
5890 
  // Apply the winning rewrite and record the analysis with tag NNPS.
5891   add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
5892 }
5893 
// Machine-generated (Ragel-style) DFA tables for the VBG suffix machine used
// by english_morpho_guesser::add_VBG() below. Same table layout as the NNS
// and NNPS machines, plus per-state EOF actions. Do not edit by hand.
5894 static const char _VBG_actions[] = {
5895 	0, 1, 1, 1, 2, 1, 4, 1,
5896 	5, 1, 6, 1, 7, 1, 9, 1,
5897 	10, 1, 11, 1, 12, 1, 13, 1,
5898 	14, 1, 15, 1, 16, 1, 17, 2,
5899 	0, 12, 2, 3, 4, 2, 5, 9,
5900 	2, 5, 10, 2, 8, 9, 2, 9,
5901 	10, 2, 11, 12, 3, 0, 2, 12,
5902 	3, 2, 11, 12
5903 };
5904 
5905 static const short _VBG_key_offsets[] = {
5906 	0, 0, 1, 2, 3, 9, 14, 24,
5907 	29, 34, 44, 46, 47, 48, 49, 50,
5908 	51, 52, 59, 66, 68, 70, 71, 72,
5909 	73, 74, 75, 76, 81, 89, 90, 91,
5910 	92, 93, 94, 96, 97, 98, 99, 100,
5911 	101, 102, 127, 127, 136, 137, 142, 153,
5912 	162, 171, 181, 186, 191, 197, 207, 207,
5913 	216, 228, 229, 240, 240, 249, 258, 267,
5914 	276, 285, 290, 302, 313, 318, 324, 334,
5915 	344, 355, 362, 373, 382, 391, 391, 402,
5916 	413, 415, 416, 417, 417, 418, 426, 437,
5917 	442, 448, 458, 468, 479, 486, 497, 504,
5918 	510, 519, 528, 537, 543
5919 };
5920 
5921 static const char _VBG_trans_keys[] = {
5922 	103, 110, 105, 97, 101, 105, 111, 117,
5923 	121, 97, 101, 105, 111, 117, 98, 100,
5924 	102, 104, 106, 110, 112, 116, 118, 122,
5925 	97, 101, 105, 111, 117, 97, 101, 105,
5926 	111, 117, 98, 100, 102, 104, 106, 110,
5927 	112, 116, 118, 122, 98, 114, 105, 114,
5928 	112, 105, 109, 101, 97, 101, 105, 111,
5929 	117, 98, 122, 97, 101, 105, 111, 117,
5930 	98, 122, 97, 122, 98, 114, 105, 114,
5931 	112, 105, 109, 101, 97, 101, 105, 111,
5932 	117, 97, 101, 105, 110, 111, 115, 117,
5933 	120, 105, 112, 105, 109, 101, 98, 114,
5934 	105, 114, 112, 105, 109, 101, 98, 99,
5935 	100, 102, 103, 104, 106, 107, 108, 109,
5936 	110, 111, 112, 113, 114, 115, 116, 117,
5937 	118, 119, 120, 121, 122, 97, 105, 97,
5938 	98, 101, 105, 111, 117, 122, 99, 120,
5939 	113, 97, 101, 105, 111, 117, 98, 99,
5940 	100, 105, 111, 117, 122, 97, 101, 102,
5941 	120, 97, 100, 101, 105, 111, 117, 122,
5942 	98, 120, 97, 101, 102, 105, 111, 117,
5943 	122, 98, 120, 97, 101, 103, 105, 110,
5944 	111, 117, 122, 98, 120, 97, 101, 105,
5945 	111, 117, 101, 110, 111, 115, 120, 101,
5946 	110, 111, 112, 115, 120, 97, 101, 104,
5947 	105, 111, 116, 117, 122, 98, 120, 97,
5948 	101, 105, 106, 111, 117, 122, 98, 120,
5949 	98, 99, 100, 105, 107, 111, 117, 122,
5950 	97, 101, 102, 120, 105, 97, 101, 105,
5951 	108, 111, 114, 117, 119, 122, 98, 120,
5952 	97, 101, 105, 109, 111, 117, 122, 98,
5953 	120, 97, 101, 105, 110, 111, 117, 122,
5954 	98, 120, 97, 101, 105, 111, 112, 117,
5955 	122, 98, 120, 97, 101, 105, 111, 113,
5956 	117, 122, 98, 120, 97, 101, 105, 111,
5957 	114, 117, 122, 98, 120, 97, 101, 105,
5958 	111, 117, 98, 99, 100, 105, 108, 111,
5959 	116, 117, 97, 101, 102, 122, 101, 110,
5960 	111, 115, 120, 98, 104, 106, 116, 118,
5961 	122, 101, 110, 111, 115, 120, 101, 110,
5962 	111, 112, 115, 120, 101, 105, 110, 111,
5963 	115, 120, 98, 116, 118, 122, 101, 105,
5964 	110, 111, 115, 120, 98, 116, 118, 122,
5965 	101, 110, 111, 115, 120, 98, 104, 106,
5966 	116, 118, 122, 98, 101, 110, 111, 114,
5967 	115, 120, 101, 110, 111, 115, 120, 98,
5968 	104, 106, 116, 118, 122, 97, 101, 105,
5969 	111, 115, 117, 122, 98, 120, 97, 101,
5970 	105, 111, 116, 117, 122, 98, 120, 122,
5971 	98, 100, 102, 104, 106, 110, 112, 116,
5972 	118, 120, 122, 98, 100, 102, 104, 106,
5973 	110, 112, 116, 118, 120, 98, 114, 112,
5974 	114, 113, 97, 101, 105, 108, 111, 117,
5975 	98, 122, 101, 110, 111, 115, 120, 98,
5976 	104, 106, 116, 118, 122, 101, 110, 111,
5977 	115, 120, 101, 110, 111, 112, 115, 120,
5978 	101, 105, 110, 111, 115, 120, 98, 116,
5979 	118, 122, 101, 105, 110, 111, 115, 120,
5980 	98, 116, 118, 122, 101, 110, 111, 115,
5981 	120, 98, 104, 106, 116, 118, 122, 98,
5982 	101, 110, 111, 114, 115, 120, 101, 110,
5983 	111, 115, 120, 98, 104, 106, 116, 118,
5984 	122, 97, 101, 105, 111, 117, 98, 122,
5985 	97, 101, 105, 111, 117, 121, 97, 101,
5986 	105, 111, 117, 118, 122, 98, 120, 97,
5987 	101, 105, 111, 117, 119, 122, 98, 120,
5988 	97, 101, 105, 111, 117, 120, 122, 98,
5989 	119, 97, 101, 105, 111, 117, 121, 97,
5990 	101, 105, 111, 117, 121, 122, 98, 120,
5991 	0
5992 };
5993 
5994 static const char _VBG_single_lengths[] = {
5995 	0, 1, 1, 1, 6, 5, 0, 5,
5996 	5, 0, 2, 1, 1, 1, 1, 1,
5997 	1, 5, 5, 0, 2, 1, 1, 1,
5998 	1, 1, 1, 5, 8, 1, 1, 1,
5999 	1, 1, 2, 1, 1, 1, 1, 1,
6000 	1, 23, 0, 7, 1, 5, 7, 7,
6001 	7, 8, 5, 5, 6, 8, 0, 7,
6002 	8, 1, 9, 0, 7, 7, 7, 7,
6003 	7, 5, 8, 5, 5, 6, 6, 6,
6004 	5, 7, 5, 7, 7, 0, 1, 1,
6005 	2, 1, 1, 0, 1, 6, 5, 5,
6006 	6, 6, 6, 5, 7, 5, 5, 6,
6007 	7, 7, 7, 6, 7
6008 };
6009 
6010 static const char _VBG_range_lengths[] = {
6011 	0, 0, 0, 0, 0, 0, 5, 0,
6012 	0, 5, 0, 0, 0, 0, 0, 0,
6013 	0, 1, 1, 1, 0, 0, 0, 0,
6014 	0, 0, 0, 0, 0, 0, 0, 0,
6015 	0, 0, 0, 0, 0, 0, 0, 0,
6016 	0, 1, 0, 1, 0, 0, 2, 1,
6017 	1, 1, 0, 0, 0, 1, 0, 1,
6018 	2, 0, 1, 0, 1, 1, 1, 1,
6019 	1, 0, 2, 3, 0, 0, 2, 2,
6020 	3, 0, 3, 1, 1, 0, 5, 5,
6021 	0, 0, 0, 0, 0, 1, 3, 0,
6022 	0, 2, 2, 3, 0, 3, 1, 0,
6023 	1, 1, 1, 0, 1
6024 };
6025 
6026 static const short _VBG_index_offsets[] = {
6027 	0, 0, 2, 4, 6, 13, 19, 25,
6028 	31, 37, 43, 46, 48, 50, 52, 54,
6029 	56, 58, 65, 72, 74, 77, 79, 81,
6030 	83, 85, 87, 89, 95, 104, 106, 108,
6031 	110, 112, 114, 117, 119, 121, 123, 125,
6032 	127, 129, 154, 155, 164, 166, 172, 182,
6033 	191, 200, 210, 216, 222, 229, 239, 240,
6034 	249, 260, 262, 273, 274, 283, 292, 301,
6035 	310, 319, 325, 336, 345, 351, 358, 367,
6036 	376, 385, 393, 402, 411, 420, 421, 428,
6037 	435, 438, 440, 442, 443, 445, 453, 462,
6038 	468, 475, 484, 493, 502, 510, 519, 526,
6039 	533, 542, 551, 560, 567
6040 };
6041 
6042 static const unsigned char _VBG_indicies[] = {
6043 	0, 1, 2, 1, 3, 1, 4, 4,
6044 	4, 4, 4, 4, 1, 5, 5, 5,
6045 	5, 6, 1, 7, 7, 7, 7, 7,
6046 	1, 8, 8, 8, 8, 9, 1, 5,
6047 	5, 5, 5, 10, 1, 11, 11, 11,
6048 	11, 11, 1, 11, 12, 1, 11, 1,
6049 	13, 1, 11, 1, 14, 1, 11, 1,
6050 	11, 1, 5, 5, 5, 5, 6, 15,
6051 	1, 5, 5, 5, 5, 6, 16, 1,
6052 	4, 1, 17, 18, 1, 17, 1, 19,
6053 	1, 17, 1, 20, 1, 17, 1, 17,
6054 	1, 21, 22, 21, 23, 24, 1, 25,
6055 	26, 25, 27, 28, 29, 25, 30, 1,
6056 	31, 1, 31, 1, 32, 1, 31, 1,
6057 	31, 1, 33, 34, 1, 33, 1, 35,
6058 	1, 33, 1, 36, 1, 33, 1, 33,
6059 	1, 38, 39, 40, 41, 42, 43, 44,
6060 	45, 46, 47, 48, 49, 50, 51, 52,
6061 	53, 54, 55, 56, 57, 58, 59, 60,
6062 	37, 1, 1, 61, 62, 61, 61, 61,
6063 	61, 63, 63, 1, 64, 1, 65, 65,
6064 	65, 65, 65, 1, 67, 68, 67, 66,
6065 	66, 66, 67, 66, 67, 1, 69, 62,
6066 	69, 69, 69, 69, 63, 63, 1, 61,
6067 	61, 62, 61, 61, 61, 63, 63, 1,
6068 	66, 66, 68, 66, 70, 66, 66, 67,
6069 	67, 1, 71, 71, 71, 71, 71, 1,
6070 	72, 73, 74, 75, 76, 1, 72, 73,
6071 	74, 11, 75, 76, 1, 61, 61, 62,
6072 	61, 61, 77, 61, 63, 63, 1, 78,
6073 	61, 61, 61, 62, 61, 61, 63, 63,
6074 	1, 63, 79, 63, 61, 62, 61, 61,
6075 	63, 61, 63, 1, 7, 1, 61, 61,
6076 	61, 68, 61, 80, 61, 80, 67, 67,
6077 	1, 5, 61, 61, 61, 62, 61, 61,
6078 	63, 63, 1, 81, 81, 82, 62, 81,
6079 	81, 63, 63, 1, 81, 81, 81, 81,
6080 	62, 81, 63, 63, 1, 61, 61, 61,
6081 	61, 62, 61, 63, 63, 1, 61, 83,
6082 	61, 84, 62, 61, 63, 63, 1, 5,
6083 	5, 5, 5, 6, 1, 85, 86, 85,
6084 	5, 86, 5, 86, 6, 5, 85, 1,
6085 	87, 88, 89, 90, 91, 85, 85, 85,
6086 	1, 87, 92, 89, 93, 94, 1, 87,
6087 	92, 89, 17, 93, 94, 1, 87, 17,
6088 	88, 89, 90, 91, 85, 85, 1, 87,
6089 	20, 88, 89, 90, 91, 85, 85, 1,
6090 	95, 88, 89, 90, 91, 85, 85, 85,
6091 	1, 17, 87, 92, 89, 18, 93, 94,
6092 	1, 87, 97, 89, 98, 99, 96, 96,
6093 	96, 1, 66, 66, 66, 66, 100, 66,
6094 	67, 67, 1, 101, 102, 103, 61, 62,
6095 	61, 63, 63, 1, 104, 106, 106, 106,
6096 	106, 106, 106, 105, 107, 107, 107, 107,
6097 	107, 107, 1, 31, 108, 1, 31, 1,
6098 	109, 1, 105, 110, 104, 5, 5, 5,
6099 	112, 5, 6, 111, 1, 113, 114, 115,
6100 	116, 117, 111, 111, 111, 1, 113, 118,
6101 	115, 119, 120, 1, 113, 118, 115, 33,
6102 	119, 120, 1, 113, 33, 114, 115, 116,
6103 	117, 111, 111, 1, 113, 36, 114, 115,
6104 	116, 117, 111, 111, 1, 121, 114, 115,
6105 	116, 117, 111, 111, 111, 1, 33, 113,
6106 	118, 115, 34, 119, 120, 1, 113, 123,
6107 	115, 124, 125, 122, 122, 122, 1, 5,
6108 	5, 5, 5, 6, 111, 1, 4, 4,
6109 	4, 4, 4, 4, 1, 66, 66, 66,
6110 	66, 66, 68, 67, 67, 1, 81, 81,
6111 	81, 81, 81, 62, 63, 63, 1, 81,
6112 	81, 81, 81, 81, 62, 63, 63, 1,
6113 	126, 126, 126, 126, 126, 4, 1, 127,
6114 	127, 127, 127, 127, 129, 130, 128, 1,
6115 	0
6116 };
6117 
6118 static const char _VBG_trans_targs[] = {
6119 	2, 0, 3, 41, 42, 42, 44, 42,
6120 	42, 44, 44, 51, 52, 13, 15, 42,
6121 	42, 68, 69, 23, 25, 77, 78, 83,
6122 	84, 42, 80, 29, 82, 31, 33, 42,
6123 	32, 87, 88, 37, 39, 4, 43, 46,
6124 	47, 48, 49, 53, 55, 56, 58, 60,
6125 	61, 19, 62, 63, 64, 75, 76, 95,
6126 	96, 97, 98, 99, 100, 5, 45, 42,
6127 	42, 6, 7, 42, 45, 8, 50, 9,
6128 	10, 11, 12, 14, 16, 54, 42, 57,
6129 	59, 17, 18, 65, 66, 67, 74, 20,
6130 	70, 22, 71, 72, 21, 24, 26, 73,
6131 	67, 70, 71, 72, 45, 27, 85, 94,
6132 	42, 42, 79, 28, 81, 30, 42, 86,
6133 	93, 34, 89, 36, 90, 91, 35, 38,
6134 	40, 92, 86, 89, 90, 91, 65, 65,
6135 	42, 42, 45
6136 };
6137 
6138 static const char _VBG_trans_actions[] = {
6139 	0, 0, 0, 29, 23, 15, 15, 3,
6140 	46, 46, 40, 0, 0, 0, 0, 5,
6141 	34, 0, 0, 0, 0, 15, 15, 15,
6142 	15, 11, 11, 0, 11, 0, 0, 9,
6143 	0, 0, 0, 0, 0, 0, 0, 0,
6144 	0, 0, 0, 0, 0, 0, 0, 0,
6145 	0, 0, 0, 0, 0, 0, 0, 21,
6146 	0, 0, 0, 23, 0, 0, 19, 19,
6147 	7, 0, 0, 49, 49, 0, 49, 0,
6148 	0, 0, 0, 0, 0, 19, 17, 19,
6149 	49, 0, 0, 27, 27, 0, 0, 0,
6150 	0, 0, 0, 0, 0, 0, 0, 0,
6151 	25, 25, 25, 25, 56, 0, 9, 9,
6152 	13, 43, 43, 0, 9, 0, 37, 0,
6153 	0, 0, 0, 0, 0, 0, 0, 0,
6154 	0, 0, 7, 7, 7, 7, 23, 1,
6155 	31, 1, 52
6156 };
6157 
6158 static const char _VBG_eof_actions[] = {
6159 	0, 0, 0, 0, 0, 0, 0, 0,
6160 	0, 0, 0, 0, 0, 0, 0, 0,
6161 	0, 0, 0, 0, 0, 0, 0, 0,
6162 	0, 0, 0, 0, 0, 0, 0, 0,
6163 	0, 0, 0, 0, 0, 0, 0, 0,
6164 	0, 0, 0, 3, 0, 0, 3, 3,
6165 	3, 3, 0, 3, 3, 3, 0, 3,
6166 	3, 0, 3, 0, 3, 3, 3, 3,
6167 	3, 0, 0, 25, 25, 25, 25, 25,
6168 	25, 25, 25, 3, 3, 0, 0, 0,
6169 	0, 0, 0, 0, 0, 0, 7, 7,
6170 	7, 7, 7, 7, 7, 7, 0, 0,
6171 	3, 3, 3, 0, 3
6172 };
6173 
// Initial state of the VBG machine.
6174 static const int VBG_start = 1;
6175 
// Guesses lemma candidates for a form assumed to carry the VBG tag.
// Ragel-generated: the DFA (tables _VBG_* above) consumes the form's
// characters in REVERSE order — note the form[form.size() - 1 - (p - c_str())]
// indexing — so it effectively matches suffixes.  Each matched rule competes
// via `best` ('a' is strongest, 'z' means "no rule yet"); the winner fixes how
// many trailing characters to strip (`remove`) and what suffix to `append`.
// The resulting lemma is added to `lemmas` with tag VBG at the end.
void english_morpho_guesser::add_VBG(const string& form, vector<tagged_lemma>& lemmas) const {
  const char* p = form.c_str(); int cs;
  char best = 'z'; unsigned remove = 0; const char* append = nullptr;

	{
	cs = VBG_start;
	}

	{
	int _klen;
	unsigned int _trans;
	const char *_acts;
	unsigned int _nacts;
	const char *_keys;

	// Empty form: skip straight to the EOF actions.
	if ( p == ( (form.c_str() + form.size())) )
		goto _test_eof;
	if ( cs == 0 )
		goto _out;
_resume:
	_keys = _VBG_trans_keys + _VBG_key_offsets[cs];
	_trans = _VBG_index_offsets[cs];

	// Binary search of the current character among state cs's single keys.
	_klen = _VBG_single_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + _klen - 1;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + ((_upper-_lower) >> 1);
			if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
				_upper = _mid - 1;
			else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
				_lower = _mid + 1;
			else {
				_trans += (unsigned int)(_mid - _keys);
				goto _match;
			}
		}
		_keys += _klen;
		_trans += _klen;
	}

	// Binary search among state cs's inclusive [lo,hi] key ranges
	// (stored as consecutive pairs, hence the stride-2 arithmetic).
	_klen = _VBG_range_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + (_klen<<1) - 2;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
			if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
				_upper = _mid - 2;
			else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
				_lower = _mid + 2;
			else {
				_trans += (unsigned int)((_mid - _keys)>>1);
				goto _match;
			}
		}
		_trans += _klen;
	}

_match:
	// Map the key hit to a transition index, then advance the state.
	_trans = _VBG_indicies[_trans];
	cs = _VBG_trans_targs[_trans];

	if ( _VBG_trans_actions[_trans] == 0 )
		goto _again;

	// Run the actions attached to this transition; each candidate rule only
	// wins if its priority letter beats the current `best`.
	_acts = _VBG_actions + _VBG_trans_actions[_trans];
	_nacts = (unsigned int) *_acts++;
	while ( _nacts-- > 0 )
	{
		switch ( *_acts++ )
		{
	case 0:
	{ if (best > 'a') best = 'a', remove = 3, append = nullptr; }
	break;
	case 1:
	{ if (best > 'b') best = 'b', remove = 3, append = "e";     }
	break;
	case 2:
	{ if (best > 'c') best = 'c', remove = 3, append = nullptr; }
	break;
	case 3:
	{ if (best > 'd') best = 'd', remove = 3, append = "e";     }
	break;
	case 4:
	{ if (best > 'e') best = 'e', remove = 3, append = nullptr; }
	break;
	case 5:
	{ if (best > 'f') best = 'f', remove = 3, append = "e";     }
	break;
	case 6:
	{ if (best > 'g') best = 'g', remove = 3, append = nullptr; }
	break;
	case 7:
	{ if (best > 'h') best = 'h', remove = 3, append = "e";     }
	break;
	case 8:
	{ if (best > 'i') best = 'i', remove = 3, append = nullptr; }
	break;
	case 9:
	{ if (best > 'j') best = 'j', remove = 3, append = "e";     }
	break;
	case 10:
	{ if (best > 'k') best = 'k', remove = 3, append = nullptr; }
	break;
	case 11:
	{ if (best > 'l') best = 'l', remove = 3, append = "e";     }
	break;
	case 12:
	{ if (best > 'm') best = 'm', remove = 3, append = nullptr; }
	break;
	case 13:
	{ if (best > 'n') best = 'n', remove = 3, append = "e";     }
	break;
	case 14:
	{ if (best > 'o') best = 'o', remove = 3, append = nullptr; }
	break;
	case 15:
	{ if (best > 'p') best = 'p', remove = 3, append = "e";     }
	break;
	case 16:
	{ if (best > 'q') best = 'q', remove = 3, append = nullptr; }
	break;
	case 17:
	{ if (best > 'r') best = 'r', remove = 3, append = "e";     }
	break;
		}
	}

_again:
	if ( cs == 0 )
		goto _out;
	if ( ++p != ( (form.c_str() + form.size())) )
		goto _resume;
	_test_eof: {}
	// EOF actions: rules that only fire when the entire form was consumed.
	if ( p == ( (form.c_str() + form.size())) )
	{
	const char *__acts = _VBG_actions + _VBG_eof_actions[cs];
	unsigned int __nacts = (unsigned int) *__acts++;
	while ( __nacts-- > 0 ) {
		switch ( *__acts++ ) {
	case 2:
	{ if (best > 'c') best = 'c', remove = 3, append = nullptr; }
	break;
	case 5:
	{ if (best > 'f') best = 'f', remove = 3, append = "e";     }
	break;
	case 15:
	{ if (best > 'p') best = 'p', remove = 3, append = "e";     }
	break;
		}
	}
	}

	_out: {}
	}

  // Apply the winning rule: strip `remove` trailing characters, append the
  // replacement suffix (if any), and record the result with tag VBG.
  add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
}
6344 
// Ragel-generated tables for the VBD/VBN suffix automaton (see add_VBD_VBN
// below).  Layout: each entry list starts with its length, followed by that
// many action ids (indexed via _VBD_VBN_trans_actions / _VBD_VBN_eof_actions).
static const char _VBD_VBN_actions[] = {
	0, 1, 0, 1, 2, 1, 3, 1,
	4, 1, 5, 1, 6, 1, 7, 1,
	8, 1, 9, 1, 10, 1, 11, 1,
	13, 1, 14, 1, 15, 1, 16, 1,
	17, 2, 1, 16, 2, 4, 5, 2,
	8, 16, 2, 9, 13, 2, 9, 14,
	2, 12, 13, 2, 13, 14, 2, 15,
	16, 3, 1, 3, 16, 3, 3, 15,
	16
};

// _VBD_VBN_key_offsets[cs]: start of state cs's keys in _VBD_VBN_trans_keys.
static const short _VBD_VBN_key_offsets[] = {
	0, 0, 2, 3, 9, 14, 24, 29,
	34, 44, 46, 47, 48, 49, 50, 51,
	52, 60, 67, 74, 76, 77, 78, 79,
	80, 81, 82, 87, 95, 96, 97, 98,
	99, 100, 102, 103, 104, 105, 106, 107,
	108, 114, 115, 140, 140, 149, 150, 155,
	166, 175, 184, 194, 199, 204, 210, 220,
	220, 229, 241, 242, 253, 253, 262, 271,
	280, 289, 298, 303, 316, 327, 332, 338,
	348, 358, 369, 376, 387, 396, 405, 405,
	416, 427, 429, 430, 431, 431, 432, 440,
	451, 456, 462, 472, 482, 493, 500, 511,
	518, 524, 533, 542, 551
};

// Transition keys: per state, first the single-character keys, then pairs of
// inclusive [lo,hi] range bounds (counts in _VBD_VBN_single_lengths /
// _VBD_VBN_range_lengths).  Values are ASCII codes of lowercase letters.
static const char _VBD_VBN_trans_keys[] = {
	100, 110, 101, 97, 101, 105, 111, 117,
	121, 97, 101, 105, 111, 117, 98, 100,
	102, 104, 106, 110, 112, 116, 118, 122,
	97, 101, 105, 111, 117, 97, 101, 105,
	111, 117, 98, 100, 102, 104, 106, 110,
	112, 116, 118, 122, 98, 114, 105, 114,
	112, 105, 109, 101, 97, 101, 105, 111,
	117, 121, 98, 122, 97, 101, 105, 111,
	117, 98, 122, 97, 101, 105, 111, 117,
	98, 122, 98, 114, 105, 114, 112, 105,
	109, 101, 97, 101, 105, 111, 117, 97,
	101, 105, 110, 111, 115, 117, 120, 105,
	112, 105, 109, 101, 98, 114, 105, 114,
	112, 105, 109, 101, 97, 101, 105, 111,
	117, 121, 101, 98, 99, 100, 102, 103,
	104, 105, 106, 107, 108, 109, 110, 112,
	113, 114, 115, 116, 117, 118, 119, 120,
	121, 122, 97, 111, 97, 98, 101, 105,
	111, 117, 122, 99, 120, 113, 97, 101,
	105, 111, 117, 98, 99, 100, 105, 111,
	117, 122, 97, 101, 102, 120, 97, 100,
	101, 105, 111, 117, 122, 98, 120, 97,
	101, 102, 105, 111, 117, 122, 98, 120,
	97, 101, 103, 105, 110, 111, 117, 122,
	98, 120, 97, 101, 105, 111, 117, 101,
	110, 111, 115, 120, 101, 110, 111, 112,
	115, 120, 97, 101, 104, 105, 111, 116,
	117, 122, 98, 120, 97, 101, 105, 106,
	111, 117, 122, 98, 120, 98, 99, 100,
	105, 107, 111, 117, 122, 97, 101, 102,
	120, 105, 97, 101, 105, 108, 111, 114,
	117, 119, 122, 98, 120, 97, 101, 105,
	109, 111, 117, 122, 98, 120, 97, 101,
	105, 110, 111, 117, 122, 98, 120, 97,
	101, 105, 111, 112, 117, 122, 98, 120,
	97, 101, 105, 111, 113, 117, 122, 98,
	120, 97, 101, 105, 111, 114, 117, 122,
	98, 120, 97, 101, 105, 111, 117, 98,
	99, 100, 105, 108, 110, 111, 116, 117,
	97, 101, 102, 122, 101, 110, 111, 115,
	120, 98, 104, 106, 116, 118, 122, 101,
	110, 111, 115, 120, 101, 110, 111, 112,
	115, 120, 101, 105, 110, 111, 115, 120,
	98, 116, 118, 122, 101, 105, 110, 111,
	115, 120, 98, 116, 118, 122, 101, 110,
	111, 115, 120, 98, 104, 106, 116, 118,
	122, 98, 101, 110, 111, 114, 115, 120,
	101, 110, 111, 115, 120, 98, 104, 106,
	116, 118, 122, 97, 101, 105, 111, 115,
	117, 122, 98, 120, 97, 101, 105, 111,
	116, 117, 122, 98, 120, 122, 98, 100,
	102, 104, 106, 110, 112, 116, 118, 120,
	122, 98, 100, 102, 104, 106, 110, 112,
	116, 118, 120, 98, 114, 112, 114, 113,
	97, 101, 105, 108, 111, 117, 98, 122,
	101, 110, 111, 115, 120, 98, 104, 106,
	116, 118, 122, 101, 110, 111, 115, 120,
	101, 110, 111, 112, 115, 120, 101, 105,
	110, 111, 115, 120, 98, 116, 118, 122,
	101, 105, 110, 111, 115, 120, 98, 116,
	118, 122, 101, 110, 111, 115, 120, 98,
	104, 106, 116, 118, 122, 98, 101, 110,
	111, 114, 115, 120, 101, 110, 111, 115,
	120, 98, 104, 106, 116, 118, 122, 97,
	101, 105, 111, 117, 98, 122, 97, 101,
	105, 111, 117, 121, 97, 101, 105, 111,
	117, 118, 122, 98, 120, 97, 101, 105,
	111, 117, 119, 122, 98, 120, 97, 101,
	105, 111, 117, 120, 122, 98, 119, 97,
	101, 105, 111, 117, 121, 122, 98, 120,
	0
};

// Number of single-character keys per state.
static const char _VBD_VBN_single_lengths[] = {
	0, 2, 1, 6, 5, 0, 5, 5,
	0, 2, 1, 1, 1, 1, 1, 1,
	6, 5, 5, 2, 1, 1, 1, 1,
	1, 1, 5, 8, 1, 1, 1, 1,
	1, 2, 1, 1, 1, 1, 1, 1,
	6, 1, 23, 0, 7, 1, 5, 7,
	7, 7, 8, 5, 5, 6, 8, 0,
	7, 8, 1, 9, 0, 7, 7, 7,
	7, 7, 5, 9, 5, 5, 6, 6,
	6, 5, 7, 5, 7, 7, 0, 1,
	1, 2, 1, 1, 0, 1, 6, 5,
	5, 6, 6, 6, 5, 7, 5, 5,
	6, 7, 7, 7, 7
};

// Number of [lo,hi] key ranges per state.
static const char _VBD_VBN_range_lengths[] = {
	0, 0, 0, 0, 0, 5, 0, 0,
	5, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 1, 0, 1, 0, 0, 2,
	1, 1, 1, 0, 0, 0, 1, 0,
	1, 2, 0, 1, 0, 1, 1, 1,
	1, 1, 0, 2, 3, 0, 0, 2,
	2, 3, 0, 3, 1, 1, 0, 5,
	5, 0, 0, 0, 0, 0, 1, 3,
	0, 0, 2, 2, 3, 0, 3, 1,
	0, 1, 1, 1, 1
};

// _VBD_VBN_index_offsets[cs]: base into _VBD_VBN_indicies for state cs.
static const short _VBD_VBN_index_offsets[] = {
	0, 0, 3, 5, 12, 18, 24, 30,
	36, 42, 45, 47, 49, 51, 53, 55,
	57, 65, 72, 79, 82, 84, 86, 88,
	90, 92, 94, 100, 109, 111, 113, 115,
	117, 119, 122, 124, 126, 128, 130, 132,
	134, 141, 143, 168, 169, 178, 180, 186,
	196, 205, 214, 224, 230, 236, 243, 253,
	254, 263, 274, 276, 287, 288, 297, 306,
	315, 324, 333, 339, 351, 360, 366, 373,
	382, 391, 400, 408, 417, 426, 435, 436,
	443, 450, 453, 455, 457, 458, 460, 468,
	477, 483, 490, 499, 508, 517, 525, 534,
	541, 548, 557, 566, 575
};

// Maps a key-match position to a transition index (shared-transition table).
static const unsigned char _VBD_VBN_indicies[] = {
	0, 2, 1, 3, 1, 4, 4, 4,
	4, 4, 4, 1, 5, 5, 5, 5,
	6, 1, 7, 7, 7, 7, 7, 1,
	8, 8, 8, 8, 9, 1, 5, 5,
	5, 5, 10, 1, 11, 11, 11, 11,
	11, 1, 11, 12, 1, 11, 1, 13,
	1, 11, 1, 14, 1, 11, 1, 11,
	1, 4, 4, 4, 4, 4, 16, 15,
	1, 5, 5, 5, 5, 6, 17, 1,
	5, 5, 5, 5, 6, 18, 1, 19,
	20, 1, 19, 1, 21, 1, 19, 1,
	22, 1, 19, 1, 19, 1, 23, 24,
	23, 25, 26, 1, 27, 28, 27, 29,
	30, 31, 27, 32, 1, 33, 1, 33,
	1, 34, 1, 33, 1, 33, 1, 35,
	36, 1, 35, 1, 37, 1, 35, 1,
	38, 1, 35, 1, 35, 1, 39, 39,
	39, 39, 39, 4, 1, 40, 1, 42,
	43, 44, 45, 46, 47, 48, 49, 50,
	51, 52, 53, 54, 55, 56, 57, 58,
	59, 60, 61, 62, 63, 64, 41, 1,
	1, 65, 66, 65, 65, 65, 65, 4,
	4, 1, 67, 1, 68, 68, 68, 68,
	68, 1, 70, 71, 70, 69, 69, 69,
	70, 69, 70, 1, 72, 66, 72, 72,
	72, 72, 4, 4, 1, 65, 65, 66,
	65, 65, 65, 4, 4, 1, 69, 69,
	71, 69, 73, 69, 69, 70, 70, 1,
	74, 74, 74, 74, 74, 1, 75, 76,
	77, 78, 79, 1, 75, 76, 77, 11,
	78, 79, 1, 65, 65, 66, 65, 65,
	80, 65, 4, 4, 1, 81, 65, 65,
	65, 66, 65, 65, 4, 4, 1, 4,
	82, 4, 65, 66, 65, 65, 4, 65,
	4, 1, 7, 1, 65, 65, 65, 71,
	65, 83, 65, 83, 70, 70, 1, 5,
	65, 65, 65, 66, 65, 65, 4, 4,
	1, 84, 84, 85, 66, 84, 84, 4,
	4, 1, 84, 84, 84, 84, 66, 84,
	4, 4, 1, 65, 65, 65, 65, 66,
	65, 4, 4, 1, 65, 86, 65, 87,
	66, 65, 4, 4, 1, 5, 5, 5,
	5, 6, 1, 88, 89, 88, 5, 89,
	89, 5, 89, 6, 5, 88, 1, 90,
	91, 92, 93, 94, 88, 88, 88, 1,
	90, 95, 92, 96, 97, 1, 90, 95,
	92, 19, 96, 97, 1, 90, 19, 91,
	92, 93, 94, 88, 88, 1, 90, 22,
	91, 92, 93, 94, 88, 88, 1, 98,
	91, 92, 93, 94, 88, 88, 88, 1,
	19, 90, 95, 92, 20, 96, 97, 1,
	90, 100, 92, 101, 102, 99, 99, 99,
	1, 69, 69, 69, 69, 103, 69, 70,
	70, 1, 104, 105, 106, 65, 66, 65,
	4, 4, 1, 107, 109, 109, 109, 109,
	109, 109, 108, 110, 110, 110, 110, 110,
	110, 1, 33, 111, 1, 33, 1, 112,
	1, 108, 113, 107, 5, 5, 5, 115,
	5, 6, 114, 1, 116, 117, 118, 119,
	120, 114, 114, 114, 1, 116, 121, 118,
	122, 123, 1, 116, 121, 118, 35, 122,
	123, 1, 116, 35, 117, 118, 119, 120,
	114, 114, 1, 116, 38, 117, 118, 119,
	120, 114, 114, 1, 124, 117, 118, 119,
	120, 114, 114, 114, 1, 35, 116, 121,
	118, 36, 122, 123, 1, 116, 126, 118,
	127, 128, 125, 125, 125, 1, 5, 5,
	5, 5, 6, 114, 1, 4, 4, 4,
	4, 4, 4, 1, 69, 69, 69, 69,
	69, 71, 70, 70, 1, 84, 84, 84,
	84, 84, 66, 4, 4, 1, 84, 84,
	84, 84, 84, 66, 4, 4, 1, 129,
	129, 129, 129, 129, 131, 132, 130, 1,
	0
};

// _VBD_VBN_trans_targs[t]: target state of transition index t.
static const char _VBD_VBN_trans_targs[] = {
	2, 0, 41, 42, 43, 43, 45, 43,
	43, 45, 45, 52, 53, 12, 14, 43,
	43, 43, 43, 69, 70, 22, 24, 78,
	79, 84, 85, 43, 81, 28, 83, 30,
	32, 43, 31, 88, 89, 36, 38, 66,
	43, 3, 44, 47, 48, 49, 50, 54,
	16, 56, 57, 59, 61, 62, 63, 64,
	65, 76, 77, 96, 97, 98, 99, 40,
	100, 4, 46, 43, 5, 6, 43, 46,
	7, 51, 8, 9, 10, 11, 13, 15,
	55, 43, 58, 60, 17, 18, 66, 67,
	68, 75, 19, 71, 21, 72, 73, 20,
	23, 25, 74, 68, 71, 72, 73, 46,
	26, 86, 95, 43, 43, 80, 27, 82,
	29, 43, 87, 94, 33, 90, 35, 91,
	92, 34, 37, 39, 93, 87, 90, 91,
	92, 66, 43, 43, 46
};

// _VBD_VBN_trans_actions[t]: action-list offset for transition t (0 = none).
static const char _VBD_VBN_trans_actions[] = {
	0, 0, 0, 31, 29, 25, 25, 5,
	51, 51, 45, 0, 0, 0, 0, 15,
	39, 9, 36, 0, 0, 0, 0, 25,
	25, 25, 25, 21, 21, 0, 21, 0,
	0, 19, 0, 0, 0, 0, 0, 29,
	1, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 27, 0, 0, 0, 0,
	0, 0, 29, 17, 0, 0, 54, 54,
	0, 54, 0, 0, 0, 0, 0, 0,
	29, 27, 29, 54, 0, 0, 13, 13,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 7, 7, 7, 7, 61,
	0, 19, 19, 23, 48, 48, 0, 19,
	0, 42, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 17, 17, 17,
	17, 3, 33, 3, 57
};

// _VBD_VBN_eof_actions[cs]: action-list offset run at end of input in state cs.
static const char _VBD_VBN_eof_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 5, 0, 0, 5,
	5, 5, 5, 0, 5, 5, 5, 0,
	5, 5, 0, 5, 0, 5, 5, 5,
	5, 5, 0, 0, 11, 11, 11, 11,
	11, 11, 11, 11, 5, 5, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 17,
	17, 17, 17, 17, 17, 17, 17, 0,
	0, 5, 5, 5, 5
};

// Initial DFA state for the VBD/VBN machine.
static const int VBD_VBN_start = 1;
6629 
// Guesses lemma candidates for a form assumed to carry the VBD or VBN tag.
// Ragel-generated: the DFA (tables _VBD_VBN_* above) consumes the form's
// characters in REVERSE order — note the form[form.size() - 1 - (p - c_str())]
// indexing — so it effectively matches suffixes.  Each matched rule competes
// via `best` ('a' is strongest, 'z' means "no rule yet"); the winner fixes how
// many trailing characters to strip (`remove`) and what suffix to `append`.
// The resulting lemma is added to `lemmas` for both VBD and VBN at the end.
void english_morpho_guesser::add_VBD_VBN(const string& form, vector<tagged_lemma>& lemmas) const {
  const char* p = form.c_str(); int cs;
  char best = 'z'; unsigned remove = 0; const char* append = nullptr;

	{
	cs = VBD_VBN_start;
	}

	{
	int _klen;
	unsigned int _trans;
	const char *_acts;
	unsigned int _nacts;
	const char *_keys;

	// Empty form: skip straight to the EOF actions.
	if ( p == ( (form.c_str() + form.size())) )
		goto _test_eof;
	if ( cs == 0 )
		goto _out;
_resume:
	_keys = _VBD_VBN_trans_keys + _VBD_VBN_key_offsets[cs];
	_trans = _VBD_VBN_index_offsets[cs];

	// Binary search of the current character among state cs's single keys.
	_klen = _VBD_VBN_single_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + _klen - 1;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + ((_upper-_lower) >> 1);
			if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
				_upper = _mid - 1;
			else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
				_lower = _mid + 1;
			else {
				_trans += (unsigned int)(_mid - _keys);
				goto _match;
			}
		}
		_keys += _klen;
		_trans += _klen;
	}

	// Binary search among state cs's inclusive [lo,hi] key ranges
	// (stored as consecutive pairs, hence the stride-2 arithmetic).
	_klen = _VBD_VBN_range_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + (_klen<<1) - 2;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
			if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
				_upper = _mid - 2;
			else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
				_lower = _mid + 2;
			else {
				_trans += (unsigned int)((_mid - _keys)>>1);
				goto _match;
			}
		}
		_trans += _klen;
	}

_match:
	// Map the key hit to a transition index, then advance the state.
	_trans = _VBD_VBN_indicies[_trans];
	cs = _VBD_VBN_trans_targs[_trans];

	if ( _VBD_VBN_trans_actions[_trans] == 0 )
		goto _again;

	// Run the actions attached to this transition; each candidate rule only
	// wins if its priority letter beats the current `best`.
	_acts = _VBD_VBN_actions + _VBD_VBN_trans_actions[_trans];
	_nacts = (unsigned int) *_acts++;
	while ( _nacts-- > 0 )
	{
		switch ( *_acts++ )
		{
	case 0:
	{ if (best > 'a') best = 'a', remove = 1, append = nullptr; }
	break;
	case 1:
	{ if (best > 'b') best = 'b', remove = 2, append = nullptr; }
	break;
	case 2:
	{ if (best > 'c') best = 'c', remove = 1, append = nullptr; }
	break;
	case 3:
	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
	break;
	case 4:
	{ if (best > 'e') best = 'e', remove = 1, append = nullptr; }
	break;
	case 5:
	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
	break;
	case 7:
	{ if (best > 'h') best = 'h', remove = 2, append = nullptr; }
	break;
	case 8:
	{ if (best > 'i') best = 'i', remove = 3, append = "y";     }
	break;
	case 9:
	{ if (best > 'j') best = 'j', remove = 1, append = nullptr; }
	break;
	case 10:
	{ if (best > 'k') best = 'k', remove = 2, append = nullptr; }
	break;
	case 11:
	{ if (best > 'l') best = 'l', remove = 1, append = nullptr; }
	break;
	case 12:
	{ if (best > 'm') best = 'm', remove = 2, append = nullptr; }
	break;
	case 13:
	{ if (best > 'n') best = 'n', remove = 1, append = nullptr; }
	break;
	case 14:
	{ if (best > 'o') best = 'o', remove = 2, append = nullptr; }
	break;
	case 15:
	{ if (best > 'p') best = 'p', remove = 1, append = nullptr; }
	break;
	case 16:
	{ if (best > 'q') best = 'q', remove = 2, append = nullptr; }
	break;
	case 17:
	{ if (best > 'r') best = 'r', remove = 1, append = nullptr; }
	break;
		}
	}

_again:
	if ( cs == 0 )
		goto _out;
	if ( ++p != ( (form.c_str() + form.size())) )
		goto _resume;
	_test_eof: {}
	// EOF actions: rules that only fire when the entire form was consumed.
	if ( p == ( (form.c_str() + form.size())) )
	{
	const char *__acts = _VBD_VBN_actions + _VBD_VBN_eof_actions[cs];
	unsigned int __nacts = (unsigned int) *__acts++;
	while ( __nacts-- > 0 ) {
		switch ( *__acts++ ) {
	case 3:
	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
	break;
	case 6:
	{ if (best > 'g') best = 'g', remove = 1, append = nullptr; }
	break;
	case 9:
	{ if (best > 'j') best = 'j', remove = 1, append = nullptr; }
	break;
		}
	}
	}

	_out: {}
	}

  // Apply the winning rule: strip `remove` trailing characters, append the
  // replacement suffix (if any), and record the result under VBD and VBN.
  add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
}
6795 
// Ragel-generated tables for the VBZ suffix automaton (see add_VBZ below).
// Action lists: length-prefixed runs of action ids.
static const char _VBZ_actions[] = {
	0, 1, 0, 1, 1, 1, 2, 1,
	3, 1, 4, 1, 5, 1, 6, 1,
	7, 1, 8
};

// _VBZ_key_offsets[cs]: start of state cs's keys in _VBZ_trans_keys.
static const char _VBZ_key_offsets[] = {
	0, 0, 1, 2, 4, 14, 14, 25,
	26, 31, 31, 31, 31, 37, 45, 54
};

// Per state: single-character keys, then [lo,hi] range pairs (ASCII codes).
static const char _VBZ_trans_keys[] = {
	115, 101, 99, 115, 98, 100, 102, 104,
	106, 110, 112, 116, 118, 122, 122, 98,
	100, 102, 104, 106, 110, 112, 116, 118,
	120, 111, 97, 101, 105, 111, 117, 104,
	105, 111, 115, 120, 122, 97, 101, 105,
	110, 111, 114, 115, 117, 97, 101, 105,
	111, 117, 121, 122, 98, 120, 0
};

// Number of single-character keys per state.
static const char _VBZ_single_lengths[] = {
	0, 1, 1, 2, 0, 0, 1, 1,
	5, 0, 0, 0, 6, 8, 7, 0
};

// Number of [lo,hi] key ranges per state.
static const char _VBZ_range_lengths[] = {
	0, 0, 0, 0, 5, 0, 5, 0,
	0, 0, 0, 0, 0, 0, 1, 0
};

// _VBZ_index_offsets[cs]: base into _VBZ_indicies for state cs.
static const char _VBZ_index_offsets[] = {
	0, 0, 2, 4, 7, 13, 14, 21,
	23, 29, 30, 31, 32, 39, 48, 57
};

// Maps a key-match position to a transition index.
static const char _VBZ_indicies[] = {
	0, 1, 3, 2, 4, 4, 1, 5,
	5, 5, 5, 5, 1, 6, 7, 7,
	7, 7, 7, 7, 1, 8, 1, 9,
	9, 9, 9, 9, 1, 8, 10, 1,
	11, 12, 13, 14, 4, 15, 1, 16,
	16, 16, 17, 16, 18, 19, 16, 1,
	20, 20, 20, 20, 20, 20, 22, 21,
	1, 10, 0
};

// _VBZ_trans_targs[t]: target state of transition index t.
static const char _VBZ_trans_targs[] = {
	2, 0, 11, 12, 11, 5, 11, 11,
	11, 9, 11, 3, 4, 6, 13, 14,
	11, 7, 8, 11, 11, 10, 15
};

// _VBZ_trans_actions[t]: action-list offset for transition t (0 = none).
static const char _VBZ_trans_actions[] = {
	0, 0, 17, 17, 11, 0, 13, 15,
	9, 0, 3, 0, 0, 0, 11, 11,
	1, 0, 0, 7, 5, 0, 7
};

// Initial DFA state for the VBZ machine.
static const int VBZ_start = 1;
6856 
// Guesses lemma candidates for a form assumed to carry the VBZ tag.
// Ragel-generated: the DFA (tables _VBZ_* above) consumes the form's
// characters in REVERSE order — note the form[form.size() - 1 - (p - c_str())]
// indexing — so it effectively matches suffixes.  Each matched rule competes
// via `best` ('a' is strongest, 'z' means "no rule yet"); the winner fixes how
// many trailing characters to strip (`remove`) and what suffix to `append`.
// Unlike the VBG/VBD_VBN machines, this one has no EOF actions.
void english_morpho_guesser::add_VBZ(const string& form, vector<tagged_lemma>& lemmas) const {
  const char* p = form.c_str(); int cs;
  char best = 'z'; unsigned remove = 0; const char* append = nullptr;

	{
	cs = VBZ_start;
	}

	{
	int _klen;
	unsigned int _trans;
	const char *_acts;
	unsigned int _nacts;
	const char *_keys;

	// Empty form: nothing to match.
	if ( p == ( (form.c_str() + form.size())) )
		goto _test_eof;
	if ( cs == 0 )
		goto _out;
_resume:
	_keys = _VBZ_trans_keys + _VBZ_key_offsets[cs];
	_trans = _VBZ_index_offsets[cs];

	// Binary search of the current character among state cs's single keys.
	_klen = _VBZ_single_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + _klen - 1;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + ((_upper-_lower) >> 1);
			if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
				_upper = _mid - 1;
			else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
				_lower = _mid + 1;
			else {
				_trans += (unsigned int)(_mid - _keys);
				goto _match;
			}
		}
		_keys += _klen;
		_trans += _klen;
	}

	// Binary search among state cs's inclusive [lo,hi] key ranges
	// (stored as consecutive pairs, hence the stride-2 arithmetic).
	_klen = _VBZ_range_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + (_klen<<1) - 2;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
			if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
				_upper = _mid - 2;
			else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
				_lower = _mid + 2;
			else {
				_trans += (unsigned int)((_mid - _keys)>>1);
				goto _match;
			}
		}
		_trans += _klen;
	}

_match:
	// Map the key hit to a transition index, then advance the state.
	_trans = _VBZ_indicies[_trans];
	cs = _VBZ_trans_targs[_trans];

	if ( _VBZ_trans_actions[_trans] == 0 )
		goto _again;

	// Run the actions attached to this transition; each candidate rule only
	// wins if its priority letter beats the current `best`.
	_acts = _VBZ_actions + _VBZ_trans_actions[_trans];
	_nacts = (unsigned int) *_acts++;
	while ( _nacts-- > 0 )
	{
		switch ( *_acts++ )
		{
	case 0:
	{ if (best > 'a') best = 'a', remove = 1, append = nullptr; }
	break;
	case 1:
	{ if (best > 'b') best = 'b', remove = 2, append = nullptr; }
	break;
	case 2:
	{ if (best > 'c') best = 'c', remove = 1, append = nullptr; }
	break;
	case 3:
	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
	break;
	case 4:
	{ if (best > 'e') best = 'e', remove = 1, append = nullptr; }
	break;
	case 5:
	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
	break;
	case 6:
	{ if (best > 'g') best = 'g', remove = 3, append = "y";     }
	break;
	case 7:
	{ if (best > 'h') best = 'h', remove = 2, append = nullptr; }
	break;
	case 8:
	{ if (best > 'i') best = 'i', remove = 1, append = nullptr; }
	break;
		}
	}

_again:
	if ( cs == 0 )
		goto _out;
	if ( ++p != ( (form.c_str() + form.size())) )
		goto _resume;
	_test_eof: {}
	_out: {}
	}

  // Apply the winning rule: strip `remove` trailing characters, append the
  // replacement suffix (if any), and record the result with tag VBZ.
  add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
}
6979 
// Ragel-generated tables for the JJR/RBR suffix automaton (see add_JJR_RBR
// below).  Action lists: length-prefixed runs of action ids.
static const char _JJR_RBR_actions[] = {
	0, 1, 0, 1, 1, 1, 3, 1,
	4, 1, 5, 2, 1, 4, 2, 2,
	5, 2, 4, 5
};

// _JJR_RBR_key_offsets[cs]: start of state cs's keys in _JJR_RBR_trans_keys.
static const unsigned char _JJR_RBR_key_offsets[] = {
	0, 0, 1, 2, 26, 26, 32, 37,
	50, 56, 62, 73, 79, 85, 91, 102,
	103, 109, 115, 117, 123, 129, 135, 146,
	152, 163, 169, 175, 181
};

// Per state: single-character keys, then [lo,hi] range pairs (ASCII codes).
static const char _JJR_RBR_trans_keys[] = {
	114, 101, 98, 99, 100, 101, 102, 103,
	104, 105, 106, 107, 108, 109, 110, 112,
	113, 114, 115, 116, 117, 118, 119, 120,
	121, 122, 97, 98, 101, 105, 111, 117,
	97, 101, 105, 111, 117, 98, 99, 100,
	105, 111, 117, 122, 97, 101, 102, 109,
	112, 120, 97, 100, 101, 105, 111, 117,
	97, 101, 102, 105, 111, 117, 97, 101,
	103, 105, 111, 117, 122, 98, 109, 112,
	120, 97, 101, 104, 105, 111, 117, 97,
	101, 105, 106, 111, 117, 97, 101, 105,
	107, 111, 117, 97, 101, 105, 108, 111,
	117, 122, 98, 109, 112, 120, 101, 97,
	101, 105, 109, 111, 117, 97, 101, 105,
	110, 111, 117, 97, 122, 97, 101, 105,
	111, 112, 117, 97, 101, 105, 111, 113,
	117, 97, 101, 105, 111, 114, 117, 97,
	101, 105, 111, 115, 117, 122, 98, 109,
	112, 120, 97, 101, 105, 111, 116, 117,
	97, 101, 105, 111, 117, 118, 122, 98,
	109, 112, 120, 97, 101, 105, 111, 117,
	119, 97, 101, 105, 111, 117, 120, 97,
	101, 105, 111, 117, 121, 97, 101, 105,
	111, 117, 122, 0
};

// Number of single-character keys per state.
static const char _JJR_RBR_single_lengths[] = {
	0, 1, 1, 24, 0, 6, 5, 7,
	6, 6, 7, 6, 6, 6, 7, 1,
	6, 6, 0, 6, 6, 6, 7, 6,
	7, 6, 6, 6, 6
};

// Number of [lo,hi] key ranges per state.
static const char _JJR_RBR_range_lengths[] = {
	0, 0, 0, 0, 0, 0, 0, 3,
	0, 0, 2, 0, 0, 0, 2, 0,
	0, 0, 1, 0, 0, 0, 2, 0,
	2, 0, 0, 0, 0
};

// _JJR_RBR_index_offsets[cs]: base into _JJR_RBR_indicies for state cs.
static const unsigned char _JJR_RBR_index_offsets[] = {
	0, 0, 2, 4, 29, 30, 37, 43,
	54, 61, 68, 78, 85, 92, 99, 109,
	111, 118, 125, 127, 134, 141, 148, 158,
	165, 175, 182, 189, 196
};

// Maps a key-match position to a transition index.
static const char _JJR_RBR_indicies[] = {
	0, 1, 2, 1, 4, 5, 6, 7,
	8, 9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 7, 22,
	23, 24, 25, 26, 3, 1, 27, 28,
	27, 27, 27, 27, 1, 29, 29, 29,
	29, 29, 1, 30, 31, 30, 27, 27,
	27, 30, 27, 30, 30, 1, 27, 28,
	27, 27, 27, 27, 1, 27, 27, 28,
	27, 27, 27, 1, 27, 27, 31, 27,
	27, 27, 30, 30, 30, 1, 27, 27,
	28, 27, 27, 27, 1, 27, 27, 27,
	28, 27, 27, 1, 27, 27, 27, 28,
	27, 27, 1, 27, 27, 27, 32, 27,
	27, 30, 30, 30, 1, 1, 33, 27,
	27, 27, 28, 27, 27, 1, 34, 34,
	34, 28, 34, 34, 1, 29, 1, 34,
	34, 34, 34, 28, 34, 1, 27, 27,
	27, 27, 28, 27, 1, 27, 27, 27,
	27, 28, 27, 1, 27, 27, 27, 27,
	31, 27, 30, 30, 30, 1, 27, 27,
	27, 27, 28, 27, 1, 27, 27, 27,
	27, 27, 31, 30, 30, 30, 1, 34,
	34, 34, 34, 34, 28, 1, 34, 34,
	34, 34, 34, 28, 1, 27, 27, 27,
	27, 27, 28, 1, 27, 27, 27, 27,
	27, 28, 1, 0
};

// _JJR_RBR_trans_targs[t]: target state of transition index t.
static const char _JJR_RBR_trans_targs[] = {
	2, 0, 3, 4, 5, 7, 8, 4,
	9, 10, 11, 4, 12, 13, 14, 16,
	17, 19, 20, 21, 22, 23, 24, 25,
	26, 27, 28, 6, 4, 4, 4, 4,
	15, 4, 18
};

// _JJR_RBR_trans_actions[t]: action-list offset for transition t (0 = none).
static const char _JJR_RBR_trans_actions[] = {
	0, 0, 0, 9, 9, 9, 9, 17,
	9, 9, 9, 14, 9, 9, 9, 9,
	9, 9, 9, 9, 9, 9, 9, 9,
	9, 9, 9, 7, 3, 5, 7, 11,
	11, 1, 7
};

// Initial DFA state for the JJR/RBR machine.
static const int JJR_RBR_start = 1;
7087 
// Guesses lemma candidates for a form assumed to carry the JJR or RBR tag.
// `negation_len` is the length of a leading negation prefix: scanning starts
// at form.c_str() + negation_len, and the reversed character indexing below
// compensates by subtracting negation_len, so the machine matches suffixes of
// the non-negated part.  Rules compete via `best` ('a' strongest, 'z' = none);
// the winner fixes `remove` (chars to strip) and `append` (suffix to add).
// The resulting lemma is added for both JJR and RBR at the end.
void english_morpho_guesser::add_JJR_RBR(const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const {
  const char* p = form.c_str() + negation_len; int cs;
  char best = 'z'; unsigned remove = 0; const char* append = nullptr;

	{
	cs = JJR_RBR_start;
	}

	{
	int _klen;
	unsigned int _trans;
	const char *_acts;
	unsigned int _nacts;
	const char *_keys;

	// Nothing after the negation prefix: nothing to match.
	if ( p == ( (form.c_str() + form.size())) )
		goto _test_eof;
	if ( cs == 0 )
		goto _out;
_resume:
	_keys = _JJR_RBR_trans_keys + _JJR_RBR_key_offsets[cs];
	_trans = _JJR_RBR_index_offsets[cs];

	// Binary search of the current character among state cs's single keys.
	_klen = _JJR_RBR_single_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + _klen - 1;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + ((_upper-_lower) >> 1);
			if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid )
				_upper = _mid - 1;
			else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid )
				_lower = _mid + 1;
			else {
				_trans += (unsigned int)(_mid - _keys);
				goto _match;
			}
		}
		_keys += _klen;
		_trans += _klen;
	}

	// Binary search among state cs's inclusive [lo,hi] key ranges
	// (stored as consecutive pairs, hence the stride-2 arithmetic).
	_klen = _JJR_RBR_range_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + (_klen<<1) - 2;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
			if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] )
				_upper = _mid - 2;
			else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] )
				_lower = _mid + 2;
			else {
				_trans += (unsigned int)((_mid - _keys)>>1);
				goto _match;
			}
		}
		_trans += _klen;
	}

_match:
	// Map the key hit to a transition index, then advance the state.
	_trans = _JJR_RBR_indicies[_trans];
	cs = _JJR_RBR_trans_targs[_trans];

	if ( _JJR_RBR_trans_actions[_trans] == 0 )
		goto _again;

	// Run the actions attached to this transition; each candidate rule only
	// wins if its priority letter beats the current `best`.
	_acts = _JJR_RBR_actions + _JJR_RBR_trans_actions[_trans];
	_nacts = (unsigned int) *_acts++;
	while ( _nacts-- > 0 )
	{
		switch ( *_acts++ )
		{
	case 0:
	{ if (best > 'a') best = 'a', remove = 2, append = nullptr; }
	break;
	case 1:
	{ if (best > 'b') best = 'b', remove = 3, append = nullptr; }
	break;
	case 2:
	{ if (best > 'c') best = 'c', remove = 3, append = "y";     }
	break;
	case 3:
	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
	break;
	case 4:
	{ if (best > 'e') best = 'e', remove = 1, append = nullptr; }
	break;
	case 5:
	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
	break;
		}
	}

_again:
	if ( cs == 0 )
		goto _out;
	if ( ++p != ( (form.c_str() + form.size())) )
		goto _resume;
	_test_eof: {}
	_out: {}
	}

  // Apply the winning rule: strip `remove` trailing characters, append the
  // replacement suffix (if any), and record the result under JJR and RBR.
  add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
}
7201 
// Tables generated by Ragel for the JJS/RBS (superlative) suffix
// recognizer used by english_morpho_guesser::add_JJS_RBS below.
// Do not edit by hand; regenerate from the Ragel grammar instead.

// Per-transition action lists (first element is the count).
static const char _JJS_RBS_actions[] = {
	0, 1, 1, 1, 2, 1, 4, 1,
	5, 2, 0, 5, 2, 1, 4, 2,
	3, 5
};

// Offset into _JJS_RBS_trans_keys for each state.
static const unsigned char _JJS_RBS_key_offsets[] = {
	0, 0, 1, 2, 3, 25, 25, 25,
	31, 44, 50, 56, 67, 73, 79, 85,
	96, 102, 108, 114, 120, 126, 137, 143,
	154, 160, 166, 172, 178, 178, 183, 183,
	183, 184
};

// Transition keys: single keys first, then lo/hi range pairs.
static const char _JJS_RBS_trans_keys[] = {
	116, 115, 101, 98, 99, 100, 102, 103,
	104, 105, 106, 107, 108, 109, 110, 112,
	113, 114, 115, 116, 118, 119, 120, 121,
	122, 97, 98, 101, 105, 111, 117, 98,
	99, 100, 105, 111, 117, 122, 97, 101,
	102, 109, 112, 120, 97, 100, 101, 105,
	111, 117, 97, 101, 102, 105, 111, 117,
	97, 101, 103, 105, 111, 117, 122, 98,
	109, 112, 120, 97, 101, 104, 105, 111,
	117, 97, 101, 105, 106, 111, 117, 97,
	101, 105, 107, 111, 117, 97, 101, 105,
	108, 111, 117, 122, 98, 109, 112, 120,
	97, 101, 105, 109, 111, 117, 97, 101,
	105, 110, 111, 117, 97, 101, 105, 111,
	112, 117, 97, 101, 105, 111, 113, 117,
	97, 101, 105, 111, 114, 117, 97, 101,
	105, 111, 115, 117, 122, 98, 109, 112,
	120, 97, 101, 105, 111, 116, 117, 97,
	101, 105, 111, 117, 118, 122, 98, 109,
	112, 120, 97, 101, 105, 111, 117, 119,
	97, 101, 105, 111, 117, 120, 97, 101,
	105, 111, 117, 121, 97, 101, 105, 111,
	117, 122, 97, 101, 105, 111, 117, 101,
	97, 122, 0
};

// Number of single-character keys per state.
static const char _JJS_RBS_single_lengths[] = {
	0, 1, 1, 1, 22, 0, 0, 6,
	7, 6, 6, 7, 6, 6, 6, 7,
	6, 6, 6, 6, 6, 7, 6, 7,
	6, 6, 6, 6, 0, 5, 0, 0,
	1, 0
};

// Number of key ranges per state.
static const char _JJS_RBS_range_lengths[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	3, 0, 0, 2, 0, 0, 0, 2,
	0, 0, 0, 0, 0, 2, 0, 2,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 1
};

// Offset into _JJS_RBS_indicies for each state.
static const unsigned char _JJS_RBS_index_offsets[] = {
	0, 0, 2, 4, 6, 29, 30, 31,
	38, 49, 56, 63, 73, 80, 87, 94,
	104, 111, 118, 125, 132, 139, 149, 156,
	166, 173, 180, 187, 194, 195, 201, 202,
	203, 205
};

// Maps (state, matched key) to a transition index.
static const char _JJS_RBS_indicies[] = {
	0, 1, 2, 1, 3, 1, 5, 6,
	7, 8, 9, 10, 11, 12, 13, 14,
	15, 16, 17, 18, 19, 20, 21, 22,
	23, 24, 25, 26, 4, 27, 28, 29,
	30, 29, 29, 29, 29, 27, 31, 32,
	31, 29, 29, 29, 31, 29, 31, 31,
	27, 29, 30, 29, 29, 29, 29, 27,
	29, 29, 30, 29, 29, 29, 27, 29,
	29, 32, 29, 29, 29, 31, 31, 31,
	27, 29, 29, 30, 29, 29, 29, 27,
	29, 29, 29, 30, 29, 29, 27, 29,
	29, 29, 30, 29, 29, 27, 29, 29,
	29, 33, 29, 29, 31, 31, 31, 27,
	29, 29, 29, 30, 29, 29, 27, 34,
	34, 34, 30, 34, 34, 27, 34, 34,
	34, 34, 30, 34, 27, 29, 29, 29,
	29, 30, 29, 27, 29, 29, 29, 29,
	30, 29, 27, 29, 29, 29, 29, 32,
	29, 31, 31, 31, 27, 29, 29, 29,
	29, 30, 29, 27, 29, 29, 29, 29,
	29, 32, 31, 31, 31, 27, 34, 34,
	34, 34, 34, 30, 27, 34, 34, 34,
	34, 34, 30, 27, 29, 29, 29, 29,
	29, 30, 27, 29, 29, 29, 29, 29,
	30, 27, 1, 35, 35, 35, 35, 35,
	28, 28, 27, 28, 36, 35, 28, 0
};

// Target state of each transition.
static const char _JJS_RBS_trans_targs[] = {
	2, 0, 3, 4, 5, 7, 8, 9,
	10, 11, 12, 31, 13, 14, 15, 16,
	17, 18, 19, 20, 21, 22, 23, 24,
	25, 26, 27, 6, 28, 29, 30, 30,
	30, 32, 33, 28, 28
};

// Offset into _JJS_RBS_actions for each transition (0 = no actions).
static const char _JJS_RBS_trans_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 3, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 7, 5, 1, 5,
	12, 12, 5, 15, 9
};

// Initial automaton state.
static const int JJS_RBS_start = 1;
7313 
// Guesses lemmas for superlative forms (JJS/RBS tags).  The body is a
// Ragel-generated automaton driven by the _JJS_RBS_* tables above; note
// that although p advances forward, every character is fetched through a
// mirrored index, so the form is effectively matched from its end.
// The winning rule (lowest `best`) determines how many trailing
// characters to strip and what to append to obtain the lemma.
void english_morpho_guesser::add_JJS_RBS(const string& form, unsigned negation_len, vector<tagged_lemma>& lemmas) const {
  // p scans the negation-stripped form; cs is the current automaton state.
  const char* p = form.c_str() + negation_len; int cs;
  // best: priority of the best rule so far ('a' highest, 'z' = none yet);
  // remove/append: lemma = form minus `remove` chars, plus `append`.
  char best = 'z'; unsigned remove = 0; const char* append = nullptr;

	{
	cs = JJS_RBS_start;
	}

	{
	int _klen;
	unsigned int _trans;
	const char *_acts;
	unsigned int _nacts;
	const char *_keys;

	if ( p == ( (form.c_str() + form.size())) )
		goto _test_eof;
	if ( cs == 0 )
		goto _out;
_resume:
	_keys = _JJS_RBS_trans_keys + _JJS_RBS_key_offsets[cs];
	_trans = _JJS_RBS_index_offsets[cs];

	// Binary search among the state's single-character keys.
	_klen = _JJS_RBS_single_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + _klen - 1;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + ((_upper-_lower) >> 1);
			if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid )
				_upper = _mid - 1;
			else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid )
				_lower = _mid + 1;
			else {
				_trans += (unsigned int)(_mid - _keys);
				goto _match;
			}
		}
		_keys += _klen;
		_trans += _klen;
	}

	// Binary search among the state's (lo, hi) key ranges.
	_klen = _JJS_RBS_range_lengths[cs];
	if ( _klen > 0 ) {
		const char *_lower = _keys;
		const char *_mid;
		const char *_upper = _keys + (_klen<<1) - 2;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
			if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] )
				_upper = _mid - 2;
			else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] )
				_lower = _mid + 2;
			else {
				_trans += (unsigned int)((_mid - _keys)>>1);
				goto _match;
			}
		}
		_trans += _klen;
	}

_match:
	// Take the transition and execute any attached actions; each action
	// records a candidate strip/append rule if it beats the current best.
	_trans = _JJS_RBS_indicies[_trans];
	cs = _JJS_RBS_trans_targs[_trans];

	if ( _JJS_RBS_trans_actions[_trans] == 0 )
		goto _again;

	_acts = _JJS_RBS_actions + _JJS_RBS_trans_actions[_trans];
	_nacts = (unsigned int) *_acts++;
	while ( _nacts-- > 0 )
	{
		switch ( *_acts++ )
		{
	case 0:
	{ if (best > 'a') best = 'a', remove = 3, append = nullptr; }
	break;
	case 1:
	{ if (best > 'b') best = 'b', remove = 4, append = nullptr; }
	break;
	case 2:
	{ if (best > 'c') best = 'c', remove = 4, append = "y";     }
	break;
	case 3:
	{ if (best > 'd') best = 'd', remove = 3, append = nullptr; }
	break;
	case 4:
	{ if (best > 'e') best = 'e', remove = 2, append = nullptr; }
	break;
	case 5:
	{ if (best > 'f') best = 'f', remove = 3, append = nullptr; }
	break;
		}
	}

_again:
	if ( cs == 0 )
		goto _out;
	if ( ++p != ( (form.c_str() + form.size())) )
		goto _resume;
	_test_eof: {}
	_out: {}
	}

  // Emit the lemma produced by the winning rule (remove = 0, append = null
  // when nothing matched, i.e. the lemma equals the form).
  add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
}
7427 
7428 } // namespace morphodita
7429 
7430 /////////
7431 // File: morphodita/morpho/external_morpho.h
7432 /////////
7433 
7434 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
7435 //
7436 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7437 // Mathematics and Physics, Charles University in Prague, Czech Republic.
7438 //
7439 // This Source Code Form is subject to the terms of the Mozilla Public
7440 // License, v. 2.0. If a copy of the MPL was not distributed with this
7441 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7442 
7443 namespace morphodita {
7444 
// Morphology that delegates all analysis to the caller: the "form"
// handed to analyze/generate is expected to already carry the analyses
// as space-separated tokens (see the definitions below); the model
// itself stores only the tag used for unanalyzable input.
class external_morpho : public morpho {
 public:
  // version: tokenizer version forwarded to generic_tokenizer.
  external_morpho(unsigned version) : version(version) {}

  virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector<tagged_lemma>& lemmas) const override;
  virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector<tagged_lemma_forms>& forms) const override;
  virtual int raw_lemma_len(string_piece lemma) const override;
  virtual int lemma_id_len(string_piece lemma) const override;
  virtual int raw_form_len(string_piece form) const override;
  virtual tokenizer* new_tokenizer() const override;

  // Deserializes the model (just the unknown tag) from a compressed stream.
  bool load(istream& is);

 private:
  unsigned version;  // tokenizer version

  string unknown_tag;  // tag emitted when no analyses are supplied
};
7463 
7464 } // namespace morphodita
7465 
7466 /////////
7467 // File: morphodita/tokenizer/generic_tokenizer.h
7468 /////////
7469 
7470 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
7471 //
7472 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7473 // Mathematics and Physics, Charles University in Prague, Czech Republic.
7474 //
7475 // This Source Code Form is subject to the terms of the Mozilla Public
7476 // License, v. 2.0. If a copy of the MPL was not distributed with this
7477 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7478 
7479 namespace morphodita {
7480 
// Language-independent tokenizer built on top of ragel_tokenizer.
class generic_tokenizer : public ragel_tokenizer {
 public:
  // Newest tokenizer version understood by this class.
  enum { LATEST = 2 };
  // version selects the tokenization rules to emulate.
  generic_tokenizer(unsigned version);

  virtual bool next_sentence(vector<token_range>& tokens) override;
};
7488 
7489 } // namespace morphodita
7490 
7491 /////////
7492 // File: morphodita/morpho/external_morpho.cpp
7493 /////////
7494 
7495 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
7496 //
7497 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7498 // Mathematics and Physics, Charles University in Prague, Czech Republic.
7499 //
7500 // This Source Code Form is subject to the terms of the Mozilla Public
7501 // License, v. 2.0. If a copy of the MPL was not distributed with this
7502 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7503 
7504 namespace morphodita {
7505 
load(istream & is)7506 bool external_morpho::load(istream& is) {
7507   binary_decoder data;
7508   if (!compressor::load(is, data)) return false;
7509 
7510   try {
7511     // Load unknown_tag
7512     unsigned length = data.next_1B();
7513     unknown_tag.assign(data.next<char>(length), length);
7514   } catch (binary_decoder_error&) {
7515     return false;
7516   }
7517 
7518   return data.is_end();
7519 }
7520 
analyze(string_piece form,guesser_mode,vector<tagged_lemma> & lemmas) const7521 int external_morpho::analyze(string_piece form, guesser_mode /*guesser*/, vector<tagged_lemma>& lemmas) const {
7522   lemmas.clear();
7523 
7524   if (form.len) {
7525     // Start by skipping the first form
7526     string_piece lemmatags = form;
7527     while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
7528     if (lemmatags.len) lemmatags.len--, lemmatags.str++;
7529 
7530     // Split lemmatags using ' ' into lemma-tag pairs.
7531     while (lemmatags.len) {
7532       auto lemma_start = lemmatags.str;
7533       while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
7534       if (!lemmatags.len) break;
7535       auto lemma_len = lemmatags.str - lemma_start;
7536       lemmatags.len--, lemmatags.str++;
7537 
7538       auto tag_start = lemmatags.str;
7539       while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
7540       auto tag_len = lemmatags.str - tag_start;
7541       if (lemmatags.len) lemmatags.len--, lemmatags.str++;
7542 
7543       lemmas.emplace_back(string(lemma_start, lemma_len), string(tag_start, tag_len));
7544     }
7545 
7546     if (!lemmas.empty()) return NO_GUESSER;
7547   }
7548 
7549   lemmas.emplace_back(string(form.str, form.len), unknown_tag);
7550   return -1;
7551 }
7552 
generate(string_piece lemma,const char * tag_wildcard,morpho::guesser_mode,vector<tagged_lemma_forms> & forms) const7553 int external_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /*guesser*/, vector<tagged_lemma_forms>& forms) const {
7554   forms.clear();
7555 
7556   tag_filter filter(tag_wildcard);
7557 
7558   if (lemma.len) {
7559     // Start by locating the lemma
7560     string_piece formtags = lemma;
7561     while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
7562     string_piece real_lemma(lemma.str, lemma.len - formtags.len);
7563     if (formtags.len) formtags.len--, formtags.str++;
7564 
7565     // Split formtags using ' ' into form-tag pairs.
7566     bool any_result = false;
7567     while (formtags.len) {
7568       auto form_start = formtags.str;
7569       while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
7570       if (!formtags.len) break;
7571       auto form_len = formtags.str - form_start;
7572       formtags.len--, formtags.str++;
7573 
7574       auto tag_start = formtags.str;
7575       while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
7576       auto tag_len = formtags.str - tag_start;
7577       if (formtags.len) formtags.len--, formtags.str++;
7578 
7579       any_result = true;
7580       string tag(tag_start, tag_len);
7581       if (filter.matches(tag.c_str())) {
7582         if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len));
7583         forms.back().forms.emplace_back(string(form_start, form_len), tag);
7584       }
7585     }
7586 
7587     if (any_result) return NO_GUESSER;
7588   }
7589 
7590   return -1;
7591 }
7592 
raw_lemma_len(string_piece lemma) const7593 int external_morpho::raw_lemma_len(string_piece lemma) const {
7594   unsigned lemma_len = 0;
7595   while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++;
7596   return lemma_len;
7597 }
7598 
lemma_id_len(string_piece lemma) const7599 int external_morpho::lemma_id_len(string_piece lemma) const {
7600   unsigned lemma_len = 0;
7601   while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++;
7602   return lemma_len;
7603 }
7604 
raw_form_len(string_piece form) const7605 int external_morpho::raw_form_len(string_piece form) const {
7606   unsigned form_len = 0;
7607   while (form_len < form.len && form.str[form_len] != ' ') form_len++;
7608   return form_len;
7609 }
7610 
// Creates a generic tokenizer of the version this model was constructed with.
// Caller owns the returned tokenizer.
tokenizer* external_morpho::new_tokenizer() const {
  return new generic_tokenizer(version);
}
7614 
7615 } // namespace morphodita
7616 
7617 /////////
7618 // File: morphodita/morpho/generic_lemma_addinfo.h
7619 /////////
7620 
7621 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
7622 //
7623 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7624 // Mathematics and Physics, Charles University in Prague, Czech Republic.
7625 //
7626 // This Source Code Form is subject to the terms of the Mozilla Public
7627 // License, v. 2.0. If a copy of the MPL was not distributed with this
7628 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7629 
7630 namespace morphodita {
7631 
7632 // Declarations
// Lemma "additional info" policy for generic models.  Generic lemmas
// carry no extra encoded information, so the whole lemma string serves
// as both the raw lemma and the lemma id (see the trivial definitions
// below).
struct generic_lemma_addinfo {
  inline static int raw_lemma_len(string_piece lemma);
  inline static int lemma_id_len(string_piece lemma);
  // Renders the addinfo as text (always empty here).
  inline static string format(const unsigned char* addinfo, int addinfo_len);
  inline static bool generatable(const unsigned char* addinfo, int addinfo_len);

  // Parses addinfo out of the lemma; returns the raw lemma length.
  inline int parse(string_piece lemma, bool die_on_failure = false);
  inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len);

  // Encoded addinfo bytes; unused by the trivial implementation below.
  vector<unsigned char> data;
};
7644 
// Definitions
// Generic models have no lemma addinfo, so each method is trivial.

// The whole lemma string is the raw lemma.
int generic_lemma_addinfo::raw_lemma_len(string_piece lemma) {
  return lemma.len;
}

// The whole lemma string is also the lemma id.
int generic_lemma_addinfo::lemma_id_len(string_piece lemma) {
  return lemma.len;
}

// There is no addinfo to render.
string generic_lemma_addinfo::format(const unsigned char* /*addinfo*/, int /*addinfo_len*/) {
  return string();
}

// Every lemma can be used for generation.
bool generic_lemma_addinfo::generatable(const unsigned char* /*addinfo*/, int /*addinfo_len*/) {
  return true;
}

// Nothing to parse; the whole input is the lemma.
int generic_lemma_addinfo::parse(string_piece lemma, bool /*die_on_failure*/) {
  return lemma.len;
}

// With no addinfo, every lemma id trivially matches.
bool generic_lemma_addinfo::match_lemma_id(const unsigned char* /*other_addinfo*/, int /*other_addinfo_len*/) {
  return true;
}
7669 
7670 } // namespace morphodita
7671 
7672 /////////
7673 // File: morphodita/morpho/generic_morpho.h
7674 /////////
7675 
7676 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
7677 //
7678 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7679 // Mathematics and Physics, Charles University in Prague, Czech Republic.
7680 //
7681 // This Source Code Form is subject to the terms of the Mozilla Public
7682 // License, v. 2.0. If a copy of the MPL was not distributed with this
7683 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7684 
7685 namespace morphodita {
7686 
// Morphology for generic (language-independent) models: dictionary
// lookup with special handling of numbers, punctuation and symbols,
// plus an optional statistical guesser for unknown forms.
class generic_morpho : public morpho {
 public:
  // version: tokenizer version forwarded to generic_tokenizer.
  generic_morpho(unsigned version) : version(version) {}

  virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector<tagged_lemma>& lemmas) const override;
  virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector<tagged_lemma_forms>& forms) const override;
  virtual int raw_lemma_len(string_piece lemma) const override;
  virtual int lemma_id_len(string_piece lemma) const override;
  virtual int raw_form_len(string_piece form) const override;
  virtual tokenizer* new_tokenizer() const override;

  // Deserializes the model (tags, dictionary, optional statistical
  // guesser) from a compressed stream; returns false on malformed input.
  bool load(istream& is);
 private:
  // Adds analyses for numbers, punctuation and symbols (see definition).
  inline void analyze_special(string_piece form, vector<tagged_lemma>& lemmas) const;

  unsigned version;  // tokenizer version
  morpho_dictionary<generic_lemma_addinfo> dictionary;  // main dictionary
  unique_ptr<morpho_statistical_guesser> statistical_guesser;  // may be null

  // Tags loaded from the model for the corresponding token classes.
  string unknown_tag, number_tag, punctuation_tag, symbol_tag;
};
7708 
7709 } // namespace morphodita
7710 
7711 /////////
7712 // File: morphodita/morpho/generic_morpho.cpp
7713 /////////
7714 
7715 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
7716 //
7717 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7718 // Mathematics and Physics, Charles University in Prague, Czech Republic.
7719 //
7720 // This Source Code Form is subject to the terms of the Mozilla Public
7721 // License, v. 2.0. If a copy of the MPL was not distributed with this
7722 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7723 
7724 namespace morphodita {
7725 
load(istream & is)7726 bool generic_morpho::load(istream& is) {
7727   binary_decoder data;
7728   if (!compressor::load(is, data)) return false;
7729 
7730   try {
7731     // Load tags
7732     unsigned length = data.next_1B();
7733     unknown_tag.assign(data.next<char>(length), length);
7734     length = data.next_1B();
7735     number_tag.assign(data.next<char>(length), length);
7736     length = data.next_1B();
7737     punctuation_tag.assign(data.next<char>(length), length);
7738     length = data.next_1B();
7739     symbol_tag.assign(data.next<char>(length), length);
7740 
7741     // Load dictionary
7742     dictionary.load(data);
7743 
7744     // Optionally statistical guesser if present
7745     statistical_guesser.reset();
7746     if (data.next_1B()) {
7747       statistical_guesser.reset(new morpho_statistical_guesser());
7748       statistical_guesser->load(data);
7749     }
7750   } catch (binary_decoder_error&) {
7751     return false;
7752   }
7753 
7754   return data.is_end();
7755 }
7756 
analyze(string_piece form,guesser_mode guesser,vector<tagged_lemma> & lemmas) const7757 int generic_morpho::analyze(string_piece form, guesser_mode guesser, vector<tagged_lemma>& lemmas) const {
7758   lemmas.clear();
7759 
7760   if (form.len) {
7761     // Generate all casing variants if needed (they are different than given form).
7762     string form_uclc; // first uppercase, rest lowercase
7763     string form_lc;   // all lowercase
7764     generate_casing_variants(form, form_uclc, form_lc);
7765 
7766     // Start by analysing using the dictionary and all casing variants.
7767     dictionary.analyze(form, lemmas);
7768     if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
7769     if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
7770     if (!lemmas.empty()) return NO_GUESSER;
7771 
7772     // Then call analyze_special to handle numbers, punctuation and symbols.
7773     analyze_special(form, lemmas);
7774     if (!lemmas.empty()) return NO_GUESSER;
7775 
7776     // For the statistical guesser, use all casing variants.
7777     if (guesser == GUESSER && statistical_guesser) {
7778       if (form_uclc.empty() && form_lc.empty())
7779         statistical_guesser->analyze(form, lemmas, nullptr);
7780       else {
7781         morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3);
7782         statistical_guesser->analyze(form, lemmas, &used_rules);
7783         if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules);
7784         if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules);
7785       }
7786     }
7787     if (!lemmas.empty()) return GUESSER;
7788   }
7789 
7790   lemmas.emplace_back(string(form.str, form.len), unknown_tag);
7791   return -1;
7792 }
7793 
generate(string_piece lemma,const char * tag_wildcard,morpho::guesser_mode,vector<tagged_lemma_forms> & forms) const7794 int generic_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /*guesser*/, vector<tagged_lemma_forms>& forms) const {
7795   forms.clear();
7796 
7797   tag_filter filter(tag_wildcard);
7798 
7799   if (lemma.len) {
7800     if (dictionary.generate(lemma, filter, forms))
7801       return NO_GUESSER;
7802   }
7803 
7804   return -1;
7805 }
7806 
// Delegates to the addinfo policy, which treats the whole string as the lemma.
int generic_morpho::raw_lemma_len(string_piece lemma) const {
  return generic_lemma_addinfo::raw_lemma_len(lemma);
}

// Likewise, the lemma id spans the whole lemma string.
int generic_morpho::lemma_id_len(string_piece lemma) const {
  return generic_lemma_addinfo::lemma_id_len(lemma);
}

// Forms carry no extra annotation; the raw form is the whole string.
int generic_morpho::raw_form_len(string_piece form) const {
  return form.len;
}

// Creates a generic tokenizer matching this model's version.
// Caller owns the returned tokenizer.
tokenizer* generic_morpho::new_tokenizer() const {
  return new generic_tokenizer(version);
}
7822 
void generic_morpho::analyze_special(string_piece form, vector<tagged_lemma>& lemmas) const {
  using namespace unilib;

  // Analyzer for numbers, punctuation and symbols.
  // Number is anything matching [+-]? is_Pn* ([.,] is_Pn*)? ([Ee] [+-]? is_Pn+)? for at least one is_Pn* nonempty.
  // Punctuation is any form beginning with either unicode punctuation or punctuation_exceptions character.
  // Beware that numbers takes precedence, so - is punctuation, -3 is number, -. is punctuation, -.3 is number.
  if (!form.len) return;

  string_piece number = form;
  char32_t first = utf8::decode(number.str, number.len);

  // Try matching a number.  Each decode consumes one codepoint from
  // `number`, so at the end `number.len == 0` means the form was fully
  // consumed.
  char32_t codepoint = first;
  bool any_digit = false;
  if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
  while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
  // Optional decimal part; '.' is accepted here only when more input follows.
  if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len);
  while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
  // Optional exponent; digits are required after it (any_digit is reset).
  if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
    codepoint = utf8::decode(number.str, number.len);
    if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
    any_digit = false;
    while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
  }

  // Accept only a fully consumed form; a trailing '.' is tolerated but
  // excluded from the lemma.
  if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
    lemmas.emplace_back(string(form.str, form.len - (codepoint == '.')), number_tag);
    return;
  }

  // Try matching punctuation or symbol: every codepoint of the form must
  // belong to the respective Unicode category (punctuation wins ties).
  bool punctuation = true, symbol = true;
  string_piece form_ori = form;
  while (form.len) {
    codepoint = utf8::decode(form.str, form.len);
    punctuation = punctuation && unicode::category(codepoint) & unicode::P;
    symbol = symbol && unicode::category(codepoint) & unicode::S;
  }
  if (punctuation)
    lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag);
  else if (symbol)
    lemmas.emplace_back(string(form_ori.str, form_ori.len), symbol_tag);
}
7867 
7868 } // namespace morphodita
7869 
7870 /////////
7871 // File: morphodita/morpho/generic_morpho_encoder.h
7872 /////////
7873 
7874 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
7875 //
7876 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7877 // Mathematics and Physics, Charles University in Prague, Czech Republic.
7878 //
7879 // This Source Code Form is subject to the terms of the Mozilla Public
7880 // License, v. 2.0. If a copy of the MPL was not distributed with this
7881 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7882 
7883 namespace morphodita {
7884 
// Encoder producing binary generic morphology models: combines a raw
// textual dictionary, the four special tags and an optional statistical
// guesser into the stream format consumed by generic_morpho::load.
class generic_morpho_encoder {
 public:
  // Tags stored in the model for the corresponding token classes.
  struct tags {
    string unknown_tag, number_tag, punctuation_tag, symbol_tag;
  };
  // max_suffix_len bounds the dictionary suffix length during encoding;
  // NOTE(review): the exact input stream formats are defined by the
  // implementation, which is not visible here.
  static void encode(istream& in_dictionary, int max_suffix_len, const tags& tags, istream& in_statistical_guesser, ostream& out_morpho);
};
7892 
7893 } // namespace morphodita
7894 
7895 /////////
7896 // File: morphodita/morpho/persistent_unordered_map_encoder.h
7897 /////////
7898 
7899 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
7900 //
7901 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7902 // Mathematics and Physics, Charles University in Prague, Czech Republic.
7903 //
7904 // This Source Code Form is subject to the terms of the Mozilla Public
7905 // License, v. 2.0. If a copy of the MPL was not distributed with this
7906 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7907 
7908 namespace morphodita {
7909 
// Builds the persistent map from an unordered_map.  The data is first
// copied into a sorted std::map so that construction order — and hence
// the resulting binary layout — is platform independent.
template <class Entry, class EntryEncode>
persistent_unordered_map::persistent_unordered_map(const unordered_map<string, Entry>& map, double load_factor, EntryEncode entry_encode) {
  construct(std::map<string, Entry>(map.begin(), map.end()), load_factor, entry_encode);
}
7914 
7915 template <class Entry, class EntryEncode>
persistent_unordered_map(const unordered_map<string,Entry> & map,double load_factor,bool add_prefixes,bool add_suffixes,EntryEncode entry_encode)7916 persistent_unordered_map::persistent_unordered_map(const unordered_map<string, Entry>& map, double load_factor, bool add_prefixes, bool add_suffixes, EntryEncode entry_encode) {
7917   // Copy data, possibly including prefixes and suffixes
7918   std::map<string, Entry> enlarged_map(map.begin(), map.end());
7919 
7920   for (auto&& entry : map) {
7921     const string& key = entry.first;
7922 
7923     if (!key.empty() && add_prefixes)
7924       for (unsigned i = key.size() - 1; i; i--)
7925         enlarged_map[key.substr(0, i)];
7926 
7927     if (!key.empty() && add_suffixes)
7928       for (unsigned i = 1; i < key.size(); i++)
7929         enlarged_map[key.substr(i)];
7930   }
7931 
7932   construct(enlarged_map, load_factor, entry_encode);
7933 }
7934 
7935 // We could (and used to) use unordered_map as input parameter.
7936 // Nevertheless, as order is unspecified, the resulting persistent_unordered_map
7937 // has different collision chains when generated on 32-bit and 64-bit machines.
7938 // To guarantee uniform binary representation, we use map instead.
// Three-phase construction of the persistent map; the add/done_adding/
// fill/done_filling protocol must be followed in exactly this order.
template <class Entry, class EntryEncode>
void persistent_unordered_map::construct(const map<string, Entry>& map, double load_factor, EntryEncode entry_encode) {
  // 1) Count number of elements for each size
  vector<int> sizes;
  for (auto&& elem : map) {
    unsigned len = elem.first.size();
    if (len >= sizes.size()) sizes.resize(len + 1);
    sizes[len]++;
  }
  // resize is called once per key length — presumably allocating one hash
  // table per length; TODO confirm against resize's definition.
  for (auto&& size : sizes)
    resize(unsigned(load_factor * size));

  // 2) Add sizes of element data
  // Each entry is encoded twice: here only to learn how much space it
  // needs, and again below to actually fill that space.
  for (auto&& elem : map) {
    binary_encoder enc;
    entry_encode(enc, elem.second);
    add(elem.first.c_str(), elem.first.size(), enc.data.size());
  }
  done_adding();

  // 3) Fill in element data
  for (auto&& elem : map) {
    binary_encoder enc;
    entry_encode(enc, elem.second);
    small_memcpy(fill(elem.first.c_str(), elem.first.size(), enc.data.size()), enc.data.data(), enc.data.size());
  }
  done_filling();
}
7967 
// Serializes the map: a one-byte count of hash tables followed by each
// table in order.
void persistent_unordered_map::save(binary_encoder& enc) {
  enc.add_1B(hashes.size());

  for (auto&& hash : hashes)
    hash.save(enc);
}

// Serializes one hash table as two length-prefixed byte arrays: the
// `hash` structure and the packed element `data`.
void persistent_unordered_map::fnv_hash::save(binary_encoder& enc) {
  enc.add_4B(hash.size());
  enc.add_data(hash);

  enc.add_4B(data.size());
  enc.add_data(data);
}
7982 
7983 } // namespace morphodita
7984 
7985 /////////
7986 // File: morphodita/morpho/raw_morpho_dictionary_reader.h
7987 /////////
7988 
7989 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
7990 //
7991 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7992 // Mathematics and Physics, Charles University in Prague, Czech Republic.
7993 //
7994 // This Source Code Form is subject to the terms of the Mozilla Public
7995 // License, v. 2.0. If a copy of the MPL was not distributed with this
7996 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7997 
7998 namespace morphodita {
7999 
// Reads a raw (textual) morphological dictionary from a stream, yielding
// one lemma at a time together with its (form, tag) pairs.
class raw_morpho_dictionary_reader {
 public:
  // The stream is borrowed and must outlive the reader.
  raw_morpho_dictionary_reader(istream& in) : in(in) {}
  // Fills lemma and its tagged forms; returns false when no further
  // lemma is available.
  bool next_lemma(string& lemma, vector<pair<string, string>>& tagged_forms);
 private:
  istream& in;            // input stream being parsed
  string line;            // scratch state used by next_lemma
  vector<string> tokens;  // scratch state used by next_lemma
  unordered_set<string> seen_lemmas;  // lemmas returned so far
};
8010 
8011 } // namespace morphodita
8012 
8013 /////////
8014 // File: utils/new_unique_ptr.h
8015 /////////
8016 
8017 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
8018 //
8019 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8020 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8021 //
8022 // This Source Code Form is subject to the terms of the Mozilla Public
8023 // License, v. 2.0. If a copy of the MPL was not distributed with this
8024 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8025 
8026 namespace utils {
8027 
8028 template<typename T, typename... Args>
new_unique_ptr(Args &&...args)8029 unique_ptr<T> new_unique_ptr(Args&&... args) {
8030   return unique_ptr<T>(new T(std::forward<Args>(args)...));
8031 }
8032 
8033 } // namespace utils
8034 
8035 /////////
8036 // File: morphodita/morpho/morpho_dictionary_encoder.h
8037 /////////
8038 
8039 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8040 //
8041 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8042 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8043 //
8044 // This Source Code Form is subject to the terms of the Mozilla Public
8045 // License, v. 2.0. If a copy of the MPL was not distributed with this
8046 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8047 
8048 namespace morphodita {
8049 
8050 // Declarations
// Encodes a textual morphological dictionary read from `is` into the binary
// form appended to `enc`; max_suffix_len bounds the suffix length used when
// grouping forms into inflection classes.
template <class LemmaAddinfo>
class morpho_dictionary_encoder {
 public:
  static void encode(istream& is, int max_suffix_len, binary_encoder& enc);
};
8056 
8057 // Definitions
8058 template <class LemmaAddinfo>
8059 class dictionary {
8060  public:
8061   void load(istream& is, int max_suffix_len);
8062   void encode(binary_encoder& enc);
8063 
8064  private:
8065   class trie {
8066    public:
trie()8067     trie() : depth(0) {}
8068 
add(const char * str)8069     void add(const char* str) {
8070       if (!*str) return;
8071 
8072       for (auto&& child : children)
8073         if (child.first == *str) {
8074           child.second->add(str + 1);
8075           depth = max(depth, 1 + child.second->depth);
8076           return;
8077         }
8078       children.emplace_back(*str, new_unique_ptr<trie>());
8079       children.back().second->add(str + 1);
8080       depth = max(depth, 1 + children.back().second->depth);
8081     }
8082 
find_candidate_prefix(int max_suffix_len)8083     string find_candidate_prefix(int max_suffix_len) {
8084       string current, best;
8085       int best_length = 0;
8086       find_candidate_prefix(max_suffix_len, current, best, best_length, 0);
8087       return best;
8088     }
find_candidate_prefix(int max_suffix_len,string & current,string & best,int & best_length,int length)8089     void find_candidate_prefix(int max_suffix_len, string& current, string& best, int& best_length, int length) {
8090       if (depth < max_suffix_len && length > best_length) {
8091         best = current;
8092         best_length = length;
8093       }
8094       for (auto&& child : children) {
8095         current.push_back(child.first);
8096         child.second->find_candidate_prefix(max_suffix_len, current, best, best_length, children.size() == 1 ? length + 1 : 1);
8097         current.resize(current.size() - 1);
8098       }
8099     }
8100 
8101     vector<pair<char, unique_ptr<trie>>> children;
8102     int depth;
8103   };
8104 
8105   class histogram {
8106    public:
add(const string & str)8107     void add(const string& str) {
8108       if (str.size() >= lengths.size()) lengths.resize(str.size() + 1);
8109       lengths[str.size()].insert(str);
8110     }
8111 
encode(binary_encoder & enc)8112     void encode(binary_encoder& enc) {
8113       enc.add_1B(lengths.size());
8114       for (auto&& set : lengths)
8115         enc.add_4B(set.size());
8116     }
8117 
8118     vector<unordered_set<string>> lengths;
8119   };
8120 
8121   struct lemma_info {
lemma_infoufal::udpipe::morphodita::dictionary::lemma_info8122     lemma_info(string lemma) {
8123       this->lemma = lemma.substr(0, addinfo.parse(lemma, true));
8124     }
8125 
8126     string lemma;
8127     LemmaAddinfo addinfo;
8128     struct lemma_form_info {
lemma_form_infoufal::udpipe::morphodita::dictionary::lemma_info::lemma_form_info8129       lemma_form_info(string form, int clas) : form(form), clas(clas) {}
8130 
8131       string form;
8132       int clas;
8133 
operator <ufal::udpipe::morphodita::dictionary::lemma_info::lemma_form_info8134       bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); }
8135     };
8136     vector<lemma_form_info> forms;
8137 
operator <ufal::udpipe::morphodita::dictionary::lemma_info8138     bool operator<(const lemma_info& other) const { return lemma < other.lemma || (lemma == other.lemma && addinfo.data < other.addinfo.data); }
8139   };
8140 
8141   unordered_map<string, int> classes;
8142   unordered_map<string, map<int, vector<int>>> suffixes;
8143 
8144   vector<string> tags;
8145   unordered_map<string, int> tags_map;
8146 
8147   histogram lemmas_hist, forms_hist;
8148 
8149   vector<lemma_info> lemmas;
8150 };
8151 
8152 template <class LemmaAddinfo>
encode(istream & is,int max_suffix_len,binary_encoder & enc)8153 void morpho_dictionary_encoder<LemmaAddinfo>::encode(istream& is, int max_suffix_len, binary_encoder& enc) {
8154   dictionary<LemmaAddinfo> dict;
8155 
8156   // Load the dictionary and create classes
8157   dict.load(is, max_suffix_len);
8158 
8159   // Encode the dictionary
8160   dict.encode(enc);
8161 }
8162 
8163 template <class LemmaAddinfo>
load(istream & is,int max_suffix_len)8164 void dictionary<LemmaAddinfo>::load(istream& is, int max_suffix_len) {
8165   // Load lemmas and create classes
8166   raw_morpho_dictionary_reader raw(is);
8167   string lemma;
8168   vector<pair<string, string>> forms;
8169   while(raw.next_lemma(lemma, forms)) {
8170     // Make sure forms are unique
8171     sort(forms.begin(), forms.end());
8172     auto forms_end = unique(forms.begin(), forms.end());
8173     if (forms_end != forms.end()) {
8174 //      cerr << "Warning: repeated form-tag in lemma " << lemma << '.' << endl;
8175       forms.erase(forms_end, forms.end());
8176     }
8177 
8178     // Create lemma_info
8179     lemmas.emplace_back(lemma);
8180     auto& lemma_info = lemmas.back();
8181     lemmas_hist.add(lemma_info.lemma);
8182 
8183     // Create classes
8184     while (!forms.empty()) {
8185       trie t;
8186       for (auto&& form : forms)
8187         t.add(form.first.c_str());
8188 
8189       // Find prefix of forms in class being added.
8190       string prefix = t.find_candidate_prefix(max_suffix_len);
8191 
8192       // Find forms of the class being added.
8193       auto start = forms.begin();
8194       while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++;
8195       if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!");
8196       auto end = start;
8197       while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++;
8198 
8199       // Find common prefix of class forms -- may be larger than prefix.
8200       int common_prefix = prefix.size();
8201       while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++;
8202 
8203       string clas;
8204       for (auto form = start; form != end; form++) {
8205         if (!clas.empty()) clas.push_back('\t');
8206         clas.append(form->first, common_prefix, string::npos);
8207         clas.push_back('\t');
8208         clas.append(form->second);
8209       }
8210 
8211       auto class_it = classes.emplace(clas, classes.size());
8212       int class_id = class_it.first->second;
8213       if (class_it.second) {
8214         // New class, add it, together with its tags.
8215         for (auto form = start; form != end; form++) {
8216           int tag = tags_map.emplace(form->second, tags.size()).first->second;
8217           if (tag >= int(tags.size())) tags.emplace_back(form->second);
8218           suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag);
8219         }
8220       }
8221 
8222       // Move forms in the class being added to lemma and remove them from unprocessed forms.
8223       lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id);
8224       forms_hist.add(lemma_info.forms.back().form);
8225       forms.erase(start, end);
8226     }
8227     stable_sort(lemma_info.forms.begin(), lemma_info.forms.end());
8228   }
8229   stable_sort(lemmas.begin(), lemmas.end());
8230 }
8231 
8232 template <class LemmaAddinfo>
encode(binary_encoder & enc)8233 void dictionary<LemmaAddinfo>::encode(binary_encoder& enc) {
8234   // Encode lemmas and forms
8235   lemmas_hist.encode(enc);
8236   forms_hist.encode(enc);
8237 
8238   string prev = "";
8239   enc.add_4B(lemmas.size());
8240   for (auto&& lemma : lemmas) {
8241     int cpl = 0;
8242     while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++;
8243 
8244     enc.add_1B(prev.length() - cpl);
8245     enc.add_1B(lemma.lemma.size() - cpl);
8246     enc.add_data(lemma.lemma.substr(cpl));
8247     enc.add_1B(lemma.addinfo.data.size());
8248     enc.add_data(lemma.addinfo.data);
8249     enc.add_1B(lemma.forms.size());
8250 
8251     string prev_form = lemma.lemma;
8252     for (auto&& lemma_form : lemma.forms) {
8253       unsigned best_prev_from = 0, best_form_from = 0, best_len = 0;
8254       for (unsigned prev_from = 0; prev_from < prev_form.size(); prev_from++)
8255         for (unsigned form_from = 0; form_from < lemma_form.form.size(); form_from++) {
8256           unsigned len = 0;
8257           while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++;
8258           if (len > best_len) best_prev_from = prev_from, best_form_from = form_from, best_len = len;
8259         }
8260 
8261       enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 };
8262       enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len<prev_form.size()) +
8263              ADD_START * (best_form_from>0) + ADD_END * (best_form_from+best_len<lemma_form.form.size()));
8264       if (best_prev_from > 0) enc.add_1B(best_prev_from);
8265       if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len);
8266       if (best_form_from > 0) {
8267         enc.add_1B(best_form_from);
8268         enc.add_data(lemma_form.form.substr(0, best_form_from));
8269       }
8270       if (best_form_from + best_len < lemma_form.form.size()) {
8271         enc.add_1B(lemma_form.form.size() - best_form_from - best_len);
8272         enc.add_data(lemma_form.form.substr(best_form_from + best_len));
8273       }
8274       enc.add_2B(lemma_form.clas);
8275 
8276       prev_form = lemma_form.form;
8277     }
8278 
8279     prev = lemma.lemma;
8280   }
8281 
8282   // Encode tags
8283   enc.add_2B(tags.size());
8284   for (auto&& tag : tags) {
8285     enc.add_1B(tag.size());
8286     enc.add_data(tag);
8287   }
8288 
8289   // Encode classes
8290   persistent_unordered_map(suffixes, 5, false, true, [](binary_encoder& enc, const map<int, vector<int>>& suffix) {
8291     enc.add_2B(suffix.size());
8292     for (auto&& clas : suffix)
8293       enc.add_2B(clas.first);
8294     uint32_t tags = 0, prev_tags = 0;
8295     for (auto&& clas : suffix) {
8296       enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags);
8297       prev_tags = tags;
8298       tags += clas.second.size();
8299     }
8300     enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags);
8301     for (auto&& clas : suffix)
8302       for (auto&& tag : clas.second)
8303         enc.add_2B(tag);
8304   }).save(enc);
8305 }
8306 
8307 } // namespace morphodita
8308 
8309 /////////
8310 // File: morphodita/morpho/morpho_prefix_guesser_encoder.h
8311 /////////
8312 
8313 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8314 //
8315 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8316 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8317 //
8318 // This Source Code Form is subject to the terms of the Mozilla Public
8319 // License, v. 2.0. If a copy of the MPL was not distributed with this
8320 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8321 
8322 namespace morphodita {
8323 
// Encodes textual prefix guesser rules read from `is` into the binary
// representation appended to `enc`.
class morpho_prefix_guesser_encoder {
 public:
  static void encode(istream& is, binary_encoder& enc);
};
8328 
8329 } // namespace morphodita
8330 
8331 /////////
8332 // File: morphodita/morpho/morpho_statistical_guesser_encoder.h
8333 /////////
8334 
8335 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8336 //
8337 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8338 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8339 //
8340 // This Source Code Form is subject to the terms of the Mozilla Public
8341 // License, v. 2.0. If a copy of the MPL was not distributed with this
8342 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8343 
8344 namespace morphodita {
8345 
// Encodes textual statistical guesser rules read from `is` into the binary
// representation appended to `enc`.
class morpho_statistical_guesser_encoder {
 public:
  static void encode(istream& is, binary_encoder& enc);
};
8350 
8351 } // namespace morphodita
8352 
8353 /////////
8354 // File: morphodita/morpho/generic_morpho_encoder.cpp
8355 /////////
8356 
8357 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8358 //
8359 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8360 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8361 //
8362 // This Source Code Form is subject to the terms of the Mozilla Public
8363 // License, v. 2.0. If a copy of the MPL was not distributed with this
8364 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8365 
8366 namespace morphodita {
8367 
encode(istream & in_dictionary,int max_suffix_len,const tags & tags,istream & in_statistical_guesser,ostream & out_morpho)8368 void generic_morpho_encoder::encode(istream& in_dictionary, int max_suffix_len, const tags& tags, istream& in_statistical_guesser, ostream& out_morpho) {
8369   binary_encoder enc;
8370 
8371   enc.add_1B(tags.unknown_tag.size());
8372   enc.add_data(tags.unknown_tag);
8373   enc.add_1B(tags.number_tag.size());
8374   enc.add_data(tags.number_tag);
8375   enc.add_1B(tags.punctuation_tag.size());
8376   enc.add_data(tags.punctuation_tag);
8377   enc.add_1B(tags.symbol_tag.size());
8378   enc.add_data(tags.symbol_tag);
8379 
8380 //  cerr << "Encoding dictionary." << endl;
8381   morpho_dictionary_encoder<generic_lemma_addinfo>::encode(in_dictionary, max_suffix_len, enc);
8382 
8383   // Load and encode statistical guesser if requested
8384   enc.add_1B(bool(in_statistical_guesser));
8385   if (in_statistical_guesser) {
8386 //    cerr << "Encoding statistical guesser." << endl;
8387     morpho_statistical_guesser_encoder::encode(in_statistical_guesser, enc);
8388   }
8389 
8390   // done, save the dictionary
8391 //  cerr << "Compressing dictionary." << endl;
8392   if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!");
8393 //  cerr << "Dictionary saved." << endl;
8394 }
8395 
8396 } // namespace morphodita
8397 
8398 /////////
8399 // File: morphodita/morpho/morpho_ids.h
8400 /////////
8401 
8402 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8403 //
8404 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8405 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8406 //
8407 // This Source Code Form is subject to the terms of the Mozilla Public
8408 // License, v. 2.0. If a copy of the MPL was not distributed with this
8409 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8410 
8411 namespace morphodita {
8412 
// Identifiers of the morphology implementations stored in model files,
// together with parsing of their textual names.
class morpho_ids {
 public:
  enum morpho_id {
    CZECH = 0,
    ENGLISH_V1 = 1,
    GENERIC = 2,
    EXTERNAL = 3,
    ENGLISH_V2 = 4,
    ENGLISH_V3 = 5, ENGLISH = ENGLISH_V3,
    SLOVAK_PDT = 6,
    DERIVATOR_DICTIONARY = 7,
  };

  // Parses a textual morphology name into its id; returns false for
  // unrecognized names. Versioned english ids and DERIVATOR_DICTIONARY
  // have no textual name ("english" maps to the latest ENGLISH_V3).
  static bool parse(const std::string& str, morpho_id& id) {
    if (str == "czech") return id = CZECH, true;
    if (str == "english") return id = ENGLISH, true;
    if (str == "external") return id = EXTERNAL, true;
    if (str == "generic") return id = GENERIC, true;
    if (str == "slovak_pdt") return id = SLOVAK_PDT, true;
    return false;
  }
};
8435 
8436 typedef morpho_ids::morpho_id morpho_id;
8437 
8438 } // namespace morphodita
8439 
8440 /////////
8441 // File: morphodita/morpho/morpho.cpp
8442 /////////
8443 
8444 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8445 //
8446 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8447 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8448 //
8449 // This Source Code Form is subject to the terms of the Mozilla Public
8450 // License, v. 2.0. If a copy of the MPL was not distributed with this
8451 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8452 
8453 namespace morphodita {
8454 
load(istream & is)8455 morpho* morpho::load(istream& is) {
8456   morpho_id id = morpho_id(is.get());
8457   switch (id) {
8458     case morpho_ids::CZECH:
8459       {
8460         auto res = new_unique_ptr<czech_morpho>(czech_morpho::morpho_language::CZECH, 1);
8461         if (res->load(is)) return res.release();
8462         break;
8463       }
8464     case morpho_ids::ENGLISH_V1:
8465     case morpho_ids::ENGLISH_V2:
8466     case morpho_ids::ENGLISH_V3:
8467       {
8468         auto res = new_unique_ptr<english_morpho>(id == morpho_ids::ENGLISH_V1 ? 1 :
8469                                                   id == morpho_ids::ENGLISH_V2 ? 2 :
8470                                                   3);
8471         if (res->load(is)) return res.release();
8472         break;
8473       }
8474     case morpho_ids::EXTERNAL:
8475       {
8476         auto res = new_unique_ptr<external_morpho>(1);
8477         if (res->load(is)) return res.release();
8478         break;
8479       }
8480     case morpho_ids::GENERIC:
8481       {
8482         auto res = new_unique_ptr<generic_morpho>(1);
8483         if (res->load(is)) return res.release();
8484         break;
8485       }
8486     case morpho_ids::SLOVAK_PDT:
8487       {
8488         auto res = new_unique_ptr<czech_morpho>(czech_morpho::morpho_language::SLOVAK, 3);
8489         if (res->load(is)) return res.release();
8490         break;
8491       }
8492     case morpho_ids::DERIVATOR_DICTIONARY:
8493       {
8494         auto derinet = new_unique_ptr<derivator_dictionary>();
8495         if (!derinet->load(is)) return nullptr;
8496 
8497         unique_ptr<morpho> dictionary(load(is));
8498         if (!dictionary) return nullptr;
8499         derinet->dictionary = dictionary.get();
8500         dictionary->derinet.reset(derinet.release());
8501         return dictionary.release();
8502       }
8503   }
8504 
8505   return nullptr;
8506 }
8507 
load(const char * fname)8508 morpho* morpho::load(const char* fname) {
8509   ifstream f(fname, ifstream::binary);
8510   if (!f) return nullptr;
8511 
8512   return load(f);
8513 }
8514 
get_derivator() const8515 const derivator* morpho::get_derivator() const {
8516   return derinet.get();
8517 }
8518 
8519 } // namespace morphodita
8520 
8521 /////////
8522 // File: morphodita/morpho/morpho_statistical_guesser.cpp
8523 /////////
8524 
8525 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8526 //
8527 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8528 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8529 //
8530 // This Source Code Form is subject to the terms of the Mozilla Public
8531 // License, v. 2.0. If a copy of the MPL was not distributed with this
8532 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8533 
8534 namespace morphodita {
8535 
load(binary_decoder & data)8536 void morpho_statistical_guesser::load(binary_decoder& data) {
8537   // Load tags and default tag
8538   tags.resize(data.next_2B());
8539   for (auto&& tag : tags) {
8540     tag.resize(data.next_1B());
8541     for (unsigned i = 0; i < tag.size(); i++)
8542       tag[i] = data.next_1B();
8543   }
8544   default_tag = data.next_2B();
8545 
8546   // Load rules
8547   rules.load(data);
8548 }
8549 
8550 // Helper method for analyze.
contains(morpho_statistical_guesser::used_rules * used,const string & rule)8551 static bool contains(morpho_statistical_guesser::used_rules* used, const string& rule) {
8552   if (!used) return false;
8553 
8554   for (auto&& used_rule : *used)
8555     if (used_rule == rule)
8556       return true;
8557 
8558   return false;
8559 }
8560 
8561 // Produces unique lemma-tag pairs.
analyze(string_piece form,vector<tagged_lemma> & lemmas,morpho_statistical_guesser::used_rules * used)8562 void morpho_statistical_guesser::analyze(string_piece form, vector<tagged_lemma>& lemmas, morpho_statistical_guesser::used_rules* used) {
8563   unsigned lemmas_initial_size = lemmas.size();
8564 
8565   // We have rules in format "suffix prefix" in rules.
8566   // Find the matching rule with longest suffix and of those with longest prefix.
8567   string rule_label; rule_label.reserve(12);
8568   unsigned suffix_len = 0;
8569   for (; suffix_len < form.len; suffix_len++) {
8570     rule_label.push_back(form.str[form.len - (suffix_len + 1)]);
8571     if (!rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next<char>(data.next_2B()); }))
8572       break;
8573   }
8574 
8575   for (suffix_len++; suffix_len--; ) {
8576     rule_label.resize(suffix_len);
8577     rule_label.push_back(' ');
8578 
8579     const unsigned char* rule = nullptr;
8580     unsigned rule_prefix_len = 0;
8581     for (unsigned prefix_len = 0; prefix_len + suffix_len <= form.len; prefix_len++) {
8582       if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]);
8583       const unsigned char* found = rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next<char>(data.next_2B()); });
8584       if (!found) break;
8585       if (*(found += sizeof(uint16_t))) {
8586         rule = found;
8587         rule_prefix_len = prefix_len;
8588       }
8589     }
8590 
8591     if (rule) {
8592       rule_label.resize(suffix_len + 1 + rule_prefix_len);
8593       if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' '
8594         if (used) used->push_back(rule_label);
8595         for (int rules_len = *rule++; rules_len; rules_len--) {
8596           unsigned pref_del_len = *rule++; const char* pref_del = (const char*)rule; rule += pref_del_len;
8597           unsigned pref_add_len = *rule++; const char* pref_add = (const char*)rule; rule += pref_add_len;
8598           unsigned suff_del_len = *rule++; const char* suff_del = (const char*)rule; rule += suff_del_len;
8599           unsigned suff_add_len = *rule++; const char* suff_add = (const char*)rule; rule += suff_add_len;
8600           unsigned tags_len = *rule++; const uint16_t* tags = (const uint16_t*)rule; rule += tags_len * sizeof(uint16_t);
8601 
8602           if (pref_del_len + suff_del_len > form.len ||
8603               (pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) ||
8604               (suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) ||
8605               (form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len == 0))
8606             continue;
8607 
8608           string lemma;
8609           lemma.reserve(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len);
8610           if (pref_add_len) lemma.append(pref_add, pref_add_len);
8611           if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len);
8612           if (suff_add_len) lemma.append(suff_add, suff_add_len);
8613           while (tags_len--)
8614             lemmas.emplace_back(lemma, this->tags[*tags++]);
8615         }
8616       }
8617       break;
8618     }
8619   }
8620 
8621   // If nothing was found, use default tag.
8622   if (lemmas.size() == lemmas_initial_size)
8623     if (!contains(used, string())) {
8624       if (used) used->push_back(string());
8625       lemmas.emplace_back(string(form.str, form.len), tags[default_tag]);
8626     }
8627 }
8628 
8629 } // namespace morphodita
8630 
8631 /////////
8632 // File: utils/split.h
8633 /////////
8634 
8635 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
8636 //
8637 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8638 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8639 //
8640 // This Source Code Form is subject to the terms of the Mozilla Public
8641 // License, v. 2.0. If a copy of the MPL was not distributed with this
8642 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8643 
8644 namespace utils {
8645 
8646 //
8647 // Declarations
8648 //
8649 
// Split given text on the separator character. Every occurrence of `sep`
// produces a boundary, so adjacent separators yield empty tokens; an empty
// input produces no tokens at all. Previous content of `tokens` is cleared.
inline void split(const string& text, char sep, vector<string>& tokens);
inline void split(string_piece text, char sep, vector<string_piece>& tokens);
8653 
8654 //
8655 // Definitions
8656 //
8657 
// Splits `text` on every occurrence of `sep`, storing the pieces (which may
// be empty) into `tokens`. An empty input yields no tokens at all.
void split(const std::string& text, char sep, std::vector<std::string>& tokens) {
  tokens.clear();
  if (text.empty()) return;

  std::string::size_type index = 0;
  for (std::string::size_type next; (next = text.find(sep, index)) != std::string::npos; index = next + 1)
    tokens.emplace_back(text, index, next - index);

  // The remainder after the last separator (or the whole text).
  tokens.emplace_back(text, index);
}
8668 
split(string_piece text,char sep,vector<string_piece> & tokens)8669 void split(string_piece text, char sep, vector<string_piece>& tokens) {
8670   tokens.clear();
8671   if (!text.len) return;
8672 
8673   const char* str = text.str;
8674   for (const char* next; (next = (const char*) memchr(str, sep, text.str + text.len - str)); str = next + 1)
8675     tokens.emplace_back(str, next - str);
8676 
8677   tokens.emplace_back(str, text.str + text.len - str);
8678 }
8679 
8680 } // namespace utils
8681 
8682 /////////
8683 // File: morphodita/morpho/morpho_statistical_guesser_encoder.cpp
8684 /////////
8685 
8686 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8687 //
8688 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8689 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8690 //
8691 // This Source Code Form is subject to the terms of the Mozilla Public
8692 // License, v. 2.0. If a copy of the MPL was not distributed with this
8693 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8694 
8695 namespace morphodita {
8696 
encode(istream & is,binary_encoder & enc)8697 void morpho_statistical_guesser_encoder::encode(istream& is, binary_encoder& enc) {
8698   unordered_map<string, vector<pair<vector<string>, vector<int>>>> statistical_guesser;
8699   vector<string> tags;
8700   unordered_map<string, int> tags_map;
8701 
8702   // Load statistical guesser
8703   string line;
8704   vector<string> tokens;
8705   if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file");
8706   int statistical_guesser_default = tags_map.emplace(line.data(), tags.size()).first->second;
8707   if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data());
8708 
8709   while (getline(is, line)) {
8710     split(line, '\t', tokens);
8711     if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!");
8712 
8713     vector<string> affixes;
8714     split(tokens[0], ' ', affixes);
8715     if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!");
8716     reverse(affixes[1].begin(), affixes[1].end());
8717 
8718     auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]];
8719     for (unsigned i = 1; i < tokens.size(); i+= 2) {
8720       vector<string> replacements;
8721       split(tokens[i], ' ', replacements);
8722       if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!");
8723 
8724       vector<string> rule_tags;
8725       split(tokens[i+1], ' ', rule_tags);
8726       vector<int> decoded_tags;
8727       for (auto&& rule_tag : rule_tags) {
8728         int tag = tags_map.emplace(rule_tag, tags.size()).first->second;
8729         if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag);
8730         decoded_tags.emplace_back(tag);
8731       }
8732 
8733       rules.emplace_back(replacements, decoded_tags);
8734     }
8735   }
8736 
8737   // Encode statistical guesser
8738   enc.add_2B(tags.size());
8739   for (auto&& tag : tags) {
8740     enc.add_1B(tag.size());
8741     enc.add_data(tag);
8742   }
8743   enc.add_2B(statistical_guesser_default);
8744 
8745   persistent_unordered_map(statistical_guesser, 5, true, false, [](binary_encoder& enc, vector<pair<vector<string>, vector<int>>> rules) {
8746     binary_encoder e;
8747     e.add_1B(rules.size());
8748     for (auto&& rule : rules) {
8749       if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!");
8750       for (auto&& affix : rule.first) {
8751         e.add_1B(affix.size());
8752         e.add_data(affix);
8753       }
8754       e.add_1B(rule.second.size());
8755       for (auto&& tag : rule.second)
8756         e.add_2B(tag);
8757     }
8758     enc.add_2B(e.data.size());
8759     enc.add_data(e.data);
8760   }).save(enc);
8761 }
8762 
8763 } // namespace morphodita
8764 
8765 /////////
8766 // File: morphodita/morpho/morpho_statistical_guesser_trainer.h
8767 /////////
8768 
8769 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8770 //
8771 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8772 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8773 //
8774 // This Source Code Form is subject to the terms of the Mozilla Public
8775 // License, v. 2.0. If a copy of the MPL was not distributed with this
8776 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8777 
8778 namespace morphodita {
8779 
// Trains a statistical guesser from tab-separated form/lemma/tag lines and
// writes the resulting textual rules.
class morpho_statistical_guesser_trainer {
 public:
  // Reads form<TAB>lemma<TAB>tag training lines from `is` and writes the
  // trained guesser rules to `os`; suffix_len, rules_per_suffix, max_prefixes
  // and min_prefix_count control rule generation.
  static void train(istream& is, unsigned suffix_len, unsigned rules_per_suffix, unsigned max_prefixes, unsigned min_prefix_count, ostream& os);

 private:
  // One training example plus derived lemma rule and form prefix.
  struct instance {
    string form, lemma, tag;
    string lemma_rule, form_prefix;

    instance(const string& form, const string& lemma, const string& tag);
  };

  // Casing categories: all-lowercase, Uppercase-then-lowercase, all-uppercase, other.
  enum casing { CASE_LC, CASE_UCLC, CASE_UC, CASE_OTHER };
  static casing get_casing(const string& word, bool allow_nonletters);
  static void set_casing(const string& original, casing c, string& word);
  static bool suffix(const string& word, unsigned& length);
};
8797 
8798 } // namespace morphodita
8799 
8800 /////////
8801 // File: morphodita/morpho/morpho_statistical_guesser_trainer.cpp
8802 /////////
8803 
8804 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
8805 //
8806 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8807 // Mathematics and Physics, Charles University in Prague, Czech Republic.
8808 //
8809 // This Source Code Form is subject to the terms of the Mozilla Public
8810 // License, v. 2.0. If a copy of the MPL was not distributed with this
8811 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8812 
8813 namespace morphodita {
8814 
// Train a statistical guesser. Input lines are tab-separated
// form-lemma-tag triples; the output starts with the most "frequent" tag
// (the tag seen with the most unique forms), followed by one line per
// prefix-suffix combination listing the chosen rules.
void morpho_statistical_guesser_trainer::train(istream& is, unsigned suffix_len, unsigned rules_per_suffix, unsigned max_prefixes, unsigned min_prefix_count, ostream& os) {
  vector<instance> data;

  // Load training data
  string form;
  vector<string> tokens;
  for (string line; getline(is, line);) {
    if (line.empty()) continue;

    split(line, '\t', tokens);
    if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!");
    if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!");

    // Normalize case: when the lemma casing is "weaker" than the form casing
    // (lowercase lemma with capitalized/uppercase form, or capitalized lemma
    // with uppercase form), recase the form to match the lemma.
    casing form_case = get_casing(tokens[0], false);
    casing lemma_case = get_casing(tokens[1], true);
    if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) ||
        (lemma_case == CASE_UCLC && form_case == CASE_UC)) {
      set_casing(tokens[0], lemma_case, form);
    } else {
      form.swap(tokens[0]);
    }

    data.emplace_back(form, tokens[1], tokens[2]);
  }

  // Generate at most max_prefixes prefixes with min_prefix_count
  // (a prefix is counted once per unique form it occurs with).
  unordered_map<string, unordered_set<string>> prefixes_with_forms;
  for (auto&& instance : data)
    if (!instance.form_prefix.empty())
      prefixes_with_forms[instance.form_prefix].insert(instance.form);

  vector<pair<unsigned, string>> prefixes_with_counts;
  for (auto&& prefix : prefixes_with_forms)
    if (prefix.second.size() >= min_prefix_count)
      prefixes_with_counts.emplace_back(prefix.second.size(), prefix.first);

  if (prefixes_with_counts.size() > max_prefixes) {
    sort(prefixes_with_counts.begin(), prefixes_with_counts.end(), greater<pair<unsigned, string>>());
    prefixes_with_counts.resize(max_prefixes);
  }

  // The empty prefix is always included.
  unordered_set<string> prefixes;
  prefixes.emplace();
  for (auto&& prefix : prefixes_with_counts)
    prefixes.insert(prefix.second);

  // Generate the guesser rules
  unordered_map<string, unordered_set<string>> tags;
  unordered_map<string, unordered_map<string, unordered_set<string>>> rules;
  unordered_set<string> suffixes;
  string prefix_suffix, tag_lemma_rule;
  for (auto&& instance : data) {
    // Add tag
    tags[instance.tag].insert(instance.form);

    // Find longest matching prefix
    unsigned prefix_length = 0;
    for (auto&& prefix : prefixes)
      if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0)
        prefix_length = prefix.size();

    tag_lemma_rule.assign(instance.lemma_rule).append("\t").append(instance.tag);

    // Add prefix + all suffixes of length 1..suffix_len to rules
    for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) {
      prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length);
      rules[prefix_suffix][tag_lemma_rule].insert(instance.form);
      suffixes.emplace(instance.form, instance.form.size() - utf8_length, utf8_length);
    }
  }

  // Start generating the guesser description by writing the most "frequent" tag
  string most_frequent_tag; unsigned most_frequent_tag_count = 0;
  for (auto&& tag : tags)
    if (tag.second.size() > most_frequent_tag_count)
      most_frequent_tag.assign(tag.first), most_frequent_tag_count = tag.second.size();

  os << most_frequent_tag << endl;

  // For every prefix-suffix, write at most rules_per_suffix most "frequent" rules
  string rule_key, output;
  unordered_set<string> rules_set;
  vector<pair<unsigned, string>> rules_counts;
  for (auto&& suffix : suffixes) {
    for (auto&& prefix : prefixes) {
      rules_counts.clear();
      rules_set.clear();

      // Gather at most rules_per_suffix rules; first with the full prefix,
      // then (for a non-empty prefix) with the empty prefix, each time
      // trying suffixes from the longest to the shortest.
      for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) {
        for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) {
          rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len);
          if (!rules.count(rule_key)) continue;

          // Append rules not seen yet, sorted by decreasing unique-form count.
          unsigned rules_counts_original = rules_counts.size();
          for (auto&& entry : rules[rule_key])
            if (!rules_set.count(entry.first)) {
              rules_counts.emplace_back(entry.second.size(), entry.first);
              rules_set.insert(entry.first);
            }

          sort(rules_counts.begin() + rules_counts_original, rules_counts.end(), greater<pair<unsigned, string>>());

          if (rules_counts.size() >= rules_per_suffix) {
            rules_counts.resize(rules_per_suffix);
            break;
          }
        }
        // Stop if there are no rules for given prefix
        if (rules_set.empty()) break;
      }
      if (!rules_set.empty()) {
        // Write the chosen rules
        output.assign(prefix).append(" ").append(suffix);
        for (unsigned i = 0; i < rules_counts.size(); i++) {
          // Each stored rule is "lemma_rule\ttag"; split it on the tab.
          unsigned tab = rules_counts[i].second.find('\t');

          output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos);

          // Join rules with same lemma_rule
          for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++)
            output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos);
        }
        os << output << endl;
      }
    }
  }
}
8944 
// Build one training instance: find the longest common substring of form
// and lemma (valid as UTF-8 inside the form), then derive
// - lemma_rule: "form_begin lemma_begin form_end lemma_end", the parts of
//   form and lemma before and after the common substring;
// - form_prefix: the part of form before the common substring, but only
//   when the common substring starts at the very beginning of the lemma.
// Assumes non-empty form and lemma (guaranteed by train's input checks).
morpho_statistical_guesser_trainer::instance::instance(const string& form, const string& lemma, const string& tag)
  : form(form), lemma(lemma), tag(tag)
{
  using namespace unilib;

  // Slide lemma against form at every offset and track the longest run of
  // equal bytes (length_best, starting at form_best / lemma_best).
  unsigned length_best = 0;
  int form_best = 0, lemma_best = 0;
  for (int offset = -int(lemma.size() - 1); offset < int(form.size()) - 1; offset++) {
    unsigned form_offset = max(0, offset);
    unsigned lemma_offset = max(0, -offset);
    for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++)
      if (form[form_offset] == lemma[lemma_offset]) {
        // Accept a longer run only if it forms a valid UTF-8 sequence.
        if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length))
          length_best = length, form_best = form_offset + 1 - length, lemma_best = lemma_offset + 1 - length;
      } else {
        length = 0;
      }
  }

  form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0);
  lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ")
      .append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos);
}
8968 
// Classify the casing of a UTF-8 word: CASE_LC (all lowercase),
// CASE_UCLC (first character uppercase, rest lowercase), CASE_UC (all
// uppercase) or CASE_OTHER. When allow_nonletters is set, non-letter
// characters from the third position onward are skipped instead of
// immediately yielding CASE_OTHER.
morpho_statistical_guesser_trainer::casing morpho_statistical_guesser_trainer::get_casing(const string& word, bool allow_nonletters) {
  using namespace unilib;

  casing c = CASE_OTHER;
  int index = 0;
  for (auto&& chr : utf8::decoder(word)) {
    auto cat = unicode::category(chr);

    // Return OTHER for non-letters
    if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue;
    if (cat & ~unicode::L) return CASE_OTHER;

    if (index == 0) {
      // First letter decides between lowercase and uppercase start.
      c = cat & unicode::Ll ? CASE_LC : CASE_UC;
    } else if (c == CASE_UC && index == 1) {
      // Second letter after an uppercase first one: Uclc vs all-uppercase.
      c = cat & unicode::Ll ? CASE_UCLC : CASE_UC;
    } else if (c == CASE_UC) {
      if (cat & ~unicode::Lut) return CASE_OTHER;
    } else /*CASE_LC or CASE_UCLC*/ {
      if (cat & ~unicode::Ll) return CASE_OTHER;
    }
    index++;
  }
  return c;
}
8994 
set_casing(const string & original,casing c,string & word)8995 void morpho_statistical_guesser_trainer::set_casing(const string& original, casing c, string& word) {
8996   using namespace unilib;
8997 
8998   word.clear();
8999   bool first = true;
9000   for (auto&& chr : utf8::decoder(original)) {
9001     utf8::append(word, (c == CASE_UC || (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr));
9002     first = false;
9003   }
9004 }
9005 
suffix(const string & word,unsigned & length)9006 bool morpho_statistical_guesser_trainer::suffix(const string& word, unsigned& length) {
9007   using namespace unilib;
9008 
9009   unsigned additional = 1;
9010   while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional))
9011     additional++;
9012 
9013   if (additional + length > word.size()) return false;
9014 
9015   length += additional;
9016   return true;
9017 }
9018 
9019 } // namespace morphodita
9020 
9021 /////////
9022 // File: morphodita/morpho/raw_morpho_dictionary_reader.cpp
9023 /////////
9024 
9025 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
9026 //
9027 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9028 // Mathematics and Physics, Charles University in Prague, Czech Republic.
9029 //
9030 // This Source Code Form is subject to the terms of the Mozilla Public
9031 // License, v. 2.0. If a copy of the MPL was not distributed with this
9032 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
9033 
9034 namespace morphodita {
9035 
// Read the next lemma and all its tagged forms from the raw dictionary.
// Every line has three tab-separated columns; consecutive lines sharing
// the first column (the lemma) form one entry. For each such line a
// (third column, second column) pair is appended to tagged_forms.
// Returns false when the input is exhausted.
bool raw_morpho_dictionary_reader::next_lemma(string& lemma, vector<pair<string, string>>& tagged_forms) {
  // `line`/`tokens` may already hold the first line of this lemma, left
  // over from the previous call (see the break below); otherwise read one.
  if (line.empty()) {
    if (!getline(in, line))
      return false;
    split(line, '\t', tokens);
    if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");
  }

  lemma = tokens[0];
  // All forms of one lemma must form a continuous run of lines.
  if (seen_lemmas.count(lemma))
    training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!");
  seen_lemmas.insert(lemma);

  tagged_forms.clear();
  tagged_forms.emplace_back(tokens[2], tokens[1]);
  // Accumulate lines until a different lemma (or end of input) is found;
  // the first line of the following lemma stays in `line`/`tokens`.
  // On end of input getline clears `line`, so the next call returns false.
  while (getline(in, line)) {
    split(line, '\t', tokens);
    if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");

    if (lemma != tokens[0]) break;
    tagged_forms.emplace_back(tokens[2], tokens[1]);
  }

  return true;
}
9061 
9062 } // namespace morphodita
9063 
9064 /////////
9065 // File: morphodita/morpho/tag_filter.cpp
9066 /////////
9067 
9068 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
9069 //
9070 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9071 // Mathematics and Physics, Charles University in Prague, Czech Republic.
9072 //
9073 // This Source Code Form is subject to the terms of the Mozilla Public
9074 // License, v. 2.0. If a copy of the MPL was not distributed with this
9075 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
9076 
9077 namespace morphodita {
9078 
// Parse a wildcard tag filter into per-position character filters.
// In the wildcard, '?' matches any character at that position, '[...]'
// denotes a group of allowed characters and '[^...]' a negated group;
// any other character stands for itself.
tag_filter::tag_filter(const char* filter) {
  if (!filter) return;

  // Keep a private copy of the wildcard; the parsed filters point into it.
  wildcard.assign(filter);
  filter = wildcard.c_str();

  for (int tag_pos = 0; *filter; tag_pos++, filter++) {
    if (*filter == '?') continue;  // '?' matches anything - no filter stored
    if (*filter == '[') {
      filter++;

      // Optional negation of the character group.
      bool negate = false;
      if (*filter == '^') negate = true, filter++;

      // The group runs up to the first ']'; a ']' immediately after '['
      // (or '[^') is treated as an ordinary group member.
      const char* chars = filter;
      for (bool first = true; *filter && (first || *filter != ']'); first = false) filter++;

      filters.emplace_back(tag_pos, negate, chars, filter - chars);
      if (!*filter) break;  // unterminated group - stop parsing
    } else {
      filters.emplace_back(tag_pos, false, filter, 1);
    }
  }
}
9103 
9104 } // namespace morphodita
9105 
9106 /////////
9107 // File: morphodita/tagger/elementary_features.h
9108 /////////
9109 
9110 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
9111 //
9112 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9113 // Mathematics and Physics, Charles University in Prague, Czech Republic.
9114 //
9115 // This Source Code Form is subject to the terms of the Mozilla Public
9116 // License, v. 2.0. If a copy of the MPL was not distributed with this
9117 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
9118 
9119 namespace morphodita {
9120 
9121 // Declarations
// Source of an elementary feature: computed per form, per tag, or
// dynamically (from previously computed dynamic features).
enum elementary_feature_type { PER_FORM, PER_TAG, DYNAMIC };
enum elementary_feature_range { ONLY_CURRENT, ANY_OFFSET };

typedef uint32_t elementary_feature_value;
// Reserved feature values: 0 marks an unknown value, 1 an empty one.
enum :elementary_feature_value { elementary_feature_unknown = 0, elementary_feature_empty = 1 };

// Description of one elementary feature: its name, where it is computed
// from, its allowed range, and its indices.
struct elementary_feature_description {
  string name;
  elementary_feature_type type;
  elementary_feature_range range;
  int index;
  int map_index;
};
9135 
// Collection of elementary feature maps, loadable from and savable to
// a stream.
template<class Map>
class elementary_features {
 public:
  bool load(istream& is);
  bool save(ostream& out);

  vector<Map> maps;
};
9144 
// Elementary feature map backed by a persistent_unordered_map, mapping a
// feature key to an elementary_feature_value.
class persistent_elementary_feature_map : public persistent_unordered_map {
 public:
  persistent_elementary_feature_map() : persistent_unordered_map() {}
  // NOTE(review): a const rvalue reference cannot be moved from, so this
  // constructor copies the given map - confirm this is intentional.
  persistent_elementary_feature_map(const persistent_unordered_map&& map) : persistent_unordered_map(map) {}

  // Return the value stored for the given feature key, or
  // elementary_feature_unknown when the key is not present.
  elementary_feature_value value(const char* feature, int len) const {
    auto* it = at_typed<elementary_feature_value>(feature, len);
    return it ? *it : elementary_feature_unknown;
  }
};
9155 
9156 // Definitions
// Load the elementary feature maps from a compressed binary stream.
// Returns false on decompression or deserialization failure, or when
// trailing data remains.
template <class Map>
inline bool elementary_features<Map>::load(istream& is) {
  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  try {
    // One-byte map count followed by the maps themselves.
    maps.resize(data.next_1B());
    for (auto&& map : maps)
      map.load(data);
  } catch (binary_decoder_error&) {
    return false;
  }

  // All decoded data must have been consumed.
  return data.is_end();
}
9172 
9173 } // namespace morphodita
9174 
9175 /////////
9176 // File: morphodita/tagger/vli.h
9177 /////////
9178 
9179 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
9180 //
9181 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9182 // Mathematics and Physics, Charles University in Prague, Czech Republic.
9183 //
9184 // This Source Code Form is subject to the terms of the Mozilla Public
9185 // License, v. 2.0. If a copy of the MPL was not distributed with this
9186 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
9187 
9188 namespace morphodita {
9189 
9190 // Declarations
// Variable-length integer encoding: 7 payload bits per byte, most
// significant group first, continuation bit (0x80) set on every byte
// except the last.
template <class T>
class vli {
 public:
  static int max_length();
  static void encode(T value, char*& where);
  static T decode(const char*& from);
};

// Definitions
template <>
inline int vli<uint32_t>::max_length() {
  // ceil(32 / 7) bytes suffice for any uint32_t.
  return 5;
}

template <>
inline void vli<uint32_t>::encode(uint32_t value, char*& where) {
  // Count the 7-bit groups needed, then emit them most significant first,
  // setting the continuation bit on all but the final byte.
  int groups = 1;
  for (uint32_t rest = value >> 7; rest; rest >>= 7) groups++;
  while (--groups > 0)
    *where++ = ((value >> (7 * groups)) & 0x7Fu) | 0x80u;
  *where++ = value & 0x7Fu;
}

template <>
inline uint32_t vli<uint32_t>::decode(const char*& from) {
  // Accumulate 7-bit groups until a byte without the continuation bit.
  uint32_t value = 0;
  unsigned char byte;
  do {
    byte = (unsigned char)(*from++);
    value = (value << 7) | (byte & 0x7Fu);
  } while (byte & 0x80u);
  return value;
}
9221 
9222 } // namespace morphodita
9223 
9224 /////////
9225 // File: morphodita/tagger/feature_sequences.h
9226 /////////
9227 
9228 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
9229 //
9230 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9231 // Mathematics and Physics, Charles University in Prague, Czech Republic.
9232 //
9233 // This Source Code Form is subject to the terms of the Mozilla Public
9234 // License, v. 2.0. If a copy of the MPL was not distributed with this
9235 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
9236 
9237 namespace morphodita {
9238 
9239 // Declarations
// Score types: individual feature sequence scores are 32-bit, their sums
// over all sequences are accumulated in 64 bits.
typedef int32_t feature_sequence_score;
typedef int64_t feature_sequences_score;

// One element of a feature sequence: which elementary feature to use
// (type + elementary_index) and at which relative offset (sequence_index).
struct feature_sequence_element {
  elementary_feature_type type;
  int elementary_index;
  int sequence_index;

  feature_sequence_element() {}
  feature_sequence_element(elementary_feature_type type, int elementary_index, int sequence_index) : type(type), elementary_index(elementary_index), sequence_index(sequence_index) {}
};

// A feature sequence: its elements plus how far back a tag change can
// still influence the sequence value.
struct feature_sequence {
  vector<feature_sequence_element> elements;
  int dependant_range = 1;
};
9256 
// Feature sequences built on top of elementary features: parsing,
// (de)serialization, and scoring of tag windows during tagging.
template <class ElementaryFeatures, class Map>
class feature_sequences {
 public:
  typedef typename ElementaryFeatures::per_form_features per_form_features;
  typedef typename ElementaryFeatures::per_tag_features per_tag_features;
  typedef typename ElementaryFeatures::dynamic_features dynamic_features;

  void parse(int window_size, istream& is);
  bool load(istream& is);
  bool save(ostream& os);

  // Per-sentence working storage, defined below.
  struct cache;

  // Precompute elementary features of the whole sentence.
  inline void initialize_sentence(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, cache& c) const;
  // Compute dynamic features of the given form-tag pair.
  inline void compute_dynamic_features(int form_index, int tag_index, const dynamic_features* prev_dynamic, dynamic_features& dynamic, cache& c) const;
  // Score the tag assignment in tags_window at position form_index.
  inline feature_sequences_score score(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, cache& c) const;
  // Compute the feature keys used when scoring the given window.
  void feature_keys(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, vector<string>& keys, cache& c) const;

  ElementaryFeatures elementary;
  vector<Map> scores;
  vector<feature_sequence> sequences;
};
9279 
// Feature sequence score map backed by a persistent_unordered_map.
class persistent_feature_sequence_map : public persistent_unordered_map {
 public:
  persistent_feature_sequence_map() : persistent_unordered_map() {}
  // NOTE(review): a const rvalue reference cannot be moved from, so this
  // constructor copies the given map - confirm this is intentional.
  persistent_feature_sequence_map(const persistent_unordered_map&& map) : persistent_unordered_map(map) {}

  // Return the score stored for the feature key, or 0 when not present.
  feature_sequence_score score(const char* feature, int len) const {
    auto* it = at_typed<feature_sequence_score>(feature, len);
    return it ? *it : 0;
  }
};

// Feature sequences using the persistent score map.
template <class ElementaryFeatures> using persistent_feature_sequences = feature_sequences<ElementaryFeatures, persistent_feature_sequence_map>;
9292 
9293 // Definitions
// Load the feature sequences from a stream: first the elementary features,
// then a compressed block with the sequence definitions followed by one
// score map per sequence. Returns false on any failure.
template <class ElementaryFeatures, class Map>
inline bool feature_sequences<ElementaryFeatures, Map>::load(istream& is) {
  if (!elementary.load(is)) return false;

  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  try {
    sequences.resize(data.next_1B());
    for (auto&& sequence : sequences) {
      sequence.dependant_range = data.next_4B();
      sequence.elements.resize(data.next_1B());
      for (auto&& element : sequence.elements) {
        element.type = elementary_feature_type(data.next_4B());
        element.elementary_index = data.next_4B();
        element.sequence_index = data.next_4B();
      }
    }

    // One score map per feature sequence.
    scores.resize(data.next_1B());
    for (auto&& score : scores)
      score.load(data);
  } catch (binary_decoder_error&) {
    return false;
  }

  // All decoded data must have been consumed.
  return data.is_end();
}
9322 
// Per-sentence working storage for feature_sequences: the current sentence,
// the precomputed elementary features, and a per-sequence cache of the last
// feature key and its score so unchanged sequences need not be rescored.
template <class ElementaryFeatures, class Map>
struct feature_sequences<ElementaryFeatures, Map>::cache {
  const vector<string_piece>* forms;
  const vector<vector<tagged_lemma>>* analyses;
  vector<per_form_features> elementary_per_form;
  vector<vector<per_tag_features>> elementary_per_tag;

  // Last computed feature key and score of one feature sequence.
  struct cache_element {
    vector<char> key;
    int key_size;
    feature_sequence_score score;

    cache_element(int elements) : key(vli<elementary_feature_value>::max_length() * elements), key_size(0), score(0) {}
  };
  vector<cache_element> caches;
  vector<const per_tag_features*> window;
  vector<char> key;
  feature_sequences_score score;

  cache(const feature_sequences<ElementaryFeatures, Map>& self) : score(0) {
    // Size the buffers by the longest sequence and by the furthest tag
    // offset any PER_TAG element refers to.
    caches.reserve(self.sequences.size());
    int max_sequence_elements = 0, max_window_size = 1;
    for (auto&& sequence : self.sequences) {
      caches.emplace_back(sequence.elements.size());
      if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size();
      for (auto&& element : sequence.elements)
        if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
          max_window_size = 1 - element.sequence_index;
    }
    key.resize(max_sequence_elements * vli<elementary_feature_value>::max_length());
    window.resize(max_window_size);
  }
};
9356 
// Prepare the cache for a new sentence: remember the forms and analyses,
// precompute the elementary features, and invalidate the score caches.
template <class ElementaryFeatures, class Map>
void feature_sequences<ElementaryFeatures, Map>::initialize_sentence(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, cache& c) const {
  // Store forms and forms_size
  c.forms = &forms;
  c.analyses = &analyses;

  // Enlarge elementary features vectors if needed
  if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2);
  if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2);
  for (unsigned i = 0; i < forms.size(); i++)
    if (analyses[i].size() > c.elementary_per_tag[i].size())
      c.elementary_per_tag[i].resize(analyses[i].size() * 2);

  // Compute elementary features
  elementary.compute_features(forms, analyses, c.elementary_per_form, c.elementary_per_tag);

  // Clear score cache, because scores may have been modified
  c.score = 0;
  for (auto&& cache : c.caches)
    cache.key_size = cache.score = 0;
}
9378 
// Compute dynamic features of the tag_index-th analysis of the given form,
// chaining from prev_dynamic (which is ignored for the first form).
template <class ElementaryFeatures, class Map>
void feature_sequences<ElementaryFeatures, Map>::compute_dynamic_features(int form_index, int tag_index, const dynamic_features* prev_dynamic, dynamic_features& dynamic, cache& c) const {
  elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic);
}
9383 
// Score the given tag window at form_index. Only sequences whose
// dependant_range exceeds tags_unchanged are recomputed; for all other
// sequences the previously cached per-sequence score is reused, so the
// returned value is the full score of the window.
template <class ElementaryFeatures, class Map>
feature_sequences_score feature_sequences<ElementaryFeatures, Map>::score(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, cache& c) const {
  // Start by creating a window of per_tag_features*
  for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
    c.window[i] = &c.elementary_per_tag[form_index - i][tags_window[i]];

  // Compute the score
  feature_sequences_score result = c.score;
  for (unsigned i = 0; i < sequences.size(); i++) {
    // NOTE(review): this break assumes the remaining sequences also have
    // dependant_range <= tags_unchanged (i.e. sequences are ordered by
    // decreasing dependant_range) - confirm against the sequence parser.
    if (tags_unchanged >= sequences[i].dependant_range)
      break;

    // Build the feature key by VLI-encoding the elementary feature values.
    char* key = c.key.data();
    for (unsigned j = 0; j < sequences[i].elements.size(); j++) {
      auto& element = sequences[i].elements[j];
      elementary_feature_value value;

      switch (element.type) {
        case PER_FORM:
          value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
          break;
        case PER_TAG:
          value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index];
          break;
        case DYNAMIC:
        default:
          value = dynamic.values[element.elementary_index];
      }

      // An unknown elementary value invalidates the whole sequence key.
      if (value == elementary_feature_unknown) {
        key = c.key.data();
        break;
      }
      vli<elementary_feature_value>::encode(value, key);
    }

    // Replace this sequence's cached score by the freshly computed one,
    // looking the score up only when the key actually changed.
    result -= c.caches[i].score;
    int key_size = key - c.key.data();
    if (!key_size) {
      c.caches[i].score = 0;
      c.caches[i].key_size = 0;
    } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
      c.caches[i].score = scores[i].score(c.key.data(), key_size);
      c.caches[i].key_size = key_size;
      small_memcpy(c.caches[i].key.data(), c.key.data(), key_size);
    }
    result += c.caches[i].score;
  }

  c.score = result;
  return result;
}
9436 
// Compute the feature keys used when scoring the given window: run score()
// to refresh the per-sequence caches, then copy out the cached keys.
template <class ElementaryFeatures, class Map>
void feature_sequences<ElementaryFeatures, Map>::feature_keys(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, vector<string>& keys, cache& c) const {
  score(form_index, tags_window, tags_unchanged, dynamic, c);

  keys.resize(c.caches.size());
  for (unsigned i = 0; i < c.caches.size(); i++)
    keys[i].assign(c.caches[i].key.data(), c.caches[i].key_size);
}
9445 
9446 } // namespace morphodita
9447 
9448 /////////
9449 // File: morphodita/tagger/viterbi.h
9450 /////////
9451 
9452 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
9453 //
9454 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9455 // Mathematics and Physics, Charles University in Prague, Czech Republic.
9456 //
9457 // This Source Code Form is subject to the terms of the Mozilla Public
9458 // License, v. 2.0. If a copy of the MPL was not distributed with this
9459 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
9460 
9461 namespace morphodita {
9462 
9463 // Declarations
// Viterbi decoder of tag sequences scored by a feature_sequences-like
// scorer, with the given decoding order and scoring window size.
template <class FeatureSequences>
class viterbi {
 public:
  viterbi(const FeatureSequences& features, int decoding_order, int window_size)
      : features(features), decoding_order(decoding_order), window_size(window_size) {}

  struct cache;
  // Fill tags with the chosen analysis index of every form in the sentence.
  void tag(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, cache& c, vector<int>& tags) const;

 private:
  struct node;

  const FeatureSequences& features;
  int decoding_order, window_size;
};
9479 
9480 // Definitions
// Reusable working storage of viterbi::tag: the lattice nodes and the
// scorer's own cache.
template <class FeatureSequences>
struct viterbi<FeatureSequences>::cache {
  vector<node> nodes;
  typename FeatureSequences::cache features_cache;

  cache(const viterbi<FeatureSequences>& self) : features_cache(self.features) {}
};
9488 
// One node of the Viterbi lattice: the chosen tag, a link to the best
// predecessor node, the accumulated score and the dynamic features.
template <class FeatureSequences>
struct viterbi<FeatureSequences>::node {
  int tag;
  int prev;
  feature_sequences_score score;
  typename FeatureSequences::dynamic_features dynamic;
};
9496 
9497 template <class FeatureSequences>
tag(const vector<string_piece> & forms,const vector<vector<tagged_lemma>> & analyses,cache & c,vector<int> & tags) const9498 void viterbi<FeatureSequences>::tag(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, cache& c, vector<int>& tags) const {
9499   if (!forms.size()) return;
9500 
9501   // Count number of nodes and allocate
9502   unsigned nodes = 0;
9503   for (unsigned i = 0, states = 1; i < forms.size(); i++) {
9504     if (analyses[i].empty()) return;
9505     states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size();
9506     nodes += states;
9507   }
9508   if (nodes > c.nodes.size()) c.nodes.resize(nodes);
9509 
9510   // Init feature sequences
9511   features.initialize_sentence(forms, analyses, c.features_cache);
9512 
9513   int window_stack[16]; vector<int> window_heap;
9514   int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
9515   typename FeatureSequences::dynamic_features dynamic;
9516   feature_sequences_score score;
9517 
9518   // Compute all nodes score
9519   int nodes_prev = -1, nodes_now = 0;
9520   for (unsigned i = 0; i < forms.size(); i++) {
9521     int nodes_next = nodes_now;
9522 
9523     for (int j = 0; j < window_size; j++) window[j] = -1;
9524     for (int tag = 0; tag < int(analyses[i].size()); tag++)
9525       for (int prev = nodes_prev; prev < nodes_now; prev++) {
9526         // Compute predecessors and number of unchanges
9527         int same_tags = window[0] == tag;
9528         window[0] = tag;
9529         for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
9530           same_tags += same_tags == n && window[n] == c.nodes[p].tag;
9531           window[n] = c.nodes[p].tag;
9532         }
9533 
9534         // Compute dynamic elementary features and score
9535         features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache);
9536         score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
9537             (prev >= 0 ? c.nodes[prev].score : 0);
9538 
9539         // Update existing node or create a new one
9540         if (same_tags >= decoding_order-1) {
9541           if (score <= c.nodes[nodes_next-1].score) continue;
9542           nodes_next--;
9543         }
9544         c.nodes[nodes_next].tag = tag;
9545         c.nodes[nodes_next].prev = prev;
9546         c.nodes[nodes_next].score = score;
9547         c.nodes[nodes_next++].dynamic = dynamic;
9548       }
9549 
9550     nodes_prev = nodes_now;
9551     nodes_now = nodes_next;
9552   }
9553 
9554   // Choose the best ending node
9555   int best = nodes_prev;
9556   for (int node = nodes_prev + 1; node < nodes_now; node++)
9557     if (c.nodes[node].score > c.nodes[best].score)
9558       best = node;
9559 
9560   for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev)
9561     tags[i] = c.nodes[best].tag;
9562 }
9563 
9564 } // namespace morphodita
9565 
9566 /////////
9567 // File: morphodita/tagger/conllu_elementary_features.h
9568 /////////
9569 
9570 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
9571 //
9572 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9573 // Mathematics and Physics, Charles University in Prague, Czech Republic.
9574 //
9575 // This Source Code Form is subject to the terms of the Mozilla Public
9576 // License, v. 2.0. If a copy of the MPL was not distributed with this
9577 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
9578 
9579 namespace morphodita {
9580 
9581 // Declarations
// Elementary features for tagging CoNLL-U style data: per-form features
// (form, orthography, prefixes/suffixes of length 1..9), per-tag features
// (whole tag, UPOS and selected morphological fields, lemma), and dynamic
// features tracking the previous verb in the sentence.
template <class Map>
class conllu_elementary_features : public elementary_features<Map> {
 public:
  conllu_elementary_features();

  // Indices into per_form_features::values.
  enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_FORM, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, PREFIX5, PREFIX6, PREFIX7, PREFIX8, PREFIX9, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, SUFFIX5, SUFFIX6, SUFFIX7, SUFFIX8, SUFFIX9, PER_FORM_TOTAL };
  // Indices into per_tag_features::values.
  enum features_per_tag { TAG, TAG_UPOS, TAG_CASE, TAG_GENDER, TAG_NUMBER, TAG_NEGATIVE, TAG_PERSON, LEMMA, PER_TAG_TOTAL };
  // Indices into dynamic_features::values.
  enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_FORM, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_FORM, DYNAMIC_TOTAL };
  // Indices into `maps`; MAP_NONE marks features without a value map.
  enum features_map { MAP_NONE = -1, MAP_FORM, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_PREFIX5, MAP_PREFIX6, MAP_PREFIX7, MAP_PREFIX8, MAP_PREFIX9, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_SUFFIX5, MAP_SUFFIX6, MAP_SUFFIX7, MAP_SUFFIX8, MAP_SUFFIX9, MAP_TAG, MAP_TAG_UPOS, MAP_TAG_CASE, MAP_TAG_GENDER, MAP_TAG_NUMBER, MAP_TAG_NEGATIVE, MAP_TAG_PERSON, MAP_LEMMA, MAP_TOTAL } ;

  struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; };
  struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; };
  struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; };

  // Declarative description of every feature (name, kind, allowed offsets,
  // value index, map index) used when loading/storing feature templates.
  static vector<elementary_feature_description> descriptions;

  // Fill per_form/per_tag feature values for a whole sentence.
  void compute_features(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, vector<per_form_features>& per_form, vector<vector<per_tag_features>>& per_tag) const;
  // Fill dynamic feature values for one node given its predecessor (or nullptr).
  inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const;

  using elementary_features<Map>::maps;
};

typedef conllu_elementary_features<persistent_elementary_feature_map> persistent_conllu_elementary_features;
9605 
9606 // Definitions
9607 template <class Map>
conllu_elementary_features()9608 conllu_elementary_features<Map>::conllu_elementary_features() {
9609   maps.resize(MAP_TOTAL);
9610 }
9611 
// Feature descriptions: {name, kind, allowed offsets, value index, map index}.
// ONLY_CURRENT features may be used at offset 0 only; ANY_OFFSET features at
// any offset a feature template allows.
template <class Map>
vector<elementary_feature_description> conllu_elementary_features<Map>::descriptions = {
  {"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM},
  {"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG},
  {"FollowingVerbForm", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_FORM, MAP_FORM},
  {"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE},
  {"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE},
  {"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE},
  {"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1},
  {"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2},
  {"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3},
  {"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4},
  {"Prefix5", PER_FORM, ONLY_CURRENT, PREFIX5, MAP_PREFIX5},
  {"Prefix6", PER_FORM, ONLY_CURRENT, PREFIX6, MAP_PREFIX6},
  {"Prefix7", PER_FORM, ONLY_CURRENT, PREFIX7, MAP_PREFIX7},
  {"Prefix8", PER_FORM, ONLY_CURRENT, PREFIX8, MAP_PREFIX8},
  {"Prefix9", PER_FORM, ONLY_CURRENT, PREFIX9, MAP_PREFIX9},
  {"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1},
  {"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2},
  {"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3},
  {"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4},
  {"Suffix5", PER_FORM, ONLY_CURRENT, SUFFIX5, MAP_SUFFIX5},
  {"Suffix6", PER_FORM, ONLY_CURRENT, SUFFIX6, MAP_SUFFIX6},
  {"Suffix7", PER_FORM, ONLY_CURRENT, SUFFIX7, MAP_SUFFIX7},
  {"Suffix8", PER_FORM, ONLY_CURRENT, SUFFIX8, MAP_SUFFIX8},
  {"Suffix9", PER_FORM, ONLY_CURRENT, SUFFIX9, MAP_SUFFIX9},

  {"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG},
  {"TagUPos", PER_TAG, ANY_OFFSET, TAG_UPOS, MAP_TAG_UPOS},
  {"TagCase", PER_TAG, ANY_OFFSET, TAG_CASE, MAP_TAG_CASE},
  {"TagGender", PER_TAG, ANY_OFFSET, TAG_GENDER, MAP_TAG_GENDER},
  {"TagNumber", PER_TAG, ANY_OFFSET, TAG_NUMBER, MAP_TAG_NUMBER},
  {"TagNegative", PER_TAG, ANY_OFFSET, TAG_NEGATIVE, MAP_TAG_NEGATIVE},
  {"TagPerson", PER_TAG, ANY_OFFSET, TAG_PERSON, MAP_TAG_PERSON},
  {"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA},

  {"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG},
  {"PreviousVerbForm", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_FORM, MAP_FORM},
};
9651 
9652 template <class Map>
compute_features(const vector<string_piece> & forms,const vector<vector<tagged_lemma>> & analyses,vector<per_form_features> & per_form,vector<vector<per_tag_features>> & per_tag) const9653 void conllu_elementary_features<Map>::compute_features(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, vector<per_form_features>& per_form, vector<vector<per_tag_features>>& per_tag) const {
9654   using namespace unilib;
9655 
9656   // We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly.
9657   elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_form = elementary_feature_empty;
9658   for (unsigned i = forms.size(); i--;) {
9659     int verb_candidate = -1;
9660 
9661     // Per_tag features and verb_candidate
9662     for (unsigned j = 0; j < analyses[i].size(); j++) {
9663       const string& tag = analyses[i][j].tag;
9664       const string& lemma = analyses[i][j].lemma;
9665 
9666       // Tag consists of three parts separated by tag[0] character
9667       // - first is TAG_UPOS,
9668       // - second is TAG_LPOS,
9669       // - then there is any number of | separated named fields in format Name=Value
9670       per_tag[i][j].values[TAG] = maps[MAP_TAG].value(tag.c_str(), tag.size());
9671       per_tag[i][j].values[TAG_UPOS] = per_tag[i][j].values[TAG_CASE] = per_tag[i][j].values[TAG_GENDER] = elementary_feature_empty;
9672       per_tag[i][j].values[TAG_NUMBER] = per_tag[i][j].values[TAG_NEGATIVE] = per_tag[i][j].values[TAG_PERSON] = elementary_feature_empty;
9673       per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] :
9674           maps[MAP_LEMMA].value(lemma.c_str(), lemma.size());
9675 
9676       char separator = tag[0];
9677       size_t index = tag.find(separator, 1);
9678       if (index == string::npos) index = tag.size();
9679       per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0));
9680 
9681       if (index < tag.size()) index++;
9682       if (index < tag.size()) index = tag.find(separator, index);
9683       if (index < tag.size()) index++;
9684       for (size_t length; index < tag.size(); index += length + 1) {
9685         length = tag.find('|', index);
9686         length = (length == string::npos ? tag.size() : length) - index;
9687 
9688         for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++)
9689           if (tag[index + equal_sign] == '=') {
9690             int value = -1, map;
9691             switch (equal_sign) {
9692               case 4:
9693                 if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE;
9694                 break;
9695               case 6:
9696                 if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER;
9697                 if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER;
9698                 if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON;
9699                 break;
9700               case 8:
9701                 if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE;
9702                 break;
9703             }
9704 
9705             if (value >= 0)
9706               per_tag[i][j].values[value] = maps[map].value(tag.c_str() + index + equal_sign + 1, length - equal_sign - 1);
9707             break;
9708           }
9709       }
9710 
9711       if (tag.size() >= 2 && tag[1] == 'V') {
9712         int tag_compare;
9713         verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
9714       }
9715     }
9716 
9717     // Per_form features
9718     per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len);
9719     per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag;
9720     per_form[i].values[FOLLOWING_VERB_FORM] = following_verb_form;
9721 
9722     // Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}.
9723     if (verb_candidate >= 0) {
9724       following_verb_tag = per_tag[i][verb_candidate].values[TAG];
9725       following_verb_form = per_form[i].values[FORM];
9726     }
9727 
9728     // Ortographic per_form features if needed
9729     if (analyses[i].size() == 1) {
9730       per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown;
9731       per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_unknown;
9732       per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_unknown;
9733       per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_unknown;
9734       per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_unknown;
9735       per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_unknown;
9736       per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_unknown;
9737     } else if (forms[i].len <= 0) {
9738       per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1;
9739       per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_empty;
9740       per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_empty;
9741       per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_empty;
9742       per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_empty;
9743       per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_empty;
9744       per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_empty;
9745     } else {
9746       string_piece form = forms[i];
9747       const char* form_start = form.str;
9748 
9749       bool num = false, cap = false, dash = false;
9750       size_t indices[18] = {0, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, 0, 0, 0, 0, 0, 0, 0, 0}; // careful here regarding forms shorter than 9 characters
9751       int index = 0;
9752       while (form.len) {
9753         indices[(index++) % 18] = form.str - form_start;
9754 
9755         unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len));
9756         num = num || cat & unicode::N;
9757         cap = cap || cat & unicode::Lut;
9758         dash = dash || cat & unicode::Pd;
9759 
9760         if (index == 10 || (!form.len && index < 10)) {
9761           per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]);
9762           per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]);
9763           per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]);
9764           per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]);
9765           per_form[i].values[PREFIX5] = maps[MAP_PREFIX5].value(form_start, indices[5]);
9766           per_form[i].values[PREFIX6] = maps[MAP_PREFIX6].value(form_start, indices[6]);
9767           per_form[i].values[PREFIX7] = maps[MAP_PREFIX7].value(form_start, indices[7]);
9768           per_form[i].values[PREFIX8] = maps[MAP_PREFIX8].value(form_start, indices[8]);
9769           per_form[i].values[PREFIX9] = maps[MAP_PREFIX9].value(form_start, indices[9]);
9770         }
9771       }
9772       per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index+18-1) % 18], form.str - form_start - indices[(index+18-1) % 18]);
9773       per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index+18-2) % 18], form.str - form_start - indices[(index+18-2) % 18]);
9774       per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index+18-3) % 18], form.str - form_start - indices[(index+18-3) % 18]);
9775       per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index+18-4) % 18], form.str - form_start - indices[(index+18-4) % 18]);
9776       per_form[i].values[SUFFIX5] = maps[MAP_SUFFIX5].value(form_start + indices[(index+18-5) % 18], form.str - form_start - indices[(index+18-5) % 18]);
9777       per_form[i].values[SUFFIX6] = maps[MAP_SUFFIX6].value(form_start + indices[(index+18-6) % 18], form.str - form_start - indices[(index+18-6) % 18]);
9778       per_form[i].values[SUFFIX7] = maps[MAP_SUFFIX7].value(form_start + indices[(index+18-7) % 18], form.str - form_start - indices[(index+18-7) % 18]);
9779       per_form[i].values[SUFFIX8] = maps[MAP_SUFFIX8].value(form_start + indices[(index+18-8) % 18], form.str - form_start - indices[(index+18-8) % 18]);
9780       per_form[i].values[SUFFIX9] = maps[MAP_SUFFIX9].value(form_start + indices[(index+18-9) % 18], form.str - form_start - indices[(index+18-9) % 18]);
9781       per_form[i].values[NUM] = elementary_feature_empty + 1 + num;
9782       per_form[i].values[CAP] = elementary_feature_empty + 1 + cap;
9783       per_form[i].values[DASH] = elementary_feature_empty + 1 + dash;
9784     }
9785   }
9786 }
9787 
9788 template <class Map>
compute_dynamic_features(const tagged_lemma & tag,const per_form_features & per_form,const per_tag_features & per_tag,const dynamic_features * prev_dynamic,dynamic_features & dynamic) const9789 void conllu_elementary_features<Map>::compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const {
9790   if (prev_dynamic) {
9791     dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG];
9792     dynamic.values[PREVIOUS_VERB_FORM] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_FORM];
9793   } else {
9794     dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty;
9795     dynamic.values[PREVIOUS_VERB_FORM] = elementary_feature_empty;
9796   }
9797 
9798   if (tag.tag.size() >= 2 && tag.tag[1] == 'V') {
9799     dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG];
9800     dynamic.values[PREVIOUS_OR_CURRENT_VERB_FORM] = per_form.values[FORM];
9801   } else {
9802     dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG];
9803     dynamic.values[PREVIOUS_OR_CURRENT_VERB_FORM] = dynamic.values[PREVIOUS_VERB_FORM];
9804   }
9805 }
9806 
9807 } // namespace morphodita
9808 
9809 /////////
9810 // File: morphodita/tagger/czech_elementary_features.h
9811 /////////
9812 
9813 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
9814 //
9815 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9816 // Mathematics and Physics, Charles University in Prague, Czech Republic.
9817 //
9818 // This Source Code Form is subject to the terms of the Mozilla Public
9819 // License, v. 2.0. If a copy of the MPL was not distributed with this
9820 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
9821 
9822 namespace morphodita {
9823 
9824 // Declarations
// Elementary features for tagging Czech positional tags: per-form features
// (form, orthography, prefixes/suffixes of length 1..4), per-tag features
// (whole tag, selected tag positions, lemma), and dynamic features tracking
// the previous verb in the sentence.
template <class Map>
class czech_elementary_features : public elementary_features<Map> {
 public:
  czech_elementary_features();

  // Indices into per_form_features::values.
  enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_LEMMA, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, PER_FORM_TOTAL };
  // Indices into per_tag_features::values; TAG3/TAG5 are single tag positions,
  // TAG25 combines the 2nd and 5th characters of the positional tag.
  enum features_per_tag { TAG, TAG3, TAG5, TAG25, LEMMA, PER_TAG_TOTAL };
  // Indices into dynamic_features::values.
  enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_LEMMA, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_LEMMA, DYNAMIC_TOTAL };
  // Indices into `maps`; MAP_NONE marks features without a value map.
  enum features_map { MAP_NONE = -1, MAP_FORM, MAP_LEMMA, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_TAG, MAP_TAG3, MAP_TAG5, MAP_TAG25, MAP_TOTAL } ;

  struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; };
  struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; };
  struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; };

  // Declarative description of every feature (name, kind, allowed offsets,
  // value index, map index) used when loading/storing feature templates.
  static vector<elementary_feature_description> descriptions;

  // Fill per_form/per_tag feature values for a whole sentence.
  void compute_features(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, vector<per_form_features>& per_form, vector<vector<per_tag_features>>& per_tag) const;
  // Fill dynamic feature values for one node given its predecessor (or nullptr).
  inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const;

  using elementary_features<Map>::maps;
};

typedef czech_elementary_features<persistent_elementary_feature_map> persistent_czech_elementary_features;
9848 
9849 // Definitions
9850 template <class Map>
czech_elementary_features()9851 czech_elementary_features<Map>::czech_elementary_features() {
9852   maps.resize(MAP_TOTAL);
9853 }
9854 
// Feature descriptions: {name, kind, allowed offsets, value index, map index}.
// ONLY_CURRENT features may be used at offset 0 only; ANY_OFFSET features at
// any offset a feature template allows.
template <class Map>
vector<elementary_feature_description> czech_elementary_features<Map>::descriptions = {
  {"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM},
  {"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG},
  {"FollowingVerbLemma", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_LEMMA, MAP_LEMMA },
  {"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE},
  {"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE},
  {"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE},
  {"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1},
  {"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2},
  {"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3},
  {"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4},
  {"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1},
  {"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2},
  {"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3},
  {"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4},

  {"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG},
  {"Tag3", PER_TAG, ANY_OFFSET, TAG3, MAP_TAG3},
  {"Tag5", PER_TAG, ANY_OFFSET, TAG5, MAP_TAG5},
  {"Tag25", PER_TAG, ANY_OFFSET, TAG25, MAP_TAG25},
  {"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA},

  {"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG},
  {"PreviousVerbLemma", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_LEMMA, MAP_LEMMA}
};
9881 
9882 template <class Map>
compute_features(const vector<string_piece> & forms,const vector<vector<tagged_lemma>> & analyses,vector<per_form_features> & per_form,vector<vector<per_tag_features>> & per_tag) const9883 void czech_elementary_features<Map>::compute_features(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, vector<per_form_features>& per_form, vector<vector<per_tag_features>>& per_tag) const {
9884   using namespace unilib;
9885 
9886   // We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly.
9887   elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_lemma = elementary_feature_empty;
9888   for (unsigned i = forms.size(); i--;) {
9889     int verb_candidate = -1;
9890 
9891     // Per_tag features and verb_candidate
9892     for (unsigned j = 0; j < analyses[i].size(); j++) {
9893       char tag25[2];
9894       per_tag[i][j].values[TAG] = maps[MAP_TAG].value(analyses[i][j].tag.c_str(), analyses[i][j].tag.size());
9895       per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty;
9896       per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty;
9897       per_tag[i][j].values[TAG25] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG25].value((tag25[0] = analyses[i][j].tag[1], tag25[1] = analyses[i][j].tag[4], tag25), 2) : elementary_feature_empty;
9898       per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] :
9899           maps[MAP_LEMMA].value(analyses[i][j].lemma.c_str(), analyses[i][j].lemma.size());
9900 
9901       if (analyses[i][j].tag[0] == 'V') {
9902         int tag_compare;
9903         verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
9904       }
9905     }
9906 
9907     // Per_form features
9908     per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len);
9909     per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag;
9910     per_form[i].values[FOLLOWING_VERB_LEMMA] = following_verb_lemma;
9911 
9912     // Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}.
9913     if (verb_candidate >= 0) {
9914       following_verb_tag = per_tag[i][verb_candidate].values[TAG];
9915       following_verb_lemma = per_tag[i][verb_candidate].values[LEMMA];
9916     }
9917 
9918     // Ortographic per_form features if needed
9919     if (analyses[i].size() == 1) {
9920       per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown;
9921       per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = per_form[i].values[PREFIX4] = elementary_feature_unknown;
9922       per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = per_form[i].values[SUFFIX4] = elementary_feature_unknown;
9923     } else if (forms[i].len <= 0) {
9924       per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1;
9925       per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = per_form[i].values[PREFIX4] = elementary_feature_empty;
9926       per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = per_form[i].values[SUFFIX4] = elementary_feature_empty;
9927     } else {
9928       string_piece form = forms[i];
9929       const char* form_start = form.str;
9930 
9931       bool num = false, cap = false, dash = false;
9932       size_t indices[8] = {0, form.len, form.len, form.len, form.len, 0, 0, 0}; // careful here regarding forms shorter than 4 characters
9933       int index = 0;
9934       while (form.len) {
9935         indices[(index++)&7] = form.str - form_start;
9936 
9937         unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len));
9938         num = num || cat & unicode::N;
9939         cap = cap || cat & unicode::Lut;
9940         dash = dash || cat & unicode::Pd;
9941 
9942         if (index == 5 || (!form.len && index < 5)) {
9943           per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]);
9944           per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]);
9945           per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]);
9946           per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]);
9947         }
9948       }
9949       per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index-1)&7], form.str - form_start - indices[(index-1)&7]);
9950       per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index-2)&7], form.str - form_start - indices[(index-2)&7]);
9951       per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index-3)&7], form.str - form_start - indices[(index-3)&7]);
9952       per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index-4)&7], form.str - form_start - indices[(index-4)&7]);
9953       per_form[i].values[NUM] = elementary_feature_empty + 1 + num;
9954       per_form[i].values[CAP] = elementary_feature_empty + 1 + cap;
9955       per_form[i].values[DASH] = elementary_feature_empty + 1 + dash;
9956     }
9957   }
9958 }
9959 
9960 template <class Map>
compute_dynamic_features(const tagged_lemma & tag,const per_form_features &,const per_tag_features & per_tag,const dynamic_features * prev_dynamic,dynamic_features & dynamic) const9961 void czech_elementary_features<Map>::compute_dynamic_features(const tagged_lemma& tag, const per_form_features& /*per_form*/, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const {
9962   if (prev_dynamic) {
9963     dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG];
9964     dynamic.values[PREVIOUS_VERB_LEMMA] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_LEMMA];
9965   } else {
9966     dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty;
9967     dynamic.values[PREVIOUS_VERB_LEMMA] = elementary_feature_empty;
9968   }
9969 
9970   if (tag.tag[0] == 'V') {
9971     dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG];
9972     dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = per_tag.values[LEMMA];
9973   } else {
9974     dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG];
9975     dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = dynamic.values[PREVIOUS_VERB_LEMMA];
9976   }
9977 }
9978 
9979 } // namespace morphodita
9980 
9981 /////////
9982 // File: morphodita/tagger/generic_elementary_features.h
9983 /////////
9984 
9985 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
9986 //
9987 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9988 // Mathematics and Physics, Charles University in Prague, Czech Republic.
9989 //
9990 // This Source Code Form is subject to the terms of the Mozilla Public
9991 // License, v. 2.0. If a copy of the MPL was not distributed with this
9992 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
9993 
9994 namespace morphodita {
9995 
9996 // Declarations
// Elementary features for tagging with generic tags: per-form features
// (form, orthography, prefixes/suffixes of length 1..9), per-tag features
// (whole tag, its first five characters individually, lemma), and dynamic
// features tracking the previous verb in the sentence.
template <class Map>
class generic_elementary_features : public elementary_features<Map> {
 public:
  generic_elementary_features();

  // Indices into per_form_features::values.
  enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_LEMMA, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, PREFIX5, PREFIX6, PREFIX7, PREFIX8, PREFIX9, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, SUFFIX5, SUFFIX6, SUFFIX7, SUFFIX8, SUFFIX9, PER_FORM_TOTAL };
  // Indices into per_tag_features::values.
  enum features_per_tag { TAG, TAG1, TAG2, TAG3, TAG4, TAG5, LEMMA, PER_TAG_TOTAL };
  // Indices into dynamic_features::values.
  enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_LEMMA, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_LEMMA, DYNAMIC_TOTAL };
  // Indices into `maps`; MAP_NONE marks features without a value map.
  enum features_map { MAP_NONE = -1, MAP_FORM, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_PREFIX5, MAP_PREFIX6, MAP_PREFIX7, MAP_PREFIX8, MAP_PREFIX9, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_SUFFIX5, MAP_SUFFIX6, MAP_SUFFIX7, MAP_SUFFIX8, MAP_SUFFIX9, MAP_TAG, MAP_TAG1, MAP_TAG2, MAP_TAG3, MAP_TAG4, MAP_TAG5, MAP_LEMMA, MAP_TOTAL } ;

  struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; };
  struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; };
  struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; };

  // Declarative description of every feature (name, kind, allowed offsets,
  // value index, map index) used when loading/storing feature templates.
  static vector<elementary_feature_description> descriptions;

  // Fill per_form/per_tag feature values for a whole sentence.
  void compute_features(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, vector<per_form_features>& per_form, vector<vector<per_tag_features>>& per_tag) const;
  // Fill dynamic feature values for one node given its predecessor (or nullptr).
  inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const;

  using elementary_features<Map>::maps;
};

typedef generic_elementary_features<persistent_elementary_feature_map> persistent_generic_elementary_features;
10020 
10021 // Definitions
10022 template <class Map>
generic_elementary_features()10023 generic_elementary_features<Map>::generic_elementary_features() {
10024   maps.resize(MAP_TOTAL);
10025 }
10026 
// Feature descriptions: {name, kind, allowed offsets, value index, map index}.
// ONLY_CURRENT features may be used at offset 0 only; ANY_OFFSET features at
// any offset a feature template allows.
template <class Map>
vector<elementary_feature_description> generic_elementary_features<Map>::descriptions = {
  {"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM},
  {"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG},
  {"FollowingVerbLemma", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_LEMMA, MAP_LEMMA },
  {"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE},
  {"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE},
  {"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE},
  {"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1},
  {"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2},
  {"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3},
  {"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4},
  {"Prefix5", PER_FORM, ONLY_CURRENT, PREFIX5, MAP_PREFIX5},
  {"Prefix6", PER_FORM, ONLY_CURRENT, PREFIX6, MAP_PREFIX6},
  {"Prefix7", PER_FORM, ONLY_CURRENT, PREFIX7, MAP_PREFIX7},
  {"Prefix8", PER_FORM, ONLY_CURRENT, PREFIX8, MAP_PREFIX8},
  {"Prefix9", PER_FORM, ONLY_CURRENT, PREFIX9, MAP_PREFIX9},
  {"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1},
  {"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2},
  {"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3},
  {"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4},
  {"Suffix5", PER_FORM, ONLY_CURRENT, SUFFIX5, MAP_SUFFIX5},
  {"Suffix6", PER_FORM, ONLY_CURRENT, SUFFIX6, MAP_SUFFIX6},
  {"Suffix7", PER_FORM, ONLY_CURRENT, SUFFIX7, MAP_SUFFIX7},
  {"Suffix8", PER_FORM, ONLY_CURRENT, SUFFIX8, MAP_SUFFIX8},
  {"Suffix9", PER_FORM, ONLY_CURRENT, SUFFIX9, MAP_SUFFIX9},

  {"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG},
  {"Tag1", PER_TAG, ANY_OFFSET, TAG1, MAP_TAG1},
  {"Tag2", PER_TAG, ANY_OFFSET, TAG2, MAP_TAG2},
  {"Tag3", PER_TAG, ANY_OFFSET, TAG3, MAP_TAG3},
  {"Tag4", PER_TAG, ANY_OFFSET, TAG4, MAP_TAG4},
  {"Tag5", PER_TAG, ANY_OFFSET, TAG5, MAP_TAG5},
  {"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA},

  {"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG},
  {"PreviousVerbLemma", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_LEMMA, MAP_LEMMA}
};
10065 
template <class Map>
// Compute all per-form and per-tag elementary feature values for the sentence.
// per_form/per_tag must already be sized by the caller; values are written in place.
void generic_elementary_features<Map>::compute_features(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, vector<per_form_features>& per_form, vector<vector<per_tag_features>>& per_tag) const {
  using namespace unilib;

  // We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly.
  elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_lemma = elementary_feature_empty;
  for (unsigned i = forms.size(); i--;) {
    // Index of the analysis chosen as "the verb" of this word, or -1 when none starts with 'V'.
    int verb_candidate = -1;

    // Per_tag features and verb_candidate
    for (unsigned j = 0; j < analyses[i].size(); j++) {
      // TAG1..TAG5 are the first five characters of the tag, when present.
      per_tag[i][j].values[TAG] = maps[MAP_TAG].value(analyses[i][j].tag.c_str(), analyses[i][j].tag.size());
      per_tag[i][j].values[TAG1] = analyses[i][j].tag.size() >= 1 ? maps[MAP_TAG1].value(analyses[i][j].tag.c_str() + 0, 1) : elementary_feature_empty;
      per_tag[i][j].values[TAG2] = analyses[i][j].tag.size() >= 2 ? maps[MAP_TAG2].value(analyses[i][j].tag.c_str() + 1, 1) : elementary_feature_empty;
      per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty;
      per_tag[i][j].values[TAG4] = analyses[i][j].tag.size() >= 4 ? maps[MAP_TAG4].value(analyses[i][j].tag.c_str() + 3, 1) : elementary_feature_empty;
      per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty;
      // Consecutive analyses frequently share lemmas; reuse the previous mapped value then.
      per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] :
          maps[MAP_LEMMA].value(analyses[i][j].lemma.c_str(), analyses[i][j].lemma.size());

      if (analyses[i][j].tag[0] == 'V') {
        // Deterministically choose the verb analysis with the smallest (tag, lemma) pair.
        int tag_compare;
        verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
      }
    }

    // Per_form features
    per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len);
    per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag;
    per_form[i].values[FOLLOWING_VERB_LEMMA] = following_verb_lemma;

    // Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}.
    if (verb_candidate >= 0) {
      following_verb_tag = per_tag[i][verb_candidate].values[TAG];
      following_verb_lemma = per_tag[i][verb_candidate].values[LEMMA];
    }

    // Ortographic per_form features if needed
    if (analyses[i].size() == 1) {
      // Unambiguous word: orthographic features are never consulted, mark them unknown.
      per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown;
      per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_unknown;
      per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_unknown;
      per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_unknown;
      per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_unknown;
      per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_unknown;
      per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_unknown;
    } else if (forms[i].len <= 0) {
      // Empty form: no characters to inspect, use the "no/empty" values.
      per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1;
      per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_empty;
      per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_empty;
      per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_empty;
      per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_empty;
      per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_empty;
      per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_empty;
    } else {
      string_piece form = forms[i];
      const char* form_start = form.str;

      bool num = false, cap = false, dash = false;
      // indices is a ring buffer of byte offsets of up to the last 18 UTF-8 character starts;
      // slots 1..9 are pre-filled with form.len so that prefixes of short forms cover the whole form.
      size_t indices[18] = {0, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, 0, 0, 0, 0, 0, 0, 0, 0}; // careful here regarding forms shorter than 9 characters
      int index = 0;
      while (form.len) {
        indices[(index++) % 18] = form.str - form_start;

        // utf8::decode advances form.str/form.len by one character.
        unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len));
        num = num || cat & unicode::N;
        cap = cap || cat & unicode::Lut;
        dash = dash || cat & unicode::Pd;

        // Prefixes are fixed after the 10th character (or at the end of a shorter form).
        if (index == 10 || (!form.len && index < 10)) {
          per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]);
          per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]);
          per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]);
          per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]);
          per_form[i].values[PREFIX5] = maps[MAP_PREFIX5].value(form_start, indices[5]);
          per_form[i].values[PREFIX6] = maps[MAP_PREFIX6].value(form_start, indices[6]);
          per_form[i].values[PREFIX7] = maps[MAP_PREFIX7].value(form_start, indices[7]);
          per_form[i].values[PREFIX8] = maps[MAP_PREFIX8].value(form_start, indices[8]);
          per_form[i].values[PREFIX9] = maps[MAP_PREFIX9].value(form_start, indices[9]);
        }
      }
      // Suffix of k characters starts at the k-th most recent character boundary in the ring buffer.
      per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index+18-1) % 18], form.str - form_start - indices[(index+18-1) % 18]);
      per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index+18-2) % 18], form.str - form_start - indices[(index+18-2) % 18]);
      per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index+18-3) % 18], form.str - form_start - indices[(index+18-3) % 18]);
      per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index+18-4) % 18], form.str - form_start - indices[(index+18-4) % 18]);
      per_form[i].values[SUFFIX5] = maps[MAP_SUFFIX5].value(form_start + indices[(index+18-5) % 18], form.str - form_start - indices[(index+18-5) % 18]);
      per_form[i].values[SUFFIX6] = maps[MAP_SUFFIX6].value(form_start + indices[(index+18-6) % 18], form.str - form_start - indices[(index+18-6) % 18]);
      per_form[i].values[SUFFIX7] = maps[MAP_SUFFIX7].value(form_start + indices[(index+18-7) % 18], form.str - form_start - indices[(index+18-7) % 18]);
      per_form[i].values[SUFFIX8] = maps[MAP_SUFFIX8].value(form_start + indices[(index+18-8) % 18], form.str - form_start - indices[(index+18-8) % 18]);
      per_form[i].values[SUFFIX9] = maps[MAP_SUFFIX9].value(form_start + indices[(index+18-9) % 18], form.str - form_start - indices[(index+18-9) % 18]);
      // NUM/CAP/DASH encode the boolean as empty+1 (false) / empty+2 (true).
      per_form[i].values[NUM] = elementary_feature_empty + 1 + num;
      per_form[i].values[CAP] = elementary_feature_empty + 1 + cap;
      per_form[i].values[DASH] = elementary_feature_empty + 1 + dash;
    }
  }
}
10162 
10163 template <class Map>
compute_dynamic_features(const tagged_lemma & tag,const per_form_features &,const per_tag_features & per_tag,const dynamic_features * prev_dynamic,dynamic_features & dynamic) const10164 void generic_elementary_features<Map>::compute_dynamic_features(const tagged_lemma& tag, const per_form_features& /*per_form*/, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const {
10165   if (prev_dynamic) {
10166     dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG];
10167     dynamic.values[PREVIOUS_VERB_LEMMA] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_LEMMA];
10168   } else {
10169     dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty;
10170     dynamic.values[PREVIOUS_VERB_LEMMA] = elementary_feature_empty;
10171   }
10172 
10173   if (tag.tag[0] == 'V') {
10174     dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG];
10175     dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = per_tag.values[LEMMA];
10176   } else {
10177     dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG];
10178     dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = dynamic.values[PREVIOUS_VERB_LEMMA];
10179   }
10180 }
10181 
10182 } // namespace morphodita
10183 
10184 /////////
10185 // File: morphodita/tagger/perceptron_tagger.h
10186 /////////
10187 
10188 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10189 //
10190 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10191 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10192 //
10193 // This Source Code Form is subject to the terms of the Mozilla Public
10194 // License, v. 2.0. If a copy of the MPL was not distributed with this
10195 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10196 
10197 namespace morphodita {
10198 
10199 // Declarations
template<class FeatureSequences>
// Averaged-perceptron tagger: morphological analysis + Viterbi decoding over FeatureSequences.
class perceptron_tagger : public tagger {
 public:
  // decoding_order: order of the Markov model; window_size: Viterbi window width.
  perceptron_tagger(int decoding_order, int window_size);

  // Load dictionary, guesser flag and feature weights from a binary model stream.
  bool load(istream& is);
  virtual const morpho* get_morpho() const override;
  virtual void tag(const vector<string_piece>& forms, vector<tagged_lemma>& tags, morpho::guesser_mode guesser = morpho::guesser_mode(-1)) const override;
  virtual void tag_analyzed(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, vector<int>& tags) const override;

 private:
  int decoding_order, window_size;

  unique_ptr<morpho> dict;
  bool use_guesser;
  FeatureSequences features;
  typedef viterbi<FeatureSequences> viterbi_decoder;
  viterbi_decoder decoder;
  // Reusable per-call buffers, pooled in a thread-safe stack so concurrent
  // tag() calls do not reallocate them.
  struct cache {
    vector<string_piece> forms;
    vector<vector<tagged_lemma>> analyses;
    vector<int> tags;
    typename viterbi_decoder::cache decoder_cache;

    cache(const perceptron_tagger<FeatureSequences>& self) : decoder_cache(self.decoder) {}
  };

  mutable threadsafe_stack<cache> caches;
};
10229 
10230 // Definitions
10231 
// Store decoding parameters and construct the Viterbi decoder over the (not yet loaded) features.
template<class FeatureSequences>
perceptron_tagger<FeatureSequences>::perceptron_tagger(int decoding_order, int window_size)
  : decoding_order(decoding_order), window_size(window_size), decoder(features, decoding_order, window_size) {}
10235 
10236 template<class FeatureSequences>
load(istream & is)10237 bool perceptron_tagger<FeatureSequences>::load(istream& is) {
10238   if (dict.reset(morpho::load(is)), !dict) return false;
10239   use_guesser = is.get();
10240   if (!features.load(is)) return false;
10241   return true;
10242 }
10243 
template<class FeatureSequences>
// Return the embedded morphological dictionary (nullptr before a successful load()).
const morpho* perceptron_tagger<FeatureSequences>::get_morpho() const {
  return dict.get();
}
10248 
template<class FeatureSequences>
// Analyze and tag the given forms, appending one tagged_lemma per form to tags.
// guesser < 0 means "use the model's default guesser setting".
void perceptron_tagger<FeatureSequences>::tag(const vector<string_piece>& forms, vector<tagged_lemma>& tags, morpho::guesser_mode guesser) const {
  tags.clear();
  if (!dict) return;

  // Borrow a reusable buffer set from the pool, or allocate a fresh one.
  cache* c = caches.pop();
  if (!c) c = new cache(*this);

  // Morphologically analyze every form; the stored form length is the raw form
  // length reported by the dictionary (may differ from forms[i].len).
  c->forms.resize(forms.size());
  if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size());
  for (unsigned i = 0; i < forms.size(); i++) {
    c->forms[i] = forms[i];
    c->forms[i].len = dict->raw_form_len(forms[i]);
    dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]);
  }

  // Decode; the tags buffer is grown to twice the needed size to limit future reallocations.
  if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2);
  decoder.tag(c->forms, c->analyses, c->decoder_cache, c->tags);

  // The decoder produces analysis indices; map them back to the tagged lemmas.
  for (unsigned i = 0; i < forms.size(); i++)
    tags.emplace_back(c->analyses[i][c->tags[i]]);

  caches.push(c);
}
10273 
10274 template<class FeatureSequences>
tag_analyzed(const vector<string_piece> & forms,const vector<vector<tagged_lemma>> & analyses,vector<int> & tags) const10275 void perceptron_tagger<FeatureSequences>::tag_analyzed(const vector<string_piece>& forms, const vector<vector<tagged_lemma>>& analyses, vector<int>& tags) const {
10276   tags.clear();
10277 
10278   cache* c = caches.pop();
10279   if (!c) c = new cache(*this);
10280 
10281   tags.resize(forms.size());
10282   decoder.tag(forms, analyses, c->decoder_cache, tags);
10283 
10284   caches.push(c);
10285 }
10286 
10287 } // namespace morphodita
10288 
10289 /////////
10290 // File: morphodita/tagger/tagger_ids.h
10291 /////////
10292 
10293 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10294 //
10295 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10296 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10297 //
10298 // This Source Code Form is subject to the terms of the Mozilla Public
10299 // License, v. 2.0. If a copy of the MPL was not distributed with this
10300 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10301 
10302 namespace morphodita {
10303 
// Numeric identifiers of the tagger variants, as stored in binary model files.
class tagger_ids {
 public:
  enum tagger_id {
    CZECH2 = 0, CZECH3 = 1, CZECH2_3 = 6,
    /* 2 was used internally for ENGLISH3, but never released publicly */
    GENERIC2 = 3, GENERIC3 = 4, GENERIC4 = 5, GENERIC2_3 = 7,
    CONLLU2 = 8, CONLLU2_3 = 9, CONLLU3 = 10,
  };

  // Parse a textual identifier; on success fills id and returns true.
  static bool parse(const std::string& str, tagger_id& id) {
    static const struct { const char* name; tagger_id id; } known[] = {
      {"czech2", CZECH2}, {"czech2_3", CZECH2_3}, {"czech3", CZECH3},
      {"generic2", GENERIC2}, {"generic2_3", GENERIC2_3}, {"generic3", GENERIC3}, {"generic4", GENERIC4},
      {"conllu2", CONLLU2}, {"conllu2_3", CONLLU2_3}, {"conllu3", CONLLU3},
    };
    for (auto&& entry : known)
      if (str == entry.name) return id = entry.id, true;
    return false;
  }

  // Order of the Markov model used during decoding (0 for an unknown id).
  static int decoding_order(tagger_id id) {
    switch (id) {
      case CZECH2: case CZECH2_3: case GENERIC2: case GENERIC2_3: case CONLLU2: case CONLLU2_3: return 2;
      case CZECH3: case GENERIC3: case CONLLU3: return 3;
      case GENERIC4: return 4;
    }
    return 0;
  }

  // Viterbi window size; the *2_3 variants decode with order 2 but window 3,
  // all other variants use a window equal to the decoding order.
  static int window_size(tagger_id id) {
    if (id == CZECH2_3 || id == GENERIC2_3 || id == CONLLU2_3) return 3;
    return decoding_order(id);
  }
};

typedef tagger_ids::tagger_id tagger_id;
10355 
10356 } // namespace morphodita
10357 
10358 /////////
10359 // File: morphodita/tagger/tagger.cpp
10360 /////////
10361 
10362 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10363 //
10364 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10365 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10366 //
10367 // This Source Code Form is subject to the terms of the Mozilla Public
10368 // License, v. 2.0. If a copy of the MPL was not distributed with this
10369 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10370 
10371 namespace morphodita {
10372 
load(istream & is)10373 tagger* tagger::load(istream& is) {
10374   tagger_id id = tagger_id(is.get());
10375   switch (id) {
10376     case tagger_ids::CZECH2:
10377     case tagger_ids::CZECH2_3:
10378     case tagger_ids::CZECH3:
10379       {
10380         auto res = new_unique_ptr<perceptron_tagger<persistent_feature_sequences<persistent_czech_elementary_features>>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id));
10381         if (res->load(is)) return res.release();
10382         break;
10383       }
10384     case tagger_ids::GENERIC2:
10385     case tagger_ids::GENERIC2_3:
10386     case tagger_ids::GENERIC3:
10387     case tagger_ids::GENERIC4:
10388       {
10389         auto res = new_unique_ptr<perceptron_tagger<persistent_feature_sequences<persistent_generic_elementary_features>>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id));
10390         if (res->load(is)) return res.release();
10391         break;
10392       }
10393     case tagger_ids::CONLLU2:
10394     case tagger_ids::CONLLU2_3:
10395     case tagger_ids::CONLLU3:
10396       {
10397         auto res = new_unique_ptr<perceptron_tagger<persistent_feature_sequences<persistent_conllu_elementary_features>>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id));
10398         if (res->load(is)) return res.release();
10399         break;
10400       }
10401   }
10402 
10403   return nullptr;
10404 }
10405 
load(const char * fname)10406 tagger* tagger::load(const char* fname) {
10407   ifstream f(fname, ifstream::binary);
10408   if (!f) return nullptr;
10409 
10410   return load(f);
10411 }
10412 
new_tokenizer() const10413 tokenizer* tagger::new_tokenizer() const {
10414   auto morpho = get_morpho();
10415   return morpho ? morpho->new_tokenizer() : nullptr;
10416 }
10417 
10418 } // namespace morphodita
10419 
10420 /////////
10421 // File: morphodita/tagset_converter/tagset_converter.h
10422 /////////
10423 
10424 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10425 //
10426 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10427 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10428 //
10429 // This Source Code Form is subject to the terms of the Mozilla Public
10430 // License, v. 2.0. If a copy of the MPL was not distributed with this
10431 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10432 
10433 namespace morphodita {
10434 
// Abstract interface converting tag-lemma pairs from one tag set to another.
class tagset_converter {
 public:
  virtual ~tagset_converter() {}

  // Convert a tag-lemma pair to a different tag set.
  virtual void convert(tagged_lemma& tagged_lemma) const = 0;
  // Convert a result of analysis to a different tag set. Apart from calling
  // convert, any repeated entry is removed.
  virtual void convert_analyzed(vector<tagged_lemma>& tagged_lemmas) const = 0;
  // Convert a result of generation to a different tag set. Apart from calling
  // convert, any repeated entry is removed.
  virtual void convert_generated(vector<tagged_lemma_forms>& forms) const = 0;

  // Static factory methods; callers own the returned converter.
  static tagset_converter* new_identity_converter();

  static tagset_converter* new_pdt_to_conll2009_converter();
  static tagset_converter* new_strip_lemma_comment_converter(const morpho& dictionary);
  static tagset_converter* new_strip_lemma_id_converter(const morpho& dictionary);
};

// Helper method for creating tagset_converter from instance name.
tagset_converter* new_tagset_converter(const string& name, const morpho& dictionary);

// Helper methods making sure remapped results are unique.
void tagset_converter_unique_analyzed(vector<tagged_lemma>& tagged_lemmas);
void tagset_converter_unique_generated(vector<tagged_lemma_forms>& forms);
10462 
10463 } // namespace morphodita
10464 
10465 /////////
10466 // File: morphodita/tagset_converter/identity_tagset_converter.h
10467 /////////
10468 
10469 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10470 //
10471 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10472 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10473 //
10474 // This Source Code Form is subject to the terms of the Mozilla Public
10475 // License, v. 2.0. If a copy of the MPL was not distributed with this
10476 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10477 
10478 namespace morphodita {
10479 
// Tagset converter that leaves both tag and lemma unchanged.
class identity_tagset_converter : public tagset_converter {
 public:
  virtual void convert(tagged_lemma& tagged_lemma) const override;
  virtual void convert_analyzed(vector<tagged_lemma>& tagged_lemmas) const override;
  virtual void convert_generated(vector<tagged_lemma_forms>& forms) const override;
};
10486 
10487 } // namespace morphodita
10488 
10489 /////////
10490 // File: morphodita/tagset_converter/identity_tagset_converter.cpp
10491 /////////
10492 
10493 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10494 //
10495 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10496 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10497 //
10498 // This Source Code Form is subject to the terms of the Mozilla Public
10499 // License, v. 2.0. If a copy of the MPL was not distributed with this
10500 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10501 
10502 namespace morphodita {
10503 
// The identity converter performs no conversion, so all methods are deliberately empty.
void identity_tagset_converter::convert(tagged_lemma& /*tagged_lemma*/) const {}

void identity_tagset_converter::convert_analyzed(vector<tagged_lemma>& /*tagged_lemmas*/) const {}

void identity_tagset_converter::convert_generated(vector<tagged_lemma_forms>& /*forms*/) const {}
10509 
10510 } // namespace morphodita
10511 
10512 /////////
10513 // File: morphodita/tagset_converter/pdt_to_conll2009_tagset_converter.h
10514 /////////
10515 
10516 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10517 //
10518 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10519 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10520 //
10521 // This Source Code Form is subject to the terms of the Mozilla Public
10522 // License, v. 2.0. If a copy of the MPL was not distributed with this
10523 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10524 
10525 namespace morphodita {
10526 
// Converts PDT positional tags (and lemmas with additional info) to CoNLL-2009-style
// Name=Value FEAT tags.
class pdt_to_conll2009_tagset_converter : public tagset_converter {
 public:
  virtual void convert(tagged_lemma& tagged_lemma) const override;
  virtual void convert_analyzed(vector<tagged_lemma>& tagged_lemmas) const override;
  virtual void convert_generated(vector<tagged_lemma_forms>& forms) const override;

 private:
  // Rebuild tag as Name=Value pairs; the lemma supplies the optional Sem FEAT.
  inline void convert_tag(const string& lemma, string& tag) const;
  // Strip lemma additional info; returns true when the lemma was shortened.
  inline bool convert_lemma(string& lemma) const;
};
10537 
10538 } // namespace morphodita
10539 
10540 /////////
10541 // File: morphodita/tagset_converter/pdt_to_conll2009_tagset_converter.cpp
10542 /////////
10543 
10544 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10545 //
10546 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10547 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10548 //
10549 // This Source Code Form is subject to the terms of the Mozilla Public
10550 // License, v. 2.0. If a copy of the MPL was not distributed with this
10551 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10552 
10553 namespace morphodita {
10554 
// FEAT names of the 15 positions of a PDT positional tag; the two empty entries
// correspond to positions that carry no named feature here.
static const char* names[15] = {"POS", "SubPOS", "Gen", "Num", "Cas", "PGe", "PNu", "Per", "Ten", "Gra", "Neg", "Voi", "", "", "Var"};
10556 
convert_tag(const string & lemma,string & tag) const10557 inline void pdt_to_conll2009_tagset_converter::convert_tag(const string& lemma, string& tag) const {
10558   char pdt_tag[15];
10559   strncpy(pdt_tag, tag.c_str(), 15);
10560 
10561   // Clear the tag
10562   tag.clear();
10563 
10564   // Fill FEAT of filled tag characters
10565   for (int i = 0; i < 15 && pdt_tag[i]; i++)
10566     if (pdt_tag[i] != '-') {
10567       if (!tag.empty()) tag.push_back('|');
10568       tag.append(names[i]);
10569       tag.push_back('=');
10570       tag.push_back(pdt_tag[i]);
10571     }
10572 
10573   // Try adding Sem FEAT
10574   for (unsigned i = 0; i + 2 < lemma.size(); i++)
10575     if (lemma[i] == '_' && lemma[i + 1] == ';') {
10576       if (!tag.empty()) tag.push_back('|');
10577       tag.append("Sem=");
10578       tag.push_back(lemma[i + 2]);
10579       break;
10580     }
10581 }
10582 
convert_lemma(string & lemma) const10583 inline bool pdt_to_conll2009_tagset_converter::convert_lemma(string& lemma) const {
10584   unsigned raw_lemma = czech_lemma_addinfo::raw_lemma_len(lemma);
10585   return raw_lemma < lemma.size() ? (lemma.resize(raw_lemma), true) : false;
10586 }
10587 
// Convert one tag-lemma pair. The tag must be converted first, because
// convert_tag reads the semantic comment that convert_lemma strips away.
void pdt_to_conll2009_tagset_converter::convert(tagged_lemma& tagged_lemma) const {
  convert_tag(tagged_lemma.lemma, tagged_lemma.tag);
  convert_lemma(tagged_lemma.lemma);
}
10592 
convert_analyzed(vector<tagged_lemma> & tagged_lemmas) const10593 void pdt_to_conll2009_tagset_converter::convert_analyzed(vector<tagged_lemma>& tagged_lemmas) const {
10594   bool lemma_changed = false;
10595 
10596   for (auto&& tagged_lemma : tagged_lemmas) {
10597     convert_tag(tagged_lemma.lemma, tagged_lemma.tag);
10598     lemma_changed |= convert_lemma(tagged_lemma.lemma);
10599   }
10600 
10601   // If no lemma was changed or there is 1 analysis, no duplicates could be created.
10602   if (!lemma_changed || tagged_lemmas.size() < 2) return;
10603 
10604   tagset_converter_unique_analyzed(tagged_lemmas);
10605 }
10606 
convert_generated(vector<tagged_lemma_forms> & forms) const10607 void pdt_to_conll2009_tagset_converter::convert_generated(vector<tagged_lemma_forms>& forms) const {
10608   bool lemma_changed = false;
10609 
10610   for (auto&& tagged_lemma_forms : forms) {
10611     for (auto&& tagged_form : tagged_lemma_forms.forms)
10612       convert_tag(tagged_lemma_forms.lemma, tagged_form.tag);
10613     lemma_changed |= convert_lemma(tagged_lemma_forms.lemma);
10614   }
10615 
10616   // If no lemma was changed or there is 1 analysis, no duplicates could be created.
10617   if (!lemma_changed || forms.size() < 2) return;
10618 
10619   tagset_converter_unique_generated(forms);
10620 }
10621 
10622 } // namespace morphodita
10623 
10624 /////////
10625 // File: morphodita/tagset_converter/strip_lemma_comment_tagset_converter.h
10626 /////////
10627 
10628 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10629 //
10630 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10631 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10632 //
10633 // This Source Code Form is subject to the terms of the Mozilla Public
10634 // License, v. 2.0. If a copy of the MPL was not distributed with this
10635 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10636 
10637 namespace morphodita {
10638 
// Tagset converter that removes lemma comments using the supplied morphology
// dictionary; tags are left untouched. The dictionary must outlive the converter.
class strip_lemma_comment_tagset_converter : public tagset_converter {
 public:
  strip_lemma_comment_tagset_converter(const morpho& dictionary) : dictionary(dictionary) {}

  virtual void convert(tagged_lemma& tagged_lemma) const override;
  virtual void convert_analyzed(vector<tagged_lemma>& tagged_lemmas) const override;
  virtual void convert_generated(vector<tagged_lemma_forms>& forms) const override;

 private:
  // Shorten the lemma to its id part; returns true when the lemma changed.
  inline bool convert_lemma(string& lemma) const;
  const morpho& dictionary;
};
10651 
10652 } // namespace morphodita
10653 
10654 /////////
10655 // File: morphodita/tagset_converter/strip_lemma_comment_tagset_converter.cpp
10656 /////////
10657 
10658 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10659 //
10660 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10661 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10662 //
10663 // This Source Code Form is subject to the terms of the Mozilla Public
10664 // License, v. 2.0. If a copy of the MPL was not distributed with this
10665 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10666 
10667 namespace morphodita {
10668 
convert_lemma(string & lemma) const10669 inline bool strip_lemma_comment_tagset_converter::convert_lemma(string& lemma) const {
10670   unsigned lemma_id_len = dictionary.lemma_id_len(lemma);
10671   return lemma_id_len < lemma.size() ? (lemma.resize(lemma_id_len), true) : false;
10672 }
10673 
// Strip the lemma comment of a single tagged lemma; the tag is left intact.
void strip_lemma_comment_tagset_converter::convert(tagged_lemma& tagged_lemma) const {
  convert_lemma(tagged_lemma.lemma);
}
10677 
convert_analyzed(vector<tagged_lemma> & tagged_lemmas) const10678 void strip_lemma_comment_tagset_converter::convert_analyzed(vector<tagged_lemma>& tagged_lemmas) const {
10679   bool lemma_changed = false;
10680 
10681   for (auto&& tagged_lemma : tagged_lemmas)
10682     lemma_changed |= convert_lemma(tagged_lemma.lemma);
10683 
10684   // If no lemma was changed or there is 1 analysis, no duplicates could be created.
10685   if (!lemma_changed || tagged_lemmas.size() < 2) return;
10686 
10687   tagset_converter_unique_analyzed(tagged_lemmas);
10688 }
10689 
convert_generated(vector<tagged_lemma_forms> & forms) const10690 void strip_lemma_comment_tagset_converter::convert_generated(vector<tagged_lemma_forms>& forms) const {
10691   bool lemma_changed = false;
10692 
10693   for (auto&& tagged_lemma_forms : forms)
10694     lemma_changed |= convert_lemma(tagged_lemma_forms.lemma);
10695 
10696   // If no lemma was changed or there is 1 analysis, no duplicates could be created.
10697   if (!lemma_changed || forms.size() < 2) return;
10698 
10699   tagset_converter_unique_generated(forms);
10700 }
10701 
10702 } // namespace morphodita
10703 
10704 /////////
10705 // File: morphodita/tagset_converter/strip_lemma_id_tagset_converter.h
10706 /////////
10707 
10708 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10709 //
10710 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10711 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10712 //
10713 // This Source Code Form is subject to the terms of the Mozilla Public
10714 // License, v. 2.0. If a copy of the MPL was not distributed with this
10715 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10716 
10717 namespace morphodita {
10718 
// Converter which strips lemma ids using the supplied morphological
// dictionary (the trailing part of a lemma beyond dictionary.raw_lemma_len,
// see convert_lemma in the corresponding .cpp section). Tags are not changed.
class strip_lemma_id_tagset_converter : public tagset_converter {
 public:
  // The dictionary reference is only borrowed; it must outlive this converter.
  strip_lemma_id_tagset_converter(const morpho& dictionary) : dictionary(dictionary) {}

  // Strip the lemma id of a single tagged lemma.
  virtual void convert(tagged_lemma& tagged_lemma) const override;
  // Strip lemma ids of all analyses; duplicates that may arise are removed.
  virtual void convert_analyzed(vector<tagged_lemma>& tagged_lemmas) const override;
  // Strip lemma ids of all generated lemmas; duplicate lemmas that may arise are merged.
  virtual void convert_generated(vector<tagged_lemma_forms>& forms) const override;

 private:
  // Truncates lemma to dictionary.raw_lemma_len(lemma); returns true iff shortened.
  inline bool convert_lemma(string& lemma) const;
  const morpho& dictionary;
};
10731 
10732 } // namespace morphodita
10733 
10734 /////////
10735 // File: morphodita/tagset_converter/strip_lemma_id_tagset_converter.cpp
10736 /////////
10737 
10738 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10739 //
10740 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10741 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10742 //
10743 // This Source Code Form is subject to the terms of the Mozilla Public
10744 // License, v. 2.0. If a copy of the MPL was not distributed with this
10745 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10746 
10747 namespace morphodita {
10748 
convert_lemma(string & lemma) const10749 inline bool strip_lemma_id_tagset_converter::convert_lemma(string& lemma) const {
10750   unsigned raw_lemma_len = dictionary.raw_lemma_len(lemma);
10751   return raw_lemma_len < lemma.size() ? (lemma.resize(raw_lemma_len), true) : false;
10752 }
10753 
// Strip the lemma id of a single tagged lemma; the tag is left intact.
void strip_lemma_id_tagset_converter::convert(tagged_lemma& tagged_lemma) const {
  convert_lemma(tagged_lemma.lemma);
}
10757 
convert_analyzed(vector<tagged_lemma> & tagged_lemmas) const10758 void strip_lemma_id_tagset_converter::convert_analyzed(vector<tagged_lemma>& tagged_lemmas) const {
10759   bool lemma_changed = false;
10760 
10761   for (auto&& tagged_lemma : tagged_lemmas)
10762     lemma_changed |= convert_lemma(tagged_lemma.lemma);
10763 
10764   // If no lemma was changed or there is 1 analysis, no duplicates could be created.
10765   if (!lemma_changed || tagged_lemmas.size() < 2) return;
10766 
10767   tagset_converter_unique_analyzed(tagged_lemmas);
10768 }
10769 
convert_generated(vector<tagged_lemma_forms> & forms) const10770 void strip_lemma_id_tagset_converter::convert_generated(vector<tagged_lemma_forms>& forms) const {
10771   bool lemma_changed = false;
10772 
10773   for (auto&& tagged_lemma_forms : forms)
10774     lemma_changed |= convert_lemma(tagged_lemma_forms.lemma);
10775 
10776   // If no lemma was changed or there is 1 analysis, no duplicates could be created.
10777   if (!lemma_changed || forms.size() < 2) return;
10778 
10779   tagset_converter_unique_generated(forms);
10780 }
10781 
10782 } // namespace morphodita
10783 
10784 /////////
10785 // File: morphodita/tagset_converter/tagset_converter.cpp
10786 /////////
10787 
10788 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10789 //
10790 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10791 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10792 //
10793 // This Source Code Form is subject to the terms of the Mozilla Public
10794 // License, v. 2.0. If a copy of the MPL was not distributed with this
10795 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10796 
10797 namespace morphodita {
10798 
// Factory: return a new identity_tagset_converter; the caller takes ownership.
tagset_converter* tagset_converter::new_identity_converter() {
  return new identity_tagset_converter();
}
10802 
// Factory: return a new pdt_to_conll2009_tagset_converter; the caller takes ownership.
tagset_converter* tagset_converter::new_pdt_to_conll2009_converter() {
  return new pdt_to_conll2009_tagset_converter();
}
10806 
// Factory: return a new strip_lemma_comment_tagset_converter using the given
// dictionary (borrowed, must outlive the converter); caller takes ownership.
tagset_converter* tagset_converter::new_strip_lemma_comment_converter(const morpho& dictionary) {
  return new strip_lemma_comment_tagset_converter(dictionary);
}
10810 
// Factory: return a new strip_lemma_id_tagset_converter using the given
// dictionary (borrowed, must outlive the converter); caller takes ownership.
tagset_converter* tagset_converter::new_strip_lemma_id_converter(const morpho& dictionary) {
  return new strip_lemma_id_tagset_converter(dictionary);
}
10814 
new_tagset_converter(const string & name,const morpho & dictionary)10815 tagset_converter* new_tagset_converter(const string& name, const morpho& dictionary) {
10816   if (name == "pdt_to_conll2009") return tagset_converter::new_pdt_to_conll2009_converter();
10817   if (name == "strip_lemma_comment") return tagset_converter::new_strip_lemma_comment_converter(dictionary);
10818   if (name == "strip_lemma_id") return tagset_converter::new_strip_lemma_id_converter(dictionary);
10819   return nullptr;
10820 }
10821 
tagset_converter_unique_analyzed(vector<tagged_lemma> & tagged_lemmas)10822 void tagset_converter_unique_analyzed(vector<tagged_lemma>& tagged_lemmas) {
10823   // Remove possible lemma-tag pair duplicates
10824   struct tagged_lemma_comparator {
10825     inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; }
10826     inline static bool lt(const tagged_lemma& a, const tagged_lemma& b) { int lemma_compare = a.lemma.compare(b.lemma); return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); }
10827   };
10828 
10829   sort(tagged_lemmas.begin(), tagged_lemmas.end(), tagged_lemma_comparator::lt);
10830   tagged_lemmas.resize(unique(tagged_lemmas.begin(), tagged_lemmas.end(), tagged_lemma_comparator::eq) - tagged_lemmas.begin());
10831 }
10832 
// Regroup the generated forms so that every lemma occurs at most once:
// form lists of equal lemmas are merged, and duplicate form-tag pairs
// inside a merged list are removed.
void tagset_converter_unique_generated(vector<tagged_lemma_forms>& forms) {
  // Regroup and if needed remove duplicate form-tag pairs for each lemma
  for (unsigned i = 0; i < forms.size(); i++) {
    bool any_merged = false;
    // Scan backwards so pop_back is safe; an element swapped in from the
    // end was already examined at a higher j (or merged and removed).
    for (unsigned j = forms.size() - 1; j > i; j--)
      if (forms[j].lemma == forms[i].lemma) {
        // Same lemma was found. Merge form-tag pairs
        for (auto&& tagged_form : forms[j].forms)
          forms[i].forms.emplace_back(move(tagged_form));

        // Remove lemma j by moving it to end and deleting
        if (j < forms.size() - 1) {
          forms[j].lemma.swap(forms[forms.size() - 1].lemma);
          forms[j].forms.swap(forms[forms.size() - 1].forms);
        }
        forms.pop_back();
        any_merged = true;
      }

    if (any_merged && forms[i].forms.size() > 1) {
      // Remove duplicate form-tag pairs
      struct tagged_form_comparator {
        inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; }
        inline static bool lt(const tagged_form& a, const tagged_form& b) { int tag_compare = a.tag.compare(b.tag); return tag_compare < 0 || (tag_compare == 0 && a.form < b.form); }
      };

      // Sort the merged list by (tag, form) and drop adjacent duplicates.
      sort(forms[i].forms.begin(), forms[i].forms.end(), tagged_form_comparator::lt);
      forms[i].forms.resize(unique(forms[i].forms.begin(), forms[i].forms.end(), tagged_form_comparator::eq) - forms[i].forms.begin());
    }
  }
}
10864 
10865 } // namespace morphodita
10866 
10867 /////////
10868 // File: morphodita/tokenizer/czech_tokenizer.cpp
10869 /////////
10870 
10871 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
10872 //
10873 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10874 // Mathematics and Physics, Charles University in Prague, Czech Republic.
10875 //
10876 // This Source Code Form is subject to the terms of the Mozilla Public
10877 // License, v. 2.0. If a copy of the MPL was not distributed with this
10878 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
10879 
10880 namespace morphodita {
10881 
// The tables below are generated by Ragel from the czech_tokenizer grammar
// and drive the finite-state machine in czech_tokenizer::next_sentence.
// They are data only -- do not edit by hand.
static const char _czech_tokenizer_cond_offsets[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2
};

static const char _czech_tokenizer_cond_lengths[] = {
	0, 0, 0, 0, 0, 0, 0, 2,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0
};

static const short _czech_tokenizer_cond_keys[] = {
	43u, 43u, 45u, 45u, 0
};

static const char _czech_tokenizer_cond_spaces[] = {
	1, 0, 0
};

static const unsigned char _czech_tokenizer_key_offsets[] = {
	0, 0, 17, 29, 43, 46, 51, 54,
	89, 94, 98, 101, 105, 110, 111, 116,
	117, 122, 136, 143, 148, 151, 163
};

static const short _czech_tokenizer_trans_keys[] = {
	13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
	133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
	90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
	135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
	39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
	161u, 9u, 10u, 159u, 48u, 57u, 43u, 45u,
	159u, 48u, 57u, 159u, 48u, 57u, 9u, 10u,
	13u, 32u, 33u, 44u, 46u, 47u, 63u, 129u,
	131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u,
	557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u,
	64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u,
	255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u,
	32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u,
	147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u,
	10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u,
	32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u,
	93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u,
	44u, 46u, 69u, 101u, 159u, 48u, 57u, 69u,
	101u, 159u, 48u, 57u, 159u, 48u, 57u, 129u,
	131u, 135u, 151u, 155u, 157u, 65u, 90u, 97u,
	122u, 142u, 143u, 159u, 48u, 57u, 0
};

static const char _czech_tokenizer_single_lengths[] = {
	0, 13, 10, 12, 1, 3, 1, 21,
	5, 4, 3, 4, 5, 1, 5, 1,
	5, 12, 5, 3, 1, 6, 1
};

static const char _czech_tokenizer_range_lengths[] = {
	0, 2, 1, 1, 1, 1, 1, 7,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 1, 1, 1, 1, 3, 1
};

static const unsigned char _czech_tokenizer_index_offsets[] = {
	0, 0, 16, 28, 42, 45, 50, 53,
	82, 88, 93, 97, 102, 108, 110, 116,
	118, 124, 138, 145, 150, 153, 163
};

static const char _czech_tokenizer_indicies[] = {
	1, 1, 2, 2, 2, 2, 2, 3,
	2, 3, 1, 2, 2, 1, 3, 0,
	2, 2, 2, 2, 2, 3, 2, 3,
	2, 2, 3, 0, 4, 4, 5, 5,
	5, 5, 5, 5, 5, 5, 4, 5,
	4, 0, 6, 6, 0, 7, 7, 8,
	8, 0, 8, 8, 0, 10, 11, 12,
	10, 13, 9, 13, 9, 13, 16, 16,
	16, 16, 10, 16, 15, 13, 9, 17,
	9, 17, 9, 15, 9, 16, 9, 16,
	9, 14, 10, 19, 20, 10, 10, 18,
	10, 21, 10, 10, 18, 10, 10, 10,
	18, 10, 21, 10, 10, 18, 10, 22,
	23, 10, 10, 18, 25, 24, 10, 22,
	26, 10, 10, 18, 25, 24, 10, 23,
	26, 10, 10, 18, 4, 4, 5, 5,
	5, 5, 5, 5, 5, 5, 4, 5,
	4, 27, 28, 28, 29, 29, 15, 15,
	27, 29, 29, 6, 6, 27, 8, 8,
	27, 16, 16, 16, 16, 16, 16, 16,
	16, 16, 27, 15, 15, 27, 0
};

static const char _czech_tokenizer_trans_targs[] = {
	7, 1, 2, 7, 1, 3, 19, 6,
	20, 7, 8, 12, 16, 17, 0, 18,
	21, 22, 7, 9, 11, 10, 13, 14,
	7, 7, 15, 7, 4, 5
};

static const char _czech_tokenizer_trans_actions[] = {
	1, 0, 0, 2, 3, 0, 4, 0,
	0, 7, 0, 0, 0, 4, 0, 4,
	0, 0, 8, 0, 0, 0, 0, 0,
	9, 10, 0, 11, 0, 0
};

static const char _czech_tokenizer_to_state_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 5,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0
};

static const char _czech_tokenizer_from_state_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 6,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0
};

static const unsigned char _czech_tokenizer_eof_trans[] = {
	0, 1, 1, 1, 1, 1, 1, 0,
	19, 19, 19, 19, 19, 25, 19, 25,
	19, 28, 28, 28, 28, 28, 28
};

// Initial state of the generated state machine.
static const int czech_tokenizer_start = 7;
11007 
// The list of lower cased words that when preceding eos do not end sentence.
// Note: because of VS, we cannot list the abbreviations directly in UTF-8,
// because the compilation of utf-8 encoded sources fail on some locales
// (e.g., Japanese).
// The \NNN escapes below are therefore octal values of the UTF-8 bytes;
// the following perl one-liners convert between the two representations:
// perl -CSD -ple 'use Encode;s/([^[:ascii:]])/join("", map {sprintf "\\%o", ord($_)} split(m@@, encode("utf-8", $1)))/ge'
// perl -CSD -ple 'use Encode;s/\\([0-7]{3})\\([0-7]{3})/decode("utf-8", chr(oct($1)).chr(oct($2)))/ge'
const unordered_set<string> czech_tokenizer::abbreviations_czech = {
  // Titles
  "prof", "csc", "drsc", "doc", "phd", "ph", "d",
  "judr", "mddr", "mudr", "mvdr", "paeddr", "paedr", "phdr", "rndr", "rsdr", "dr",
  "ing", "arch", "mgr", "bc", "mag", "mba", "bca", "mga",
  "gen", "plk", "pplk", "npor", "por", "ppor", "kpt", "mjr", "sgt", "pls", "p", "s",
  "p", "p\303\255", "fa", "fy", "mr", "mrs", "ms", "miss", "tr", "sv",
  // Geographic names
  "angl", "fr", "\304\215es", "ces", "\304\215s", "cs", "slov", "n\304\233m", "nem", "it", "pol", "ma\304\217", "mad", "rus",
  "sev", "v\303\275ch", "vych", "ji\305\276", "jiz", "z\303\241p", "zap",
  // Common abbrevs
  "adr", "\304\215", "c", "eg", "ev", "g", "hod", "j", "kr", "m", "max", "min", "mj", "nap\305\231", "napr",
  "okr", "pop\305\231", "popr", "pozn", "r", "\305\231", "red", "rep", "resp", "srov", "st", "st\305\231", "str",
  "sv", "tel", "tj", "tzv", "\303\272", "u", "uh", "ul", "um", "zl", "zn",
};
11029 
// The list of lower cased Slovak words that when preceding eos do not end
// sentence. Non-ASCII characters are encoded as octal UTF-8 byte escapes,
// for the same reason as in abbreviations_czech above.
const unordered_set<string> czech_tokenizer::abbreviations_slovak = {
  // Titles
  "prof", "csc", "drsc", "doc", "phd", "ph", "d",
  "judr", "mddr", "mudr", "mvdr", "paeddr", "paedr", "phdr", "rndr", "rsdr", "dr",
  "ing", "arch", "mgr", "bc", "mag", "mba", "bca", "mga",
  "gen", "plk", "pplk", "npor", "por", "ppor", "kpt", "mjr", "sgt", "pls", "p", "s",
  "p", "p\303\255", "fa", "fy", "mr", "mrs", "ms", "miss", "tr", "sv",
  // Geographic names
  "angl", "fr", "\304\215es", "ces", "\304\215s", "cs", "slov", "nem", "it", "po\304\276", "pol", "ma\304\217", "mad",
  "rus", "sev", "v\303\275ch", "vych", "ju\305\276", "juz", "z\303\241p", "zap",
  // Common abbrevs
  "adr", "\304\215", "c", "eg", "ev", "g", "hod", "j", "kr", "m", "max", "min", "mj", "napr",
  "okr", "popr", "pozn", "r", "red", "rep", "resp", "srov", "st", "str",
  "sv", "tel", "tj", "tzv", "\303\272", "u", "uh", "ul", "um", "zl", "zn",
};
11045 
// Construct a Czech/Slovak tokenizer. Tokenizer versions <= 1 map to ragel
// tokenizer version 1, newer versions to 2. The morphological dictionary m
// may be nullptr; it is used only to validate hyphenated compounds (see
// merge_hyphenated, which returns early when m is null).
czech_tokenizer::czech_tokenizer(tokenizer_language language, unsigned version, const morpho* m)
  : ragel_tokenizer(version <= 1 ? 1 : 2), m(m) {
  switch (language) {
    case CZECH:
      abbreviations = &abbreviations_czech;
      break;
    case SLOVAK:
      abbreviations = &abbreviations_slovak;
      break;
    // NOTE(review): no default case -- for any other enum value the
    // abbreviations member is left unset here; confirm that callers only
    // pass CZECH or SLOVAK (or that the member has a safe default).
  }
}
11057 
// If the tail of 'tokens' forms a hyphenated compound (letters-hyphen-letters,
// with one or two hyphens and no intervening whitespace) that the
// morphological dictionary can analyze as a single word, merge those tokens
// into one token.
void czech_tokenizer::merge_hyphenated(vector<token_range>& tokens) {
  using namespace unilib;

  // Without a dictionary we cannot validate compounds; keep tokens as-is.
  if (!m) return;
  // The last token must start with a letter character.
  if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return;

  unsigned matched_hyphens = 0;
  for (unsigned hyphens = 1; hyphens <= 2; hyphens++) {
    // Are the tokens a sequence of 'hyphens' hyphenated tokens?
    if (tokens.size() < 2*hyphens + 1) break;
    unsigned first_hyphen = tokens.size() - 2*hyphens;
    // The candidate separator must be a single punctuation character,
    // directly adjacent to both neighbours (no whitespace between the
    // token ranges), and preceded by a token starting with a letter.
    if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P ||
        tokens[first_hyphen].start + tokens[first_hyphen].length != tokens[first_hyphen + 1].start ||
        tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start ||
        chars[tokens[first_hyphen-1].start].cat & ~unicode::L)
      break;

    // Accept this many hyphens only if the whole span (from the token before
    // the first hyphen to the end of the last token) is analyzable by the
    // dictionary without the guesser.
    if (m->analyze(string_piece(chars[tokens[first_hyphen-1].start].str, chars[tokens.back().start + tokens.back().length].str - chars[tokens[first_hyphen-1].start].str), morpho::NO_GUESSER, lemmas) >= 0)
      matched_hyphens = hyphens;
  }

  if (matched_hyphens) {
    // Replace the matched trailing tokens by a single covering token.
    unsigned first = tokens.size() - 2*matched_hyphens - 1;
    tokens[first].length = tokens.back().start + tokens.back().length - tokens[first].start;
    tokens.resize(first + 1);
  }
}
11085 
// Tokenize the next sentence of the input into 'tokens'; returns true iff
// any tokens were produced. After handling URLs/emails, the body executes a
// Ragel-generated finite-state machine (the tab-indented code below) whose
// embedded actions emit tokens, merge hyphenated compounds, and decide
// end-of-sentence. The generated part must not be edited by hand.
bool czech_tokenizer::next_sentence(vector<token_range>& tokens) {
  using namespace unilib;

  int cs, act;
  size_t ts, te;
  size_t whitespace = 0; // Suppress "may be uninitialized" warning

  // Consume leading URLs/emails; an emergency split ends the sentence early.
  while (tokenize_url_email(tokens))
    if (emergency_sentence_split(tokens))
      return true;

	{
	cs = czech_tokenizer_start;
	ts = 0;
	te = 0;
	act = 0;
	}

	{
	int _klen;
	const short *_keys;
	int _trans;
	short _widec;

	if ( ( current) == ( (chars.size() - 1)) )
		goto _test_eof;
	if ( cs == 0 )
		goto _out;
_resume:
	switch ( _czech_tokenizer_from_state_actions[cs] ) {
	case 6:
	{ts = ( current);}
	break;
	}

	_widec = ( ragel_char(chars[current]));
	_klen = _czech_tokenizer_cond_lengths[cs];
	_keys = _czech_tokenizer_cond_keys + (_czech_tokenizer_cond_offsets[cs]*2);
	if ( _klen > 0 ) {
		const short *_lower = _keys;
		const short *_mid;
		const short *_upper = _keys + (_klen<<1) - 2;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
			if ( _widec < _mid[0] )
				_upper = _mid - 2;
			else if ( _widec > _mid[1] )
				_lower = _mid + 2;
			else {
				switch ( _czech_tokenizer_cond_spaces[_czech_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
	case 0: {
		_widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
		if (
 !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd))  ) _widec += 256;
		break;
	}
	case 1: {
		_widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
		if (
 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+')  ) _widec += 256;
		break;
	}
				}
				break;
			}
		}
	}

	_keys = _czech_tokenizer_trans_keys + _czech_tokenizer_key_offsets[cs];
	_trans = _czech_tokenizer_index_offsets[cs];

	_klen = _czech_tokenizer_single_lengths[cs];
	if ( _klen > 0 ) {
		const short *_lower = _keys;
		const short *_mid;
		const short *_upper = _keys + _klen - 1;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + ((_upper-_lower) >> 1);
			if ( _widec < *_mid )
				_upper = _mid - 1;
			else if ( _widec > *_mid )
				_lower = _mid + 1;
			else {
				_trans += (unsigned int)(_mid - _keys);
				goto _match;
			}
		}
		_keys += _klen;
		_trans += _klen;
	}

	_klen = _czech_tokenizer_range_lengths[cs];
	if ( _klen > 0 ) {
		const short *_lower = _keys;
		const short *_mid;
		const short *_upper = _keys + (_klen<<1) - 2;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
			if ( _widec < _mid[0] )
				_upper = _mid - 2;
			else if ( _widec > _mid[1] )
				_lower = _mid + 2;
			else {
				_trans += (unsigned int)((_mid - _keys)>>1);
				goto _match;
			}
		}
		_trans += _klen;
	}

_match:
	_trans = _czech_tokenizer_indicies[_trans];
_eof_trans:
	cs = _czech_tokenizer_trans_targs[_trans];

	if ( _czech_tokenizer_trans_actions[_trans] == 0 )
		goto _again;

	switch ( _czech_tokenizer_trans_actions[_trans] ) {
	case 3:
	{ whitespace = current; }
	break;
	case 4:
	{te = ( current)+1;}
	break;
	case 7:
	{te = ( current)+1;{ tokens.emplace_back(ts, te - ts);
          merge_hyphenated(tokens);
          current = te;
          do
            if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
          while (tokenize_url_email(tokens));
          ( current)--;
        }}
	break;
	case 2:
	{te = ( current)+1;{
          bool eos = is_eos(tokens, chars[ts].chr, abbreviations);
          for (current = ts; current < whitespace; current++)
            tokens.emplace_back(current, 1);
          {( current) = (( whitespace))-1;}
          if (eos) {( current)++; goto _out; }
        }}
	break;
	case 10:
	{te = ( current)+1;{
          if (!tokens.empty()) {( current)++; goto _out; }
          current = te;
          do
            if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
          while (tokenize_url_email(tokens));
          ( current)--;
        }}
	break;
	case 11:
	{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts);
          merge_hyphenated(tokens);
          current = te;
          do
            if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
          while (tokenize_url_email(tokens));
          ( current)--;
        }}
	break;
	case 8:
	{te = ( current);( current)--;{
          current = te;
          do
            if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
          while (tokenize_url_email(tokens));
          ( current)--;
        }}
	break;
	case 9:
	{te = ( current);( current)--;{
          if (!tokens.empty()) {( current)++; goto _out; }
          current = te;
          do
            if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
          while (tokenize_url_email(tokens));
          ( current)--;
        }}
	break;
	case 1:
	{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts);
          merge_hyphenated(tokens);
          current = te;
          do
            if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
          while (tokenize_url_email(tokens));
          ( current)--;
        }}
	break;
	}

_again:
	switch ( _czech_tokenizer_to_state_actions[cs] ) {
	case 5:
	{ts = 0;}
	break;
	}

	if ( cs == 0 )
		goto _out;
	if ( ++( current) != ( (chars.size() - 1)) )
		goto _resume;
	_test_eof: {}
	if ( ( current) == ( (chars.size() - 1)) )
	{
	if ( _czech_tokenizer_eof_trans[cs] > 0 ) {
		_trans = _czech_tokenizer_eof_trans[cs] - 1;
		goto _eof_trans;
	}
	}

	_out: {}
	}

  (void)act; // Suppress unused variable warning

  return !tokens.empty();
}
11317 
11318 } // namespace morphodita
11319 
11320 /////////
11321 // File: morphodita/tokenizer/english_tokenizer.cpp
11322 /////////
11323 
11324 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
11325 //
11326 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
11327 // Mathematics and Physics, Charles University in Prague, Czech Republic.
11328 //
11329 // This Source Code Form is subject to the terms of the Mozilla Public
11330 // License, v. 2.0. If a copy of the MPL was not distributed with this
11331 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
11332 
11333 namespace morphodita {
11334 
// The list of lowercased words that when preceding eos do not end sentence.
// Grouped into titles and common abbreviations; some entries appear in both
// groups (e.g. "gen", "rep"), which is harmless for a set.
const unordered_set<string> english_tokenizer::abbreviations = {
  // Titles
  "adj", "adm", "adv", "assoc", "asst", "bart", "bldg", "brig", "bros", "capt",
  "cmdr", "col", "comdr", "con", "corp", "cpl", "d", "dr", "dr", "drs", "ens",
  "gen", "gov", "hon", "hosp", "hr", "insp", "lt", "mm", "mr", "mrs", "ms",
  "maj", "messrs", "mlle", "mme", "mr", "mrs", "ms", "msgr", "op", "ord",
  "pfc", "ph", "phd", "prof", "pvt", "rep", "reps", "res", "rev", "rt", "sen",
  "sens", "sfc", "sgt", "sr", "st", "supt", "surg", "univ",
  // Common abbrevs
  "addr", "approx", "apr", "aug", "calif", "co", "corp", "dec", "def", "e",
  "e.g", "eg", "feb", "fla", "ft", "gen", "gov", "hrs", "i.", "i.e", "ie",
  "inc", "jan", "jr", "ltd", "mar", "max", "min", "mph", "mt", "n", "nov",
  "oct", "ont", "pa", "pres", "rep", "rev", "s", "sec", "sen", "sep", "sept",
  "sgt", "sr", "tel", "un", "univ", "v", "va", "vs", "w", "yrs",
};
11351 
// The tables below are generated by Ragel and drive the finite-state machine
// in english_tokenizer::split_token. They are data only -- do not edit by hand.
static const char _english_tokenizer_split_token_key_offsets[] = {
	0, 0, 16, 20, 22, 26, 28, 30,
	32, 34, 36, 44, 46, 50, 52, 54,
	56, 58, 60, 62, 64, 66, 68, 72,
	74, 76, 78, 80, 82, 82
};

static const unsigned char _english_tokenizer_split_token_trans_keys[] = {
	65u, 68u, 69u, 76u, 77u, 78u, 83u, 84u,
	97u, 100u, 101u, 108u, 109u, 110u, 115u, 116u,
	78u, 84u, 110u, 116u, 78u, 110u, 65u, 79u,
	97u, 111u, 87u, 119u, 71u, 103u, 84u, 116u,
	79u, 111u, 39u, 161u, 77u, 82u, 86u, 89u,
	109u, 114u, 118u, 121u, 77u, 109u, 69u, 73u,
	101u, 105u, 76u, 108u, 39u, 161u, 68u, 100u,
	76u, 108u, 39u, 161u, 69u, 101u, 82u, 114u,
	79u, 111u, 77u, 109u, 39u, 79u, 111u, 161u,
	78u, 110u, 78u, 110u, 78u, 110u, 65u, 97u,
	67u, 99u, 0
};

static const char _english_tokenizer_split_token_single_lengths[] = {
	0, 16, 4, 2, 4, 2, 2, 2,
	2, 2, 8, 2, 4, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 4, 2,
	2, 2, 2, 2, 0, 0
};

static const char _english_tokenizer_split_token_range_lengths[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0
};

static const unsigned char _english_tokenizer_split_token_index_offsets[] = {
	0, 0, 17, 22, 25, 30, 33, 36,
	39, 42, 45, 54, 57, 62, 65, 68,
	71, 74, 77, 80, 83, 86, 89, 94,
	97, 100, 103, 106, 109, 110
};

static const char _english_tokenizer_split_token_indicies[] = {
	0, 2, 3, 4, 2, 5, 2, 6,
	0, 2, 3, 4, 2, 5, 2, 6,
	1, 7, 8, 7, 8, 1, 9, 9,
	1, 10, 11, 10, 11, 1, 12, 12,
	1, 12, 12, 1, 13, 13, 1, 11,
	11, 1, 14, 14, 1, 15, 2, 2,
	16, 15, 2, 2, 16, 1, 17, 17,
	1, 18, 11, 18, 11, 1, 12, 12,
	1, 19, 19, 1, 12, 12, 1, 2,
	2, 1, 20, 20, 1, 21, 21, 1,
	22, 22, 1, 23, 23, 1, 12, 12,
	1, 24, 25, 25, 24, 1, 14, 14,
	1, 26, 26, 1, 27, 27, 1, 28,
	28, 1, 12, 12, 1, 1, 1, 0
};

static const char _english_tokenizer_split_token_trans_targs[] = {
	2, 0, 9, 10, 16, 17, 22, 3,
	7, 4, 5, 6, 28, 8, 29, 11,
	14, 12, 13, 15, 18, 19, 20, 21,
	23, 24, 25, 26, 27
};

static const char _english_tokenizer_split_token_trans_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 1,
	1, 0, 0, 0, 0, 0, 2, 1,
	1, 0, 0, 0, 1, 0, 0, 0,
	0, 0, 1, 0, 0
};

static const char _english_tokenizer_split_token_eof_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 3, 0
};

// Initial state of the generated state machine.
static const int english_tokenizer_split_token_start = 1;
11433 
split_token(vector<token_range> & tokens)11434 void english_tokenizer::split_token(vector<token_range>& tokens) {
11435   if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return;
11436 
11437   size_t index = tokens.back().start, end = index + tokens.back().length;
11438   int cs;
11439   size_t split_mark = 0, split_len = 0;
11440 
11441 	{
11442 	cs = english_tokenizer_split_token_start;
11443 	}
11444 
11445 	{
11446 	int _klen;
11447 	const unsigned char *_keys;
11448 	int _trans;
11449 
11450 	if ( ( index) == ( end) )
11451 		goto _test_eof;
11452 	if ( cs == 0 )
11453 		goto _out;
11454 _resume:
11455 	_keys = _english_tokenizer_split_token_trans_keys + _english_tokenizer_split_token_key_offsets[cs];
11456 	_trans = _english_tokenizer_split_token_index_offsets[cs];
11457 
11458 	_klen = _english_tokenizer_split_token_single_lengths[cs];
11459 	if ( _klen > 0 ) {
11460 		const unsigned char *_lower = _keys;
11461 		const unsigned char *_mid;
11462 		const unsigned char *_upper = _keys + _klen - 1;
11463 		while (1) {
11464 			if ( _upper < _lower )
11465 				break;
11466 
11467 			_mid = _lower + ((_upper-_lower) >> 1);
11468 			if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < *_mid )
11469 				_upper = _mid - 1;
11470 			else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > *_mid )
11471 				_lower = _mid + 1;
11472 			else {
11473 				_trans += (unsigned int)(_mid - _keys);
11474 				goto _match;
11475 			}
11476 		}
11477 		_keys += _klen;
11478 		_trans += _klen;
11479 	}
11480 
11481 	_klen = _english_tokenizer_split_token_range_lengths[cs];
11482 	if ( _klen > 0 ) {
11483 		const unsigned char *_lower = _keys;
11484 		const unsigned char *_mid;
11485 		const unsigned char *_upper = _keys + (_klen<<1) - 2;
11486 		while (1) {
11487 			if ( _upper < _lower )
11488 				break;
11489 
11490 			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
11491 			if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < _mid[0] )
11492 				_upper = _mid - 2;
11493 			else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > _mid[1] )
11494 				_lower = _mid + 2;
11495 			else {
11496 				_trans += (unsigned int)((_mid - _keys)>>1);
11497 				goto _match;
11498 			}
11499 		}
11500 		_trans += _klen;
11501 	}
11502 
11503 _match:
11504 	_trans = _english_tokenizer_split_token_indicies[_trans];
11505 	cs = _english_tokenizer_split_token_trans_targs[_trans];
11506 
11507 	if ( _english_tokenizer_split_token_trans_actions[_trans] == 0 )
11508 		goto _again;
11509 
11510 	switch ( _english_tokenizer_split_token_trans_actions[_trans] ) {
11511 	case 1:
11512 	{ split_mark = index - tokens.back().start + 1; }
11513 	break;
11514 	case 2:
11515 	{ split_mark = index - tokens.back().start + 1; }
11516 	{ split_len = split_mark; {( index)++; goto _out; } }
11517 	break;
11518 	}
11519 
11520 _again:
11521 	if ( cs == 0 )
11522 		goto _out;
11523 	if ( ++( index) != ( end) )
11524 		goto _resume;
11525 	_test_eof: {}
11526 	if ( ( index) == ( end) )
11527 	{
11528 	switch ( _english_tokenizer_split_token_eof_actions[cs] ) {
11529 	case 3:
11530 	{ split_len = split_mark; {( index)++; goto _out; } }
11531 	break;
11532 	}
11533 	}
11534 
11535 	_out: {}
11536 	}
11537 
11538   if (split_len && split_len < end) {
11539     tokens.back().length -= split_len;
11540     tokens.emplace_back(end - split_len, split_len);
11541   }
11542 }
11543 
// Machine-generated (Ragel-style) condition tables for the english_tokenizer
// scanner; do not edit by hand.

// Per-state offset into _english_tokenizer_cond_keys.
static const char _english_tokenizer_cond_offsets[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2
};

// Number of condition key ranges per state.
static const char _english_tokenizer_cond_lengths[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 2, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0
};

// Inclusive key ranges with conditions: 43 ('+') and 45 ('-').
static const short _english_tokenizer_cond_keys[] = {
	43u, 43u, 45u, 45u, 0
};

// Condition id per key range; selects a case in the _widec switch of
// next_sentence.
static const char _english_tokenizer_cond_spaces[] = {
	1, 0, 0
};
11565 
// Machine-generated: per-state offset into _english_tokenizer_trans_keys.
static const unsigned char _english_tokenizer_key_offsets[] = {
	0, 0, 17, 29, 43, 46, 49, 52,
	55, 60, 63, 98, 103, 107, 110, 114,
	119, 120, 125, 126, 131, 145, 152, 156,
	161, 164, 179, 192, 206
};

// Machine-generated: transition keys (single keys first, then inclusive key
// ranges per state); values above 255 are condition-widened characters.
static const short _english_tokenizer_trans_keys[] = {
	13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
	133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
	90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
	135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
	39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
	161u, 9u, 10u, 159u, 48u, 57u, 159u, 48u,
	57u, 159u, 48u, 57u, 159u, 48u, 57u, 43u,
	45u, 159u, 48u, 57u, 159u, 48u, 57u, 9u,
	10u, 13u, 32u, 33u, 44u, 46u, 47u, 63u,
	129u, 131u, 135u, 142u, 147u, 157u, 159u, 160u,
	301u, 557u, 811u, 1067u, 0u, 42u, 48u, 57u,
	58u, 64u, 65u, 90u, 91u, 96u, 97u, 122u,
	123u, 255u, 9u, 10u, 13u, 32u, 147u, 9u,
	13u, 32u, 147u, 9u, 32u, 147u, 9u, 10u,
	32u, 147u, 9u, 10u, 13u, 32u, 147u, 13u,
	9u, 10u, 13u, 32u, 147u, 10u, 9u, 10u,
	13u, 32u, 147u, 13u, 32u, 34u, 39u, 41u,
	59u, 93u, 125u, 139u, 141u, 147u, 161u, 9u,
	10u, 44u, 46u, 69u, 101u, 159u, 48u, 57u,
	44u, 46u, 69u, 101u, 69u, 101u, 159u, 48u,
	57u, 159u, 48u, 57u, 39u, 45u, 129u, 131u,
	135u, 151u, 155u, 157u, 161u, 65u, 90u, 97u,
	122u, 142u, 143u, 45u, 129u, 131u, 135u, 151u,
	155u, 157u, 65u, 90u, 97u, 122u, 142u, 143u,
	39u, 129u, 131u, 135u, 151u, 155u, 157u, 161u,
	65u, 90u, 97u, 122u, 142u, 143u, 159u, 48u,
	57u, 0
};
11602 
// Machine-generated: number of single transition keys per state.
static const char _english_tokenizer_single_lengths[] = {
	0, 13, 10, 12, 1, 1, 1, 1,
	3, 1, 21, 5, 4, 3, 4, 5,
	1, 5, 1, 5, 12, 5, 4, 3,
	1, 9, 7, 8, 1
};

// Machine-generated: number of transition key ranges per state.
static const char _english_tokenizer_range_lengths[] = {
	0, 2, 1, 1, 1, 1, 1, 1,
	1, 1, 7, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 1, 1, 0, 1,
	1, 3, 3, 3, 1
};

// Machine-generated: per-state offset into _english_tokenizer_indicies.
static const unsigned char _english_tokenizer_index_offsets[] = {
	0, 0, 16, 28, 42, 45, 48, 51,
	54, 59, 62, 91, 97, 102, 106, 111,
	117, 119, 125, 127, 133, 147, 154, 159,
	164, 167, 180, 191, 203
};
11623 
// Machine-generated: maps a matched key position to a transition index into
// _english_tokenizer_trans_targs / _english_tokenizer_trans_actions.
static const char _english_tokenizer_indicies[] = {
	1, 1, 2, 2, 2, 2, 2, 3,
	2, 3, 1, 2, 2, 1, 3, 0,
	2, 2, 2, 2, 2, 3, 2, 3,
	2, 2, 3, 0, 4, 4, 5, 5,
	5, 5, 5, 5, 5, 5, 4, 5,
	4, 0, 6, 6, 0, 7, 7, 0,
	8, 8, 0, 9, 9, 0, 10, 10,
	11, 11, 0, 11, 11, 0, 13, 14,
	15, 13, 16, 12, 16, 12, 16, 19,
	19, 19, 19, 13, 19, 18, 16, 12,
	20, 12, 20, 12, 18, 12, 19, 12,
	19, 12, 17, 13, 22, 23, 13, 13,
	21, 13, 24, 13, 13, 21, 13, 13,
	13, 21, 13, 24, 13, 13, 21, 13,
	25, 26, 13, 13, 21, 28, 27, 13,
	25, 29, 13, 13, 21, 28, 27, 13,
	26, 29, 13, 13, 21, 4, 4, 5,
	5, 5, 5, 5, 5, 5, 5, 4,
	5, 4, 30, 31, 32, 33, 33, 18,
	18, 30, 31, 32, 33, 33, 30, 33,
	33, 9, 9, 30, 11, 11, 30, 34,
	35, 19, 19, 19, 19, 19, 19, 34,
	19, 19, 19, 30, 35, 19, 19, 19,
	19, 19, 19, 19, 19, 19, 30, 34,
	19, 19, 19, 19, 19, 19, 34, 19,
	19, 19, 30, 18, 18, 30, 0
};
11652 
// Machine-generated: target state of each transition.
static const char _english_tokenizer_trans_targs[] = {
	10, 1, 2, 10, 1, 3, 5, 6,
	22, 23, 9, 24, 10, 11, 15, 19,
	20, 0, 21, 25, 28, 10, 12, 14,
	13, 16, 17, 10, 10, 18, 10, 4,
	7, 8, 26, 27
};

// Machine-generated: action id run on each transition (dispatched by the
// switch in next_sentence).
static const char _english_tokenizer_trans_actions[] = {
	1, 0, 0, 2, 3, 0, 0, 0,
	4, 4, 0, 0, 7, 0, 0, 0,
	4, 0, 4, 0, 0, 8, 0, 0,
	0, 0, 0, 9, 10, 0, 11, 0,
	0, 0, 0, 0
};

// Machine-generated: to-state action per state (5 clears ts in next_sentence).
static const char _english_tokenizer_to_state_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 5, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0
};

// Machine-generated: from-state action per state (6 records ts = current).
static const char _english_tokenizer_from_state_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 6, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0
};

// Machine-generated: EOF transition index plus one per state; 0 means the
// state has no EOF transition.
static const unsigned char _english_tokenizer_eof_trans[] = {
	0, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 0, 22, 22, 22, 22, 22,
	28, 22, 28, 22, 31, 31, 31, 31,
	31, 31, 31, 31, 31
};
11689 
11690 static const int english_tokenizer_start = 10;
11691 
english_tokenizer(unsigned version)11692 english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
11693 
next_sentence(vector<token_range> & tokens)11694 bool english_tokenizer::next_sentence(vector<token_range>& tokens) {
11695   using namespace unilib;
11696 
11697   int cs, act;
11698   size_t ts, te;
11699   size_t whitespace = 0; // Suppress "may be uninitialized" warning
11700 
11701   while (tokenize_url_email(tokens))
11702     if (emergency_sentence_split(tokens))
11703       return true;
11704 
11705 	{
11706 	cs = english_tokenizer_start;
11707 	ts = 0;
11708 	te = 0;
11709 	act = 0;
11710 	}
11711 
11712 	{
11713 	int _klen;
11714 	const short *_keys;
11715 	int _trans;
11716 	short _widec;
11717 
11718 	if ( ( current) == ( (chars.size() - 1)) )
11719 		goto _test_eof;
11720 	if ( cs == 0 )
11721 		goto _out;
11722 _resume:
11723 	switch ( _english_tokenizer_from_state_actions[cs] ) {
11724 	case 6:
11725 	{ts = ( current);}
11726 	break;
11727 	}
11728 
11729 	_widec = ( ragel_char(chars[current]));
11730 	_klen = _english_tokenizer_cond_lengths[cs];
11731 	_keys = _english_tokenizer_cond_keys + (_english_tokenizer_cond_offsets[cs]*2);
11732 	if ( _klen > 0 ) {
11733 		const short *_lower = _keys;
11734 		const short *_mid;
11735 		const short *_upper = _keys + (_klen<<1) - 2;
11736 		while (1) {
11737 			if ( _upper < _lower )
11738 				break;
11739 
11740 			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
11741 			if ( _widec < _mid[0] )
11742 				_upper = _mid - 2;
11743 			else if ( _widec > _mid[1] )
11744 				_lower = _mid + 2;
11745 			else {
11746 				switch ( _english_tokenizer_cond_spaces[_english_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
11747 	case 0: {
11748 		_widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
11749 		if (
11750  !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd))  ) _widec += 256;
11751 		break;
11752 	}
11753 	case 1: {
11754 		_widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
11755 		if (
11756  !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+')  ) _widec += 256;
11757 		break;
11758 	}
11759 				}
11760 				break;
11761 			}
11762 		}
11763 	}
11764 
11765 	_keys = _english_tokenizer_trans_keys + _english_tokenizer_key_offsets[cs];
11766 	_trans = _english_tokenizer_index_offsets[cs];
11767 
11768 	_klen = _english_tokenizer_single_lengths[cs];
11769 	if ( _klen > 0 ) {
11770 		const short *_lower = _keys;
11771 		const short *_mid;
11772 		const short *_upper = _keys + _klen - 1;
11773 		while (1) {
11774 			if ( _upper < _lower )
11775 				break;
11776 
11777 			_mid = _lower + ((_upper-_lower) >> 1);
11778 			if ( _widec < *_mid )
11779 				_upper = _mid - 1;
11780 			else if ( _widec > *_mid )
11781 				_lower = _mid + 1;
11782 			else {
11783 				_trans += (unsigned int)(_mid - _keys);
11784 				goto _match;
11785 			}
11786 		}
11787 		_keys += _klen;
11788 		_trans += _klen;
11789 	}
11790 
11791 	_klen = _english_tokenizer_range_lengths[cs];
11792 	if ( _klen > 0 ) {
11793 		const short *_lower = _keys;
11794 		const short *_mid;
11795 		const short *_upper = _keys + (_klen<<1) - 2;
11796 		while (1) {
11797 			if ( _upper < _lower )
11798 				break;
11799 
11800 			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
11801 			if ( _widec < _mid[0] )
11802 				_upper = _mid - 2;
11803 			else if ( _widec > _mid[1] )
11804 				_lower = _mid + 2;
11805 			else {
11806 				_trans += (unsigned int)((_mid - _keys)>>1);
11807 				goto _match;
11808 			}
11809 		}
11810 		_trans += _klen;
11811 	}
11812 
11813 _match:
11814 	_trans = _english_tokenizer_indicies[_trans];
11815 _eof_trans:
11816 	cs = _english_tokenizer_trans_targs[_trans];
11817 
11818 	if ( _english_tokenizer_trans_actions[_trans] == 0 )
11819 		goto _again;
11820 
11821 	switch ( _english_tokenizer_trans_actions[_trans] ) {
11822 	case 3:
11823 	{ whitespace = current; }
11824 	break;
11825 	case 4:
11826 	{te = ( current)+1;}
11827 	break;
11828 	case 7:
11829 	{te = ( current)+1;{ tokens.emplace_back(ts, te - ts);
11830           split_token(tokens);
11831           current = te;
11832           do
11833             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11834           while (tokenize_url_email(tokens));
11835           ( current)--;
11836         }}
11837 	break;
11838 	case 2:
11839 	{te = ( current)+1;{
11840           bool eos = is_eos(tokens, chars[ts].chr, &abbreviations);
11841           for (current = ts; current < whitespace; current++)
11842             tokens.emplace_back(current, 1);
11843           {( current) = (( whitespace))-1;}
11844           if (eos) {( current)++; goto _out; }
11845         }}
11846 	break;
11847 	case 10:
11848 	{te = ( current)+1;{
11849           if (!tokens.empty()) {( current)++; goto _out; }
11850           current = te;
11851           do
11852             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11853           while (tokenize_url_email(tokens));
11854           ( current)--;
11855         }}
11856 	break;
11857 	case 11:
11858 	{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts);
11859           split_token(tokens);
11860           current = te;
11861           do
11862             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11863           while (tokenize_url_email(tokens));
11864           ( current)--;
11865         }}
11866 	break;
11867 	case 8:
11868 	{te = ( current);( current)--;{
11869           current = te;
11870           do
11871             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11872           while (tokenize_url_email(tokens));
11873           ( current)--;
11874         }}
11875 	break;
11876 	case 9:
11877 	{te = ( current);( current)--;{
11878           if (!tokens.empty()) {( current)++; goto _out; }
11879           current = te;
11880           do
11881             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11882           while (tokenize_url_email(tokens));
11883           ( current)--;
11884         }}
11885 	break;
11886 	case 1:
11887 	{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts);
11888           split_token(tokens);
11889           current = te;
11890           do
11891             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11892           while (tokenize_url_email(tokens));
11893           ( current)--;
11894         }}
11895 	break;
11896 	}
11897 
11898 _again:
11899 	switch ( _english_tokenizer_to_state_actions[cs] ) {
11900 	case 5:
11901 	{ts = 0;}
11902 	break;
11903 	}
11904 
11905 	if ( cs == 0 )
11906 		goto _out;
11907 	if ( ++( current) != ( (chars.size() - 1)) )
11908 		goto _resume;
11909 	_test_eof: {}
11910 	if ( ( current) == ( (chars.size() - 1)) )
11911 	{
11912 	if ( _english_tokenizer_eof_trans[cs] > 0 ) {
11913 		_trans = _english_tokenizer_eof_trans[cs] - 1;
11914 		goto _eof_trans;
11915 	}
11916 	}
11917 
11918 	_out: {}
11919 	}
11920 
11921   (void)act; // Suppress unused variable warning
11922 
11923   return !tokens.empty();
11924 }
11925 
11926 } // namespace morphodita
11927 
11928 /////////
11929 // File: morphodita/tokenizer/generic_tokenizer.cpp
11930 /////////
11931 
11932 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
11933 //
11934 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
11935 // Mathematics and Physics, Charles University in Prague, Czech Republic.
11936 //
11937 // This Source Code Form is subject to the terms of the Mozilla Public
11938 // License, v. 2.0. If a copy of the MPL was not distributed with this
11939 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
11940 
11941 namespace morphodita {
11942 
// Machine-generated (Ragel-style) condition tables for the generic_tokenizer
// scanner; do not edit by hand.

// Per-state offset into _generic_tokenizer_cond_keys.
static const char _generic_tokenizer_cond_offsets[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2
};

// Number of condition key ranges per state.
static const char _generic_tokenizer_cond_lengths[] = {
	0, 0, 0, 0, 0, 0, 0, 2,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0
};

// Inclusive key ranges with conditions: 43 ('+') and 45 ('-').
static const short _generic_tokenizer_cond_keys[] = {
	43u, 43u, 45u, 45u, 0
};

// Condition id per key range; selects a case in the _widec switch of
// next_sentence.
static const char _generic_tokenizer_cond_spaces[] = {
	1, 0, 0
};
11962 
// Machine-generated: per-state offset into _generic_tokenizer_trans_keys.
static const unsigned char _generic_tokenizer_key_offsets[] = {
	0, 0, 17, 29, 43, 46, 51, 54,
	89, 94, 98, 101, 105, 110, 111, 116,
	117, 122, 136, 142, 147, 150, 162
};

// Machine-generated: transition keys (single keys first, then inclusive key
// ranges per state); values above 255 are condition-widened characters.
static const short _generic_tokenizer_trans_keys[] = {
	13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
	133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
	90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
	135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
	39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
	161u, 9u, 10u, 159u, 48u, 57u, 43u, 45u,
	159u, 48u, 57u, 159u, 48u, 57u, 9u, 10u,
	13u, 32u, 33u, 44u, 46u, 47u, 63u, 129u,
	131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u,
	557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u,
	64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u,
	255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u,
	32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u,
	147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u,
	10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u,
	32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u,
	93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u,
	46u, 69u, 101u, 159u, 48u, 57u, 69u, 101u,
	159u, 48u, 57u, 159u, 48u, 57u, 129u, 131u,
	135u, 151u, 155u, 157u, 65u, 90u, 97u, 122u,
	142u, 143u, 159u, 48u, 57u, 0
};
11992 
// Machine-generated: number of single transition keys per state.
static const char _generic_tokenizer_single_lengths[] = {
	0, 13, 10, 12, 1, 3, 1, 21,
	5, 4, 3, 4, 5, 1, 5, 1,
	5, 12, 4, 3, 1, 6, 1
};

// Machine-generated: number of transition key ranges per state.
static const char _generic_tokenizer_range_lengths[] = {
	0, 2, 1, 1, 1, 1, 1, 7,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 1, 1, 1, 1, 3, 1
};

// Machine-generated: per-state offset into _generic_tokenizer_indicies.
static const unsigned char _generic_tokenizer_index_offsets[] = {
	0, 0, 16, 28, 42, 45, 50, 53,
	82, 88, 93, 97, 102, 108, 110, 116,
	118, 124, 138, 144, 149, 152, 162
};

// Machine-generated: maps a matched key position to a transition index into
// _generic_tokenizer_trans_targs / _generic_tokenizer_trans_actions.
static const char _generic_tokenizer_indicies[] = {
	1, 1, 2, 2, 2, 2, 2, 3,
	2, 3, 1, 2, 2, 1, 3, 0,
	2, 2, 2, 2, 2, 3, 2, 3,
	2, 2, 3, 0, 4, 4, 5, 5,
	5, 5, 5, 5, 5, 5, 4, 5,
	4, 0, 6, 6, 0, 7, 7, 8,
	8, 0, 8, 8, 0, 10, 11, 12,
	10, 13, 9, 13, 9, 13, 16, 16,
	16, 16, 10, 16, 15, 13, 9, 17,
	9, 17, 9, 15, 9, 16, 9, 16,
	9, 14, 10, 19, 20, 10, 10, 18,
	10, 21, 10, 10, 18, 10, 10, 10,
	18, 10, 21, 10, 10, 18, 10, 22,
	23, 10, 10, 18, 25, 24, 10, 22,
	26, 10, 10, 18, 25, 24, 10, 23,
	26, 10, 10, 18, 4, 4, 5, 5,
	5, 5, 5, 5, 5, 5, 4, 5,
	4, 27, 28, 29, 29, 15, 15, 27,
	29, 29, 6, 6, 27, 8, 8, 27,
	16, 16, 16, 16, 16, 16, 16, 16,
	16, 27, 15, 15, 27, 0
};
12034 
// Machine-generated: target state of each transition.
static const char _generic_tokenizer_trans_targs[] = {
	7, 1, 2, 7, 1, 3, 19, 6,
	20, 7, 8, 12, 16, 17, 0, 18,
	21, 22, 7, 9, 11, 10, 13, 14,
	7, 7, 15, 7, 4, 5
};

// Machine-generated: action id run on each transition (dispatched by the
// switch in next_sentence).
static const char _generic_tokenizer_trans_actions[] = {
	1, 0, 0, 2, 3, 0, 4, 0,
	0, 7, 0, 0, 0, 4, 0, 4,
	0, 0, 8, 0, 0, 0, 0, 0,
	9, 10, 0, 11, 0, 0
};

// Machine-generated: to-state action per state (5 clears ts in next_sentence).
static const char _generic_tokenizer_to_state_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 5,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0
};

// Machine-generated: from-state action per state (6 records ts = current).
static const char _generic_tokenizer_from_state_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 6,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0
};

// Machine-generated: EOF transition index plus one per state; 0 means the
// state has no EOF transition.
static const unsigned char _generic_tokenizer_eof_trans[] = {
	0, 1, 1, 1, 1, 1, 1, 0,
	19, 19, 19, 19, 19, 25, 19, 25,
	19, 28, 28, 28, 28, 28, 28
};
12066 
12067 static const int generic_tokenizer_start = 7;
12068 
generic_tokenizer(unsigned version)12069 generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
12070 
next_sentence(vector<token_range> & tokens)12071 bool generic_tokenizer::next_sentence(vector<token_range>& tokens) {
12072   using namespace unilib;
12073 
12074   int cs, act;
12075   size_t ts, te;
12076   size_t whitespace = 0; // Suppress "may be uninitialized" warning
12077 
12078   while (tokenize_url_email(tokens))
12079     if (emergency_sentence_split(tokens))
12080       return true;
12081 
12082 	{
12083 	cs = generic_tokenizer_start;
12084 	ts = 0;
12085 	te = 0;
12086 	act = 0;
12087 	}
12088 
12089 	{
12090 	int _klen;
12091 	const short *_keys;
12092 	int _trans;
12093 	short _widec;
12094 
12095 	if ( ( current) == ( (chars.size() - 1)) )
12096 		goto _test_eof;
12097 	if ( cs == 0 )
12098 		goto _out;
12099 _resume:
12100 	switch ( _generic_tokenizer_from_state_actions[cs] ) {
12101 	case 6:
12102 	{ts = ( current);}
12103 	break;
12104 	}
12105 
12106 	_widec = ( ragel_char(chars[current]));
12107 	_klen = _generic_tokenizer_cond_lengths[cs];
12108 	_keys = _generic_tokenizer_cond_keys + (_generic_tokenizer_cond_offsets[cs]*2);
12109 	if ( _klen > 0 ) {
12110 		const short *_lower = _keys;
12111 		const short *_mid;
12112 		const short *_upper = _keys + (_klen<<1) - 2;
12113 		while (1) {
12114 			if ( _upper < _lower )
12115 				break;
12116 
12117 			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
12118 			if ( _widec < _mid[0] )
12119 				_upper = _mid - 2;
12120 			else if ( _widec > _mid[1] )
12121 				_lower = _mid + 2;
12122 			else {
12123 				switch ( _generic_tokenizer_cond_spaces[_generic_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
12124 	case 0: {
12125 		_widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
12126 		if (
12127  !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd))  ) _widec += 256;
12128 		break;
12129 	}
12130 	case 1: {
12131 		_widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
12132 		if (
12133  !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+')  ) _widec += 256;
12134 		break;
12135 	}
12136 				}
12137 				break;
12138 			}
12139 		}
12140 	}
12141 
12142 	_keys = _generic_tokenizer_trans_keys + _generic_tokenizer_key_offsets[cs];
12143 	_trans = _generic_tokenizer_index_offsets[cs];
12144 
12145 	_klen = _generic_tokenizer_single_lengths[cs];
12146 	if ( _klen > 0 ) {
12147 		const short *_lower = _keys;
12148 		const short *_mid;
12149 		const short *_upper = _keys + _klen - 1;
12150 		while (1) {
12151 			if ( _upper < _lower )
12152 				break;
12153 
12154 			_mid = _lower + ((_upper-_lower) >> 1);
12155 			if ( _widec < *_mid )
12156 				_upper = _mid - 1;
12157 			else if ( _widec > *_mid )
12158 				_lower = _mid + 1;
12159 			else {
12160 				_trans += (unsigned int)(_mid - _keys);
12161 				goto _match;
12162 			}
12163 		}
12164 		_keys += _klen;
12165 		_trans += _klen;
12166 	}
12167 
12168 	_klen = _generic_tokenizer_range_lengths[cs];
12169 	if ( _klen > 0 ) {
12170 		const short *_lower = _keys;
12171 		const short *_mid;
12172 		const short *_upper = _keys + (_klen<<1) - 2;
12173 		while (1) {
12174 			if ( _upper < _lower )
12175 				break;
12176 
12177 			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
12178 			if ( _widec < _mid[0] )
12179 				_upper = _mid - 2;
12180 			else if ( _widec > _mid[1] )
12181 				_lower = _mid + 2;
12182 			else {
12183 				_trans += (unsigned int)((_mid - _keys)>>1);
12184 				goto _match;
12185 			}
12186 		}
12187 		_trans += _klen;
12188 	}
12189 
12190 _match:
12191 	_trans = _generic_tokenizer_indicies[_trans];
12192 _eof_trans:
12193 	cs = _generic_tokenizer_trans_targs[_trans];
12194 
12195 	if ( _generic_tokenizer_trans_actions[_trans] == 0 )
12196 		goto _again;
12197 
12198 	switch ( _generic_tokenizer_trans_actions[_trans] ) {
12199 	case 3:
12200 	{ whitespace = current; }
12201 	break;
12202 	case 4:
12203 	{te = ( current)+1;}
12204 	break;
12205 	case 7:
12206 	{te = ( current)+1;{ tokens.emplace_back(ts, te - ts);
12207           current = te;
12208           do
12209             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12210           while (tokenize_url_email(tokens));
12211           ( current)--;
12212         }}
12213 	break;
12214 	case 2:
12215 	{te = ( current)+1;{
12216           bool eos = is_eos(tokens, chars[ts].chr, nullptr);
12217           for (current = ts; current < whitespace; current++)
12218             tokens.emplace_back(current, 1);
12219           {( current) = (( whitespace))-1;}
12220           if (eos) {( current)++; goto _out; }
12221         }}
12222 	break;
12223 	case 10:
12224 	{te = ( current)+1;{
12225           if (!tokens.empty()) {( current)++; goto _out; }
12226           current = te;
12227           do
12228             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12229           while (tokenize_url_email(tokens));
12230           ( current)--;
12231         }}
12232 	break;
12233 	case 11:
12234 	{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts);
12235           current = te;
12236           do
12237             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12238           while (tokenize_url_email(tokens));
12239           ( current)--;
12240         }}
12241 	break;
12242 	case 8:
12243 	{te = ( current);( current)--;{
12244           current = te;
12245           do
12246             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12247           while (tokenize_url_email(tokens));
12248           ( current)--;
12249         }}
12250 	break;
12251 	case 9:
12252 	{te = ( current);( current)--;{
12253           if (!tokens.empty()) {( current)++; goto _out; }
12254           current = te;
12255           do
12256             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12257           while (tokenize_url_email(tokens));
12258           ( current)--;
12259         }}
12260 	break;
12261 	case 1:
12262 	{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts);
12263           current = te;
12264           do
12265             if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12266           while (tokenize_url_email(tokens));
12267           ( current)--;
12268         }}
12269 	break;
12270 	}
12271 
12272 _again:
12273 	switch ( _generic_tokenizer_to_state_actions[cs] ) {
12274 	case 5:
12275 	{ts = 0;}
12276 	break;
12277 	}
12278 
12279 	if ( cs == 0 )
12280 		goto _out;
12281 	if ( ++( current) != ( (chars.size() - 1)) )
12282 		goto _resume;
12283 	_test_eof: {}
12284 	if ( ( current) == ( (chars.size() - 1)) )
12285 	{
12286 	if ( _generic_tokenizer_eof_trans[cs] > 0 ) {
12287 		_trans = _generic_tokenizer_eof_trans[cs] - 1;
12288 		goto _eof_trans;
12289 	}
12290 	}
12291 
12292 	_out: {}
12293 	}
12294 
12295   (void)act; // Suppress unused variable warning
12296 
12297   return !tokens.empty();
12298 }
12299 
12300 } // namespace morphodita
12301 
12302 /////////
12303 // File: morphodita/tokenizer/generic_tokenizer_factory.h
12304 /////////
12305 
12306 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12307 //
12308 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12309 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12310 //
12311 // This Source Code Form is subject to the terms of the Mozilla Public
12312 // License, v. 2.0. If a copy of the MPL was not distributed with this
12313 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12314 
12315 namespace morphodita {
12316 
// Factory creating generic_tokenizer instances configured by a model stream.
class generic_tokenizer_factory : public tokenizer_factory {
 public:
  // Construct a new tokenizer instance.
  virtual tokenizer* new_tokenizer() const override;

  // Load the factory configuration (a single version byte) from the stream;
  // returns false on stream failure.
  bool load(istream& is);
 private:
  // Tokenizer version forwarded to the generic_tokenizer constructor.
  unsigned version;
};
12326 
12327 } // namespace morphodita
12328 
12329 /////////
12330 // File: morphodita/tokenizer/generic_tokenizer_factory.cpp
12331 /////////
12332 
12333 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12334 //
12335 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12336 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12337 //
12338 // This Source Code Form is subject to the terms of the Mozilla Public
12339 // License, v. 2.0. If a copy of the MPL was not distributed with this
12340 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12341 
12342 namespace morphodita {
12343 
new_tokenizer() const12344 tokenizer* generic_tokenizer_factory::new_tokenizer() const {
12345   return new generic_tokenizer(version);
12346 }
12347 
load(istream & is)12348 bool generic_tokenizer_factory::load(istream& is) {
12349   version = is.get();
12350 
12351   return bool(is);
12352 }
12353 
12354 } // namespace morphodita
12355 
12356 /////////
12357 // File: morphodita/tokenizer/generic_tokenizer_factory_encoder.h
12358 /////////
12359 
12360 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12361 //
12362 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
12363 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12364 //
12365 // This Source Code Form is subject to the terms of the Mozilla Public
12366 // License, v. 2.0. If a copy of the MPL was not distributed with this
12367 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12368 
12369 namespace morphodita {
12370 
// Encoder counterpart of generic_tokenizer_factory::load: serializes the
// tokenizer configuration into a model stream.
class generic_tokenizer_factory_encoder {
 public:
  // Write `version` to os (as a single byte, see the implementation).
  static void encode(unsigned version, ostream& os);
};
12375 
12376 } // namespace morphodita
12377 
12378 /////////
12379 // File: morphodita/tokenizer/generic_tokenizer_factory_encoder.cpp
12380 /////////
12381 
12382 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12383 //
12384 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
12385 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12386 //
12387 // This Source Code Form is subject to the terms of the Mozilla Public
12388 // License, v. 2.0. If a copy of the MPL was not distributed with this
12389 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12390 
12391 namespace morphodita {
12392 
encode(unsigned version,ostream & os)12393 void generic_tokenizer_factory_encoder::encode(unsigned version, ostream& os) {
12394   os.put(version);
12395 }
12396 
12397 } // namespace morphodita
12398 
12399 /////////
12400 // File: morphodita/tokenizer/gru_tokenizer_network.h
12401 /////////
12402 
12403 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12404 //
12405 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12406 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12407 //
12408 // This Source Code Form is subject to the terms of the Mozilla Public
12409 // License, v. 2.0. If a copy of the MPL was not distributed with this
12410 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12411 
12412 namespace morphodita {
12413 
12414 // Declarations
12415 
// Abstract interface of the GRU-based tokenizer network. Concrete
// implementations are templated on the embedding/state dimension and are
// created through the static load() factory below.
class gru_tokenizer_network {
 public:
  virtual ~gru_tokenizer_network() {}

  // Fixed-size row-major weight matrix with a bias vector, used both for
  // embeddings (R == 1) and for GRU/projection weights.
  template <int R, int C> struct matrix {
    float w[R][C];  // weights, R rows of C columns
    float b[R];     // per-row bias

    void clear();                     // zero all weights and biases
    void load(binary_decoder& data);  // read R*C weights then R biases
  };

  // Per-character classification outcomes; OUTCOMES is the number of classes.
  enum { NO_SPLIT, END_OF_TOKEN, END_OF_SENTENCE, OUTCOMES };
  struct outcome_t {
    int outcome;             // chosen class, one of the enum values above
    float w[3];              // unnormalized score for each class
    const float* embedding;  // cached input projections of this character's embedding
  };
  // Input character: Unicode code point plus its Unicode general category.
  struct char_info {
    char32_t chr;
    unilib::unicode::category_t cat;

    char_info() {}
    char_info(char32_t chr, unilib::unicode::category_t cat) : chr(chr), cat(cat) {}
  };

  // Classify every character of `chars` into `outcomes`; `outcomes` must
  // already have the same size as `chars`.
  virtual void classify(const vector<char_info>& chars, vector<outcome_t>& outcomes) const = 0;

  // Factory: read the network from `data`, returning nullptr on failure.
  static gru_tokenizer_network* load(binary_decoder& data);
};
12446 
12447 template <int D>
// Concrete GRU tokenizer network with embedding/state dimension D,
// consisting of a forward and a backward GRU whose states are projected
// into the three outcome classes.
template <int D>
class gru_tokenizer_network_implementation : public gru_tokenizer_network {
 public:
  virtual void classify(const vector<char_info>& chars, vector<outcome_t>& outcomes) const override;

  // Read a network of this dimension from `data`.
  static gru_tokenizer_network_implementation<D>* load(binary_decoder& data);

 protected:
  // Precompute cached input projections for all embeddings (see cached_embedding).
  void cache_embeddings();

  // An embedding together with its six precomputed input projections
  // (X, X_r, X_z for both directions), so classify() avoids these matrix products.
  struct cached_embedding {
    matrix<1, D> e;
    matrix<6, D> cache;
  };

  // One GRU direction: input (X*) and recurrent (H*) weights for the
  // candidate, reset and update gates.
  struct gru {
    matrix<D,D> X, X_r, X_z;
    matrix<D,D> H, H_r, H_z;

    void load(binary_decoder& data);
  };

  unordered_map<char32_t, cached_embedding> embeddings;  // per-character embeddings
  cached_embedding empty_embedding;                      // fallback zero embedding
  gru gru_fwd, gru_bwd;                                  // the two GRU directions
  matrix<3, D> projection_fwd, projection_bwd;           // state -> outcome scores
  // Maps a Unicode category to a representative character whose embedding is
  // used for characters unseen during training.
  unordered_map<unilib::unicode::category_t, char32_t> unknown_chars;
};
12475 
12476 // Definitions
12477 
12478 template <int R, int C>
clear()12479 void gru_tokenizer_network::matrix<R, C>::clear() {
12480   for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
12481   fill_n(b, R, 0.f);
12482 }
12483 
12484 template <int R, int C>
load(binary_decoder & data)12485 void gru_tokenizer_network::matrix<R, C>::load(binary_decoder& data) {
12486   for (int i = 0; i < R; i++) copy_n(data.next<float>(C), C, w[i]);
12487   copy_n(data.next<float>(R), R, b);
12488 }
12489 
12490 template <int D>
load(binary_decoder & data)12491 void gru_tokenizer_network_implementation<D>::gru::load(binary_decoder& data) {
12492   X.load(data);
12493   X_r.load(data);
12494   X_z.load(data);
12495   H.load(data);
12496   H_r.load(data);
12497   H_z.load(data);
12498 }
12499 
// Run the bidirectional GRU over `chars` and fill `outcomes` with the chosen
// class (NO_SPLIT / END_OF_TOKEN / END_OF_SENTENCE) for every character.
// `outcomes` must already be sized to chars.size().
template <int D>
void gru_tokenizer_network_implementation<D>::classify(const vector<char_info>& chars, vector<outcome_t>& outcomes) const {
  if (chars.empty()) return;

  // Resolve embeddings, possibly with unknown_chars or empty_embedding
  for (size_t i = 0; i < chars.size(); i++) {
    auto embedding = embeddings.find(chars[i].chr);
    if (embedding != embeddings.end()) {
      outcomes[i].embedding = embedding->second.cache.w[0];
    } else {
      // Unseen character: fall back to the representative of its Unicode
      // category, and to the zero empty_embedding if the category is unknown too.
      auto unknown_char = unknown_chars.find(chars[i].cat);
      if (unknown_char != unknown_chars.end()) embedding = embeddings.find(unknown_char->second);
      outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0];
    }
  }

  // Clear outcome probabilities
  // (initialized with the forward projection biases; the backward projection
  // biases are intentionally not added, matching how the network was trained).
  for (auto&& outcome : outcomes)
    for (int i = 0; i < 3; i++)
      outcome.w[i] = projection_fwd.b[i];

  // Perform forward & backward GRU
  matrix<1, D> state, update, reset, candidate;
  for (int dir = 0; dir < 2; dir++) {
    auto& gru = dir == 0 ? gru_fwd : gru_bwd;
    auto& projection = dir == 0 ? projection_fwd : projection_bwd;

    state.clear();
    for (size_t i = 0; i < outcomes.size(); i++) {
      // Backward direction walks the characters in reverse order.
      auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i];
      // The embedding cache stores [X; X_r; X_z] projections for the forward
      // direction followed by the same three for the backward direction.
      auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D;

      // Update and reset gates (sigmoid activations).
      for (int j = 0; j < D; j++) {
        update.w[0][j] = gru.X_z.b[j] + embedding_cache[2*D + j];
        reset.w[0][j] = gru.X_r.b[j] + embedding_cache[D + j];
        for (int k = 0; k < D; k++) {
          update.w[0][j] += state.w[0][k] * gru.H_z.w[j][k];
          reset.w[0][j] += state.w[0][k] * gru.H_r.w[j][k];
        }
        update.w[0][j] = 1.f / (1.f + exp(-update.w[0][j]));
        reset.w[0][j] = 1.f / (1.f + exp(-reset.w[0][j]));
        reset.w[0][j] *= state.w[0][j];
      }
      // Candidate state (tanh) and gated state update.
      for (int j = 0; j < D; j++) {
        candidate.w[0][j] = gru.X.b[j] + embedding_cache[j];
        for (int k = 0; k < D; k++)
          candidate.w[0][j] += reset.w[0][k] * gru.H.w[j][k];
        candidate.w[0][j] = tanh(candidate.w[0][j]);
        state.w[0][j] = update.w[0][j] * state.w[0][j] + (1.f - update.w[0][j]) * candidate.w[0][j];
      }

      // Accumulate this direction's projection of the state into the scores.
      for (int j = 0; j < 3; j++)
        for (int k = 0; k < D; k++)
          outcome.w[j] += projection.w[j][k] * state.w[0][k];
    }
  }

  // Choose the outcome with the highest weight
  for (auto&& outcome : outcomes) {
    outcome.outcome = outcome.w[1] > outcome.w[0];
    if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2;
  }
}
12563 
// Deserialize a network of dimension D. The binary layout is: number of
// embeddings (4B) followed by (character, D floats) pairs; forward GRU;
// backward GRU; forward projection; backward projection; number of unknown
// categories (1B) followed by (category, character) pairs.
template <int D>
gru_tokenizer_network_implementation<D>* gru_tokenizer_network_implementation<D>::load(binary_decoder& data) {
  unique_ptr<gru_tokenizer_network_implementation<D>> network(new gru_tokenizer_network_implementation<D>());

  for (unsigned chars = data.next_4B(); chars; chars--) {
    auto& embedding = network->embeddings[data.next_4B()];
    copy_n(data.next<float>(D), D, embedding.e.w[0]);
  }
  // The fallback embedding for completely unknown characters is all zeros.
  fill_n(network->empty_embedding.e.w[0], D, 0.f);

  network->gru_fwd.load(data);
  network->gru_bwd.load(data);
  network->projection_fwd.load(data);
  network->projection_bwd.load(data);

  network->unknown_chars.clear();
  for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) {
    unilib::unicode::category_t cat = data.next_4B();
    network->unknown_chars[cat] = data.next_4B();
  }

  // Precompute the input projections of every embedding used by classify().
  network->cache_embeddings();

  return network.release();
}
12589 
12590 template <int D>
cache_embeddings()12591 void gru_tokenizer_network_implementation<D>::cache_embeddings() {
12592   for (auto&& embedding : embeddings) {
12593     auto& e = embedding.second.e;
12594     auto& cache = embedding.second.cache;
12595 
12596     fill_n(cache.w[0], 6*D, 0.f);
12597     for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j];
12598     for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j];
12599     for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j];
12600     for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j];
12601     for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j];
12602     for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j];
12603   }
12604   fill_n(empty_embedding.cache.w[0], 6*D, 0.f);
12605 }
12606 
12607 } // namespace morphodita
12608 
12609 /////////
12610 // File: morphodita/tokenizer/gru_tokenizer.h
12611 /////////
12612 
12613 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12614 //
12615 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12616 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12617 //
12618 // This Source Code Form is subject to the terms of the Mozilla Public
12619 // License, v. 2.0. If a copy of the MPL was not distributed with this
12620 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12621 
12622 namespace morphodita {
12623 
// Tokenizer driven by a GRU network: characters are classified in windows of
// `segment` characters and the outcomes are turned into token/sentence breaks.
class gru_tokenizer : public unicode_tokenizer {
 public:
  // The network reference must outlive this tokenizer (it is owned by the factory).
  gru_tokenizer(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const gru_tokenizer_network& network)
      : unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {}

  virtual bool next_sentence(vector<token_range>& tokens) override;

 private:
  // True for Unicode space separators and CR/LF/TAB.
  inline bool is_space(size_t index);
  // Advance `current` past one classified character and return its outcome,
  // recomputing the classification window when it is exhausted.
  int next_outcome();

  unsigned segment;      // classification window size in characters
  bool allow_spaces;     // whether tokens may contain spaces
  unsigned network_index, network_length;               // position/size of the current window
  vector<gru_tokenizer_network::char_info> network_chars;    // window input characters
  vector<gru_tokenizer_network::outcome_t> network_outcomes; // window classifications
  vector<size_t> network_offsets;                            // text offset of each window entry
  const gru_tokenizer_network& network;
};
12643 
12644 } // namespace morphodita
12645 
12646 /////////
12647 // File: morphodita/tokenizer/gru_tokenizer.cpp
12648 /////////
12649 
12650 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12651 //
12652 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12653 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12654 //
12655 // This Source Code Form is subject to the terms of the Mozilla Public
12656 // License, v. 2.0. If a copy of the MPL was not distributed with this
12657 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12658 
12659 namespace morphodita {
12660 
is_space(size_t index)12661 bool gru_tokenizer::is_space(size_t index) {
12662   return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
12663 }
12664 
// Produce the tokens of the next sentence; returns false when the input is
// exhausted. `current`, `chars` and `emergency_sentence_split` come from the
// unicode_tokenizer base class; chars contains a trailing sentinel, hence the
// ubiquitous `chars.size() - 1` bounds.
bool gru_tokenizer::next_sentence(vector<token_range>& tokens) {
  tokens.clear();

  // Reset tokenizer on new text
  if (current == 0) network_index = network_length = 0;

  // Tokenize until EOS
  for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) {
    // Skip leading spaces (still consuming their network outcomes so that
    // the window position stays in sync with `current`).
    while (current < chars.size() - 1 && is_space(current))
      next_outcome();
    if (current >= chars.size() - 1) break;

    // We have a beginning of a token. Try if it is an URL.
    if (tokenize_url_email(tokens)) {
      // The URL tokenizer moved `current` itself; resynchronize the window.
      while (network_index < network_length && network_offsets[network_index] < current) network_index++;
      continue;
    }

    // Slurp current token
    size_t token_start = current;
    do {
      int outcome = next_outcome();
      eos = outcome == gru_tokenizer_network::END_OF_SENTENCE;
      if (outcome != gru_tokenizer_network::NO_SPLIT) break;
    } while (current < chars.size() - 1);
    tokens.emplace_back(token_start, current - token_start);
  }

  return !tokens.empty();
}
12695 
// Return the network outcome for the character at `current` and advance
// `current` to the next window entry. When the window is exhausted, classify
// the next window of up to `segment` characters first. Must only be called
// while current < chars.size() - 1 (the loop below then guarantees a
// non-empty window).
int gru_tokenizer::next_outcome() {
  if (network_index >= network_length) {
    // Compute required window
    network_index = 0;
    network_length = 0;
    network_chars.clear();
    network_outcomes.clear();
    network_offsets.clear();

    // Prepare data for the classification
    // (note the comma operator: the offset is recorded even for the final,
    // one-past-the-window entry, so network_offsets has network_length + 1 items).
    for (size_t offset = current;
         network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment;
         network_length++, offset++) {
      if (is_space(offset)) {
        // A run of consecutive spaces collapses into a single ' ' entry.
        network_chars.emplace_back(' ', unilib::unicode::Zs);
        while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++;
      } else {
        network_chars.emplace_back(chars[offset].chr, chars[offset].cat);
      }
    }
    // Add a space to the end on the EOD
    if (network_length < segment && network_chars.back().chr != ' ')
      network_chars.emplace_back(' ', unilib::unicode::Zs);
    network_outcomes.resize(network_chars.size());

    // Perform the classification
    network.classify(network_chars, network_outcomes);

    // Add spacing token/sentence breaks
    for (size_t i = 0; i < network_length - 1; i++)
      if (is_space(network_offsets[i+1])) {
        // Detect EOS on the following space or \n\n or \r\n\r\n, or if there is end of text
        bool eos = network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_SENTENCE;
        if (i + 2 == network_length) eos = true;
        for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++)
          eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') ||
                (j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n');
        if (eos) network_outcomes[i].outcome = gru_tokenizer_network::END_OF_SENTENCE;

        if (network_outcomes[i].outcome == gru_tokenizer_network::NO_SPLIT)
          // Force EOT if not allowing spaces, and also detect EOT on the following space
          if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN)
            network_outcomes[i].outcome = gru_tokenizer_network::END_OF_TOKEN;
      }

    // Adjust network_length to suitable break
    // (a full window is truncated at the last split found in its second half,
    // so the next window starts at a token boundary when possible).
    if (network_length == segment && network_length >= 10) {
      network_length -= 5;
      while (network_length > segment / 2)
        if (network_outcomes[--network_length].outcome != gru_tokenizer_network::NO_SPLIT)
          break;
    }
  }
  return current = network_offsets[network_index + 1], network_outcomes[network_index++].outcome;
}
12751 
12752 } // namespace morphodita
12753 
12754 /////////
12755 // File: morphodita/tokenizer/gru_tokenizer_factory.h
12756 /////////
12757 
12758 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12759 //
12760 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12761 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12762 //
12763 // This Source Code Form is subject to the terms of the Mozilla Public
12764 // License, v. 2.0. If a copy of the MPL was not distributed with this
12765 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12766 
12767 namespace morphodita {
12768 
// Factory owning a loaded GRU tokenizer network; produces lightweight
// gru_tokenizer instances that share the network by reference.
class gru_tokenizer_factory : public tokenizer_factory {
 public:
  // Construct a new tokenizer instance.
  virtual tokenizer* new_tokenizer() const override;

  // Load the factory (version byte plus compressed model) from `is`;
  // returns false on malformed or unsupported input.
  bool load(istream& is);

 private:
  unsigned url_email_tokenizer;  // URL/e-mail recognizer version
  unsigned segment;              // classification window size
  bool allow_spaces;             // whether tokens may contain spaces

  unique_ptr<gru_tokenizer_network> network;  // shared by all created tokenizers
};
12783 
12784 } // namespace morphodita
12785 
12786 /////////
12787 // File: morphodita/tokenizer/gru_tokenizer_factory.cpp
12788 /////////
12789 
12790 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12791 //
12792 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12793 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12794 //
12795 // This Source Code Form is subject to the terms of the Mozilla Public
12796 // License, v. 2.0. If a copy of the MPL was not distributed with this
12797 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12798 
12799 namespace morphodita {
12800 
new_tokenizer() const12801 tokenizer* gru_tokenizer_factory::new_tokenizer() const {
12802   return new gru_tokenizer(url_email_tokenizer, segment, allow_spaces, *network);
12803 }
12804 
// Load the factory from `is`: an uncompressed version byte (1 or 2) followed
// by a compressed block containing the tokenizer options and the network.
// Returns false on any I/O, version or format error.
bool gru_tokenizer_factory::load(istream& is) {
  char version;
  if (!is.get(version)) return false;
  if (!(version >= 1 && version <= 2)) return false;

  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  try {
    url_email_tokenizer = data.next_1B();
    segment = data.next_2B();
    allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/;

    network.reset(gru_tokenizer_network::load(data));
    if (!network) return false;
  } catch (binary_decoder_error&) {
    // Truncated or corrupted model data.
    return false;
  }

  // All data must have been consumed, otherwise the model is malformed.
  return data.is_end();
}
12826 
12827 } // namespace morphodita
12828 
12829 /////////
12830 // File: morphodita/tokenizer/gru_tokenizer_network.cpp
12831 /////////
12832 
12833 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12834 //
12835 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12836 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12837 //
12838 // This Source Code Form is subject to the terms of the Mozilla Public
12839 // License, v. 2.0. If a copy of the MPL was not distributed with this
12840 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12841 
12842 namespace morphodita {
12843 
load(binary_decoder & data)12844 gru_tokenizer_network* gru_tokenizer_network::load(binary_decoder& data) {
12845   if (data.next_1B() != 1) return nullptr;
12846   switch (data.next_1B()) {
12847     case 16: return gru_tokenizer_network_implementation<16>::load(data);
12848     case 24: return gru_tokenizer_network_implementation<24>::load(data);
12849     case 64: return gru_tokenizer_network_implementation<64>::load(data);
12850   }
12851   return nullptr;
12852 }
12853 
12854 } // namespace morphodita
12855 
12856 /////////
12857 // File: morphodita/tokenizer/gru_tokenizer_trainer.h
12858 /////////
12859 
12860 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12861 //
12862 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12863 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12864 //
12865 // This Source Code Form is subject to the terms of the Mozilla Public
12866 // License, v. 2.0. If a copy of the MPL was not distributed with this
12867 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12868 
12869 namespace morphodita {
12870 
// One training/heldout sentence: the full text in UTF-32 and the gold token
// ranges (offsets into `sentence`).
struct tokenized_sentence {
  u32string sentence;
  vector<token_range> tokens;
};
12875 
// Entry point for training a GRU tokenizer model.
class gru_tokenizer_trainer {
 public:
  enum { URL_EMAIL_LATEST = unicode_tokenizer::URL_EMAIL_LATEST };

  // Train a tokenizer with the given hyperparameters on `data` (optionally
  // early-stopping on `heldout`), writing the resulting model to `os`.
  // Returns false and fills `error` on failure.
  static bool train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned dimension, unsigned epochs,
                    unsigned batch_size, float learning_rate, float learning_rate_final, float dropout,
                    float initialization_range, bool early_stopping, const vector<tokenized_sentence>& data,
                    const vector<tokenized_sentence>& heldout, ostream& os, string& error);
};
12885 
12886 } // namespace morphodita
12887 
12888 /////////
12889 // File: morphodita/tokenizer/gru_tokenizer_network_trainer.h
12890 /////////
12891 
12892 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
12893 //
12894 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12895 // Mathematics and Physics, Charles University in Prague, Czech Republic.
12896 //
12897 // This Source Code Form is subject to the terms of the Mozilla Public
12898 // License, v. 2.0. If a copy of the MPL was not distributed with this
12899 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
12900 
12901 namespace morphodita {
12902 
12903 //
12904 // Declarations
12905 //
12906 
// Trainer for a D-dimensional GRU tokenizer network. Inherits the network
// parameters from the implementation class and adds per-parameter gradient
// and moment accumulators plus the saved per-timestep activations needed for
// backpropagation through time.
template <int D>
class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation<D> {
 public:
  // Train this network in place and serialize it into `enc`; returns false
  // and fills `error` on failure.
  bool train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned epochs, unsigned batch_size,
             float learning_rate, float learning_rate_final, float dropout, float initialization_range,
             bool early_stopping, const vector<tokenized_sentence>& data, const vector<tokenized_sentence>& heldout,
             binary_encoder& enc, string& error);

 private:
  template <int R, int C> using matrix = gru_tokenizer_network::matrix<R, C>;
  using typename gru_tokenizer_network_implementation<D>::cached_embedding;
  using typename gru_tokenizer_network_implementation<D>::gru;

  // Wraps a network matrix with gradient (w_g/b_g) and first/second moment
  // (w_m/b_m, w_v/b_v) accumulators for the weight updates.
  template <int R, int C> struct matrix_trainer {
    matrix<R, C>& original;
    float w_g[R][C], b_g[R];
    float w_m[R][C], b_m[R];
    float w_v[R][C], b_v[R];

    // Accumulators are zero-initialized; `original` is the live network matrix.
    matrix_trainer(matrix<R, C>& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
    void update_weights(float learning_rate);
  };
  // Trainer for one GRU direction; keeps the per-timestep activations
  // (states has segment + 1 entries including the initial state).
  struct gru_trainer {
    matrix_trainer<D,D> X, X_r, X_z;
    matrix_trainer<D,D> H, H_r, H_z;
    vector<matrix<1, D>> states, updates, resets, resetstates, candidates, dropouts;

    gru_trainer(gru& g, unsigned segment)
        : X(g.X), X_r(g.X_r), X_z(g.X_z), H(g.H), H_r(g.H_r), H_z(g.H_z), states(segment + 1),
        updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
    void update_weights(float learning_rate);
  };

  // Token/sentence-level precision, recall and F1 on heldout data.
  struct f1_info { double precision, recall, f1; };
  void evaluate(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const vector<tokenized_sentence>& heldout,
                f1_info& tokens_f1, f1_info& sentences_f1);
  void evaluate_f1(const vector<token_range>& system, const vector<token_range>& gold, f1_info& f1);

  // Random initialization helpers.
  template <int R, int C> void random_matrix(matrix<R,C>& m, mt19937& generator, float range, float bias);
  void random_gru(gru& g,  mt19937& generator, float range);

  // Serialization helpers.
  template <int R, int C> void save_matrix(const matrix<R,C>& m, binary_encoder& enc);
  void save_gru(const gru& g, binary_encoder& enc);
};
12951 
12952 //
12953 // Definitions
12954 //
12955 
12956 template <int D>
train(unsigned url_email_tokenizer,unsigned segment,bool allow_spaces,unsigned epochs,unsigned batch_size,float learning_rate_initial,float learning_rate_final,float dropout,float initialization_range,bool early_stopping,const vector<tokenized_sentence> & data,const vector<tokenized_sentence> & heldout,binary_encoder & enc,string & error)12957 bool gru_tokenizer_network_trainer<D>::train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned epochs, unsigned batch_size,
12958                                              float learning_rate_initial, float learning_rate_final, float dropout,
12959                                              float initialization_range, bool early_stopping, const vector<tokenized_sentence>& data,
12960                                              const vector<tokenized_sentence>& heldout, binary_encoder& enc, string& error) {
12961   if (segment < 10) return error.assign("Segment size must be at least 10!"), false;
12962 
12963   unsigned characters = 0;
12964   for (auto&& sentence : data)
12965     characters += sentence.sentence.size();
12966   if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false;
12967 
12968   mt19937 generator;
12969 
12970   float dropout_multiplier = 1.f / (1.f - dropout);
12971   bernoulli_distribution dropout_distribution(dropout);
12972 
12973   // Generate embeddings
12974   for (auto&& sentence : data)
12975     for (auto&& chr : sentence.sentence)
12976       if (!this->embeddings.count(chr)) {
12977         cached_embedding embedding;
12978         random_matrix(embedding.e, generator, initialization_range, 0.f);
12979         this->embeddings.emplace(chr, embedding);
12980       }
12981   this->empty_embedding.e.clear();
12982 
12983   // Initialize weights
12984   random_gru(this->gru_fwd, generator, initialization_range);
12985   random_gru(this->gru_bwd, generator, initialization_range);
12986   random_matrix(this->projection_fwd, generator, initialization_range, 0.f); this->projection_fwd.b[this->NO_SPLIT] = 1.f;
12987   random_matrix(this->projection_bwd, generator, initialization_range, 0.f); this->projection_bwd.b[this->NO_SPLIT] = 1.f;
12988 
12989   // Train the network
12990   unordered_map<char32_t, matrix_trainer<1, D>> embeddings;
12991   for (auto&& embedding : this->embeddings)
12992     embeddings.emplace(embedding.first, embedding.second.e);
12993   vector<matrix_trainer<1, D>*> chosen_embeddings(segment);
12994   vector<matrix<1, D>> embedding_dropouts(segment);
12995   gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment);
12996   matrix_trainer<3, D> projection_fwd(this->projection_fwd), projection_bwd(this->projection_bwd);
12997   float learning_rate = learning_rate_initial, b1t = 1.f, b2t = 1.f;
12998 
12999   float best_combined_f1 = 0.f; unsigned best_combined_f1_epoch = 0;
13000   gru_tokenizer_network_trainer<D> best_combined_f1_network;
13001 
13002   size_t training_offset = 0, training_shift;
13003   vector<gru_tokenizer_network::char_info> training_input, instance_input(segment);
13004   vector<gru_tokenizer_network::outcome_t> training_output, instance_output(segment);
13005   vector<int> permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size());
13006   for (unsigned epoch = 0; epoch < epochs; epoch++) {
13007     double logprob = 0;
13008     int total = 0, correct = 0;
13009 
13010     for (int instance = 0, instances = 10000; instance < instances; instance++) {
13011       // Prepare input instance
13012       if (training_offset + segment >= training_input.size()) {
13013         shuffle(permutation.begin(), permutation.end(), generator);
13014         training_input.clear(); training_output.clear();
13015         for (auto&& index : permutation) {
13016           auto& sentence = data[index];
13017           if (sentence.tokens.empty()) continue;
13018 
13019           training_offset = training_input.size();
13020           training_input.resize(training_offset + sentence.sentence.size());
13021           training_output.resize(training_offset + sentence.sentence.size());
13022           for (size_t i = 0; i < sentence.sentence.size(); i++) {
13023             training_input[training_offset + i].chr = sentence.sentence[i];
13024             training_output[training_offset + i].outcome = gru_tokenizer_network::NO_SPLIT;
13025           }
13026           for (size_t i = 0; i < sentence.tokens.size(); i++)
13027             training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome =
13028                 i+1 < sentence.tokens.size() ? gru_tokenizer_network::END_OF_TOKEN : gru_tokenizer_network::END_OF_SENTENCE;
13029         }
13030         training_offset = 0;
13031       }
13032       copy_n(training_input.begin() + training_offset, segment, instance_input.begin());
13033       copy_n(training_output.begin() + training_offset, segment, instance_output.begin());
13034 
13035       // Shift training_offset
13036       for (training_shift = segment - 5; training_shift > segment / 2; training_shift--)
13037         if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
13038           break;
13039       training_offset += training_shift;
13040 
13041       // Forward pass
13042       for (unsigned i = 0; i < segment; i++) {
13043         chosen_embeddings[i] = &embeddings.at(instance_input[i].chr);
13044         for (unsigned k = 0; k < D; k++)
13045           embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier;
13046         for (int j = 0; j < 3; j++)
13047           instance_output[i].w[j] = projection_fwd.original.b[j];
13048       }
13049 
13050       for (int dir = 0; dir < 2; dir++) {
13051         auto& gru = dir == 0 ? gru_fwd : gru_bwd;
13052         auto& projection = dir == 0 ? projection_fwd : projection_bwd;
13053 
13054         gru.states[0].clear();
13055         for (size_t i = 0; i < segment; i++) {
13056           auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
13057           auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
13058           auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
13059 
13060           for (int j = 0; j < D; j++) {
13061             gru.updates[i].w[0][j] = gru.X_z.original.b[j];
13062             gru.resets[i].w[0][j] = gru.X_r.original.b[j];
13063             for (int k = 0; k < D; k++) {
13064               gru.updates[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X_z.original.w[j][k] + gru.states[i].w[0][k] * gru.H_z.original.w[j][k];
13065               gru.resets[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X_r.original.w[j][k] + gru.states[i].w[0][k] * gru.H_r.original.w[j][k];
13066             }
13067             gru.updates[i].w[0][j] = 1.f / (1.f + exp(-gru.updates[i].w[0][j]));
13068             gru.resets[i].w[0][j] = 1.f / (1.f + exp(-gru.resets[i].w[0][j]));
13069             gru.resetstates[i].w[0][j] = gru.resets[i].w[0][j] * gru.states[i].w[0][j];
13070           }
13071           for (int j = 0; j < D; j++) {
13072             gru.candidates[i].w[0][j] = gru.X.original.b[j];
13073             for (int k = 0; k < D; k++)
13074               gru.candidates[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X.original.w[j][k] + gru.resetstates[i].w[0][k] * gru.H.original.w[j][k];
13075             gru.candidates[i].w[0][j] = tanh(gru.candidates[i].w[0][j]);
13076             gru.states[i+1].w[0][j] = gru.updates[i].w[0][j] * gru.states[i].w[0][j] + (1.f - gru.updates[i].w[0][j]) * gru.candidates[i].w[0][j];
13077           }
13078 
13079           for (int j = 0; j < D; j++)
13080             gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j];
13081 
13082           for (int j = 0; j < 3; j++)
13083             for (int k = 0; k < D; k++)
13084               output.w[j] += projection.original.w[j][k] * gru.dropouts[i].w[0][k];
13085         }
13086       }
13087 
13088       for (auto&& output : instance_output) {
13089         int best = output.w[1] > output.w[0];
13090         if (output.w[2] > output.w[best]) best = 2;
13091         float maximum = output.w[best], sum = 0;
13092         for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum));
13093         sum = 1.f / sum;
13094         for (int j = 0; j < 3; j++) output.w[j] *= sum;
13095 
13096         total++;
13097         correct += best == output.outcome;
13098         logprob += log(output.w[output.outcome]);
13099       }
13100 
13101       // Backward pass
13102       for (auto&& output : instance_output)
13103         for (int j = 0; j < 3; j++)
13104           output.w[j] = (output.outcome == j) - output.w[j];
13105 
13106       for (int dir = 0; dir < 2; dir++) {
13107         auto& gru = dir == 0 ? gru_fwd : gru_bwd;
13108         auto& projection = dir == 0 ? projection_fwd : projection_bwd;
13109 
13110         matrix<1, D> state_g, update_g, candidate_g, reset_g, resetstate_g;
13111         state_g.clear();
13112         for (size_t i = segment; i--; ) {
13113           auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
13114           auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
13115           auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
13116 
13117           for (int j = 0; j < D; j++) // These for cycles are swapped because
13118             for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise.
13119               projection.w_g[k][j] += gru.dropouts[i].w[0][j] * output.w[k];
13120 
13121           for (int j = 0; j < D; j++)
13122             if (gru.dropouts[i].w[0][j])
13123               for (int k = 0; k < 3; k++)
13124                 state_g.w[0][j] += projection.original.w[k][j] * output.w[k];
13125 
13126           resetstate_g.clear();
13127           for (int j = 0; j < D; j++) {
13128             update_g.w[0][j] = state_g.w[0][j] * (gru.states[i].w[0][j] - gru.candidates[i].w[0][j]);
13129             candidate_g.w[0][j] = state_g.w[0][j] * (1.f - gru.updates[i].w[0][j]);
13130             state_g.w[0][j] = state_g.w[0][j] * gru.updates[i].w[0][j];
13131 
13132             candidate_g.w[0][j] *= 1 - gru.candidates[i].w[0][j] * gru.candidates[i].w[0][j];
13133             gru.X.b_g[j] += candidate_g.w[0][j];
13134             for (int k = 0; k < D; k++) {
13135               gru.X.w_g[j][k] += candidate_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k];
13136               gru.H.w_g[j][k] += candidate_g.w[0][j] * gru.resetstates[i].w[0][k];
13137               embedding->w_g[0][k] += embedding_dropout.w[0][k] * candidate_g.w[0][j] * gru.X.original.w[j][k];
13138               resetstate_g.w[0][k] += candidate_g.w[0][j] * gru.H.original.w[j][k];
13139             }
13140           }
13141           for (int j = 0; j < D; j++) {
13142             state_g.w[0][j] += resetstate_g.w[0][j] * gru.resets[i].w[0][j];
13143             reset_g.w[0][j] = resetstate_g.w[0][j] * gru.states[i].w[0][j];
13144 
13145             update_g.w[0][j] *= gru.updates[i].w[0][j] * (1 - gru.updates[i].w[0][j]);
13146             reset_g.w[0][j] *= gru.resets[i].w[0][j] * (1 - gru.resets[i].w[0][j]);
13147 
13148             gru.X_z.b_g[j] += update_g.w[0][j];
13149             gru.X_r.b_g[j] += reset_g.w[0][j];
13150             for (int k = 0; k < D; k++) {
13151               gru.X_z.w_g[j][k] += update_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k];
13152               gru.H_z.w_g[j][k] += update_g.w[0][j] * gru.states[i].w[0][k];
13153               gru.X_r.w_g[j][k] += reset_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k];
13154               gru.H_r.w_g[j][k] += reset_g.w[0][j] * gru.states[i].w[0][k];
13155               embedding->w_g[0][k] += embedding_dropout.w[0][k] * (update_g.w[0][j] * gru.X_z.original.w[j][k] +
13156                                                                    reset_g.w[0][j] * gru.X_r.original.w[j][k]);
13157               state_g.w[0][k] += update_g.w[0][j] * gru.H_z.original.w[j][k] + reset_g.w[0][j] * gru.H_r.original.w[j][k];
13158             }
13159           }
13160         }
13161       }
13162 
13163       // Update the weights
13164       if (batch_size == 1 ||
13165           instance+1 == instances ||
13166           (instance+1) % batch_size == 0) {
13167         b1t *= 0.9f;
13168         b2t *= 0.999f;
13169         float learning_rate_biased = learning_rate * sqrt(1-b2t) / (1-b1t);
13170 
13171         if (batch_size == 1)
13172           for (auto&& chosen_embedding : chosen_embeddings)
13173             chosen_embedding->update_weights(learning_rate_biased);
13174         else
13175           for (auto&& embedding : embeddings)
13176             embedding.second.update_weights(learning_rate_biased);
13177         gru_fwd.update_weights(learning_rate_biased);
13178         gru_bwd.update_weights(learning_rate_biased);
13179         projection_fwd.update_weights(learning_rate_biased);
13180         projection_bwd.update_weights(learning_rate_biased);
13181       }
13182     }
13183     if (learning_rate_final && learning_rate_final != learning_rate_initial)
13184       learning_rate = exp(((epochs - epoch - 2) * log(learning_rate_initial) + (epoch + 1) * log(learning_rate_final)) / (epochs - 1));
13185 
13186     // Evaluate
13187     cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob
13188          << ", training acc: " << fixed << setprecision(2) << 100. * correct / double(total) << "%";
13189     if (!heldout.empty()) {
13190       f1_info tokens, sentences;
13191       evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences);
13192       cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/"
13193            << 100. * tokens.f1 << "%, sentences: " << 100. * sentences.precision << "%P/"
13194            << 100. * sentences.recall << "%R/" << 100. * sentences.f1 << "%";
13195 
13196       if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) {
13197         best_combined_f1 = sentences.f1 + tokens.f1;
13198         best_combined_f1_epoch = epoch;
13199         best_combined_f1_network = *this;
13200       }
13201       if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) {
13202         cerr << endl << "Stopping after 30 iterations of not improving sum of sentence and token f1." << endl;
13203         break;
13204       }
13205     }
13206     cerr << endl;
13207   }
13208 
13209   // Choose best network if desired
13210   if (early_stopping && best_combined_f1) {
13211     cerr << "Choosing parameters from epoch " << best_combined_f1_epoch+1 << "." << endl;
13212     this->embeddings = best_combined_f1_network.embeddings;
13213     this->gru_fwd = best_combined_f1_network.gru_fwd;
13214     this->gru_bwd = best_combined_f1_network.gru_bwd;
13215     this->projection_fwd = best_combined_f1_network.projection_fwd;
13216     this->projection_bwd = best_combined_f1_network.projection_bwd;
13217   }
13218 
13219   // Encode the network
13220   enc.add_1B(1);
13221   enc.add_1B(D);
13222 
13223   enc.add_4B(this->embeddings.size());
13224   for (auto&& embedding : this->embeddings) {
13225     enc.add_4B(embedding.first);
13226     enc.add_data(embedding.second.e.w[0], D);
13227   }
13228   save_gru(this->gru_fwd, enc);
13229   save_gru(this->gru_bwd, enc);
13230   save_matrix(this->projection_fwd, enc);
13231   save_matrix(this->projection_bwd, enc);
13232 
13233   return true;
13234 }
13235 
13236 template <int D> template <int R, int C>
update_weights(float learning_rate)13237 void gru_tokenizer_network_trainer<D>::matrix_trainer<R, C>::update_weights(float learning_rate) {
13238   for (int i = 0; i < R; i++) {
13239     for (int j = 0; j < C; j++) {
13240       w_m[i][j] = 0.9 * w_m[i][j] + (1-0.9) * w_g[i][j];
13241       w_v[i][j] = 0.999 * w_v[i][j] + (1-0.999) * w_g[i][j] * w_g[i][j];
13242       original.w[i][j] += learning_rate * w_m[i][j] / (sqrt(w_v[i][j]) + 1e-8);
13243     }
13244     b_m[i] = 0.9 * b_m[i] + (1-0.9) * b_g[i];
13245     b_v[i] = 0.999 * b_v[i] + (1-0.999) * b_g[i] * b_g[i];
13246     original.b[i] += learning_rate * b_m[i] / (sqrt(b_v[i]) + 1e-8);
13247   }
13248 
13249   for (int i = 0; i < R; i++) {
13250     for (int j = 0; j < C; j++)
13251       w_g[i][j] = 0.f;
13252     b_g[i] = 0.f;
13253   }
13254 }
13255 
13256 template <int D>
update_weights(float learning_rate)13257 void gru_tokenizer_network_trainer<D>::gru_trainer::update_weights(float learning_rate) {
13258   X.update_weights(learning_rate);
13259   X_r.update_weights(learning_rate);
13260   X_z.update_weights(learning_rate);
13261   H.update_weights(learning_rate);
13262   H_r.update_weights(learning_rate);
13263   H_z.update_weights(learning_rate);
13264 }
13265 
13266 template <int D>
evaluate(unsigned url_email_tokenizer,unsigned segment,bool allow_spaces,const vector<tokenized_sentence> & heldout,f1_info & tokens_f1,f1_info & sentences_f1)13267 void gru_tokenizer_network_trainer<D>::evaluate(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const vector<tokenized_sentence>& heldout,
13268                                                 f1_info& tokens_f1, f1_info& sentences_f1) {
13269   // Generate gold data
13270   vector<token_range> gold_sentences, gold_tokens;
13271   u32string text;
13272   for (auto&& sentence : heldout) {
13273     if (sentence.tokens.empty()) continue;
13274 
13275     gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start);
13276     for (auto&& token : sentence.tokens)
13277       gold_tokens.emplace_back(text.size() + token.start, token.length);
13278     text.append(sentence.sentence);
13279   }
13280 
13281   // Generate system data
13282   vector<token_range> system_sentences, system_tokens, tokens;
13283   string text_utf8;
13284 
13285   this->cache_embeddings();
13286   gru_tokenizer tokenizer(url_email_tokenizer, segment, allow_spaces, *this);
13287   unilib::utf8::encode(text, text_utf8);
13288   tokenizer.set_text(text_utf8);
13289 
13290   while (tokenizer.next_sentence(tokens))
13291     if (!tokens.empty()) {
13292       system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start);
13293       system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end());
13294     }
13295 
13296   evaluate_f1(system_tokens, gold_tokens, tokens_f1);
13297   evaluate_f1(system_sentences, gold_sentences, sentences_f1);
13298 }
13299 
13300 template <int D>
evaluate_f1(const vector<token_range> & system,const vector<token_range> & gold,f1_info & f1)13301 void gru_tokenizer_network_trainer<D>::evaluate_f1(const vector<token_range>& system, const vector<token_range>& gold, f1_info& f1) {
13302   size_t both = 0;
13303   for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
13304     if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
13305       si++;
13306     else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
13307       gi++;
13308     else
13309       both += system[si++].length == gold[gi++].length;
13310 
13311   f1.precision = system.size() ? both / double(system.size()) : 0.;
13312   f1.recall = gold.size() ? both / double(gold.size()) : 0.;
13313   f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.;
13314 }
13315 
13316 template <int D> template <int R, int C>
random_matrix(matrix<R,C> & m,mt19937 & generator,float range,float bias)13317 void gru_tokenizer_network_trainer<D>::random_matrix(matrix<R,C>& m, mt19937& generator, float range, float bias) {
13318   uniform_real_distribution<float> uniform(-range, range);
13319   for (int i = 0; i < R; i++) {
13320     m.b[i] = bias;
13321     for (int j = 0; j < C; j++)
13322       m.w[i][j] = uniform(generator);
13323   }
13324 }
13325 
template <int D>
void gru_tokenizer_network_trainer<D>::random_gru(gru& g, mt19937& generator, float range) {
  // Randomly initialize all six GRU matrices. The gate matrices
  // (X_r/X_z/H_r/H_z) receive bias 1 so the reset and update gates start
  // mostly open; the candidate matrices (X/H) receive bias 0.
  // NOTE: the call order is significant — it fixes how the shared random
  // stream of `generator` is consumed, so do not reorder.
  random_matrix(g.X, generator, range, 0.f);
  random_matrix(g.X_r, generator, range, 1.f);
  random_matrix(g.X_z, generator, range, 1.f);
  random_matrix(g.H, generator, range, 0.f);
  random_matrix(g.H_r, generator, range, 1.f);
  random_matrix(g.H_z, generator, range, 1.f);
}
13335 
13336 template <int D> template <int R, int C>
save_matrix(const matrix<R,C> & m,binary_encoder & enc)13337 void gru_tokenizer_network_trainer<D>::save_matrix(const matrix<R,C>& m, binary_encoder& enc) {
13338   for (int i = 0; i < R; i++)
13339     enc.add_data(m.w[i], C);
13340   enc.add_data(m.b, R);
13341 }
13342 
template <int D>
void gru_tokenizer_network_trainer<D>::save_gru(const gru& g, binary_encoder& enc) {
  // Serialize all six GRU matrices. The order below is the wire format —
  // it must match the decoding order in the tokenizer runtime, so do not
  // reorder these calls.
  save_matrix(g.X, enc);
  save_matrix(g.X_r, enc);
  save_matrix(g.X_z, enc);
  save_matrix(g.H, enc);
  save_matrix(g.H_r, enc);
  save_matrix(g.H_z, enc);
}
13352 
13353 } // namespace morphodita
13354 
13355 /////////
13356 // File: morphodita/tokenizer/gru_tokenizer_trainer.cpp
13357 /////////
13358 
13359 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
13360 //
13361 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
13362 // Mathematics and Physics, Charles University in Prague, Czech Republic.
13363 //
13364 // This Source Code Form is subject to the terms of the Mozilla Public
13365 // License, v. 2.0. If a copy of the MPL was not distributed with this
13366 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
13367 
13368 namespace morphodita {
13369 
train(unsigned url_email_tokenizer,unsigned segment,bool allow_spaces,unsigned dimension,unsigned epochs,unsigned batch_size,float learning_rate,float learning_rate_final,float dropout,float initialization_range,bool early_stopping,const vector<tokenized_sentence> & data,const vector<tokenized_sentence> & heldout,ostream & os,string & error)13370 bool gru_tokenizer_trainer::train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned dimension, unsigned epochs,
13371                                   unsigned batch_size, float learning_rate, float learning_rate_final, float dropout,
13372                                   float initialization_range, bool early_stopping, const vector<tokenized_sentence>& data,
13373                                   const vector<tokenized_sentence>& heldout, ostream& os, string& error) {
13374   using namespace unilib;
13375 
13376   error.clear();
13377 
13378   // Start encoding the tokenizer
13379   os.put(2);
13380 
13381   binary_encoder enc;
13382   enc.add_1B(url_email_tokenizer);
13383   enc.add_2B(segment);
13384   enc.add_1B(allow_spaces);
13385 
13386   // Train the GRU network
13387   if (dimension == 16) {
13388     gru_tokenizer_network_trainer<16> network;
13389     if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
13390                        dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false;
13391   } else if (dimension == 24) {
13392     gru_tokenizer_network_trainer<24> network;
13393     if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
13394                        dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false;
13395   } else if (dimension == 64) {
13396     gru_tokenizer_network_trainer<64> network;
13397     if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
13398                        dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false;
13399   } else {
13400     return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false;
13401   }
13402 
13403   // Compute best substitutions for every category
13404   unordered_map<unicode::category_t, unordered_map<char32_t, unsigned>> counts;
13405   for (auto&& sentence : data)
13406     for (auto&& chr : sentence.sentence)
13407       counts[unicode::category(chr)][chr]++;
13408 
13409   unordered_map<unicode::category_t, char32_t> unknown_chars;
13410   for (auto&& count : counts) {
13411     char32_t best_chr = 0;
13412     unsigned best = 0;
13413     for (auto&& chr : count.second)
13414       if (chr.second > best)
13415         best = chr.second, best_chr = chr.first;
13416     if (best_chr)
13417       unknown_chars.emplace(count.first, best_chr);
13418   }
13419   enc.add_1B(unknown_chars.size());
13420   for (auto&& unknown_char : unknown_chars) {
13421     enc.add_4B(unknown_char.first);
13422     enc.add_4B(unknown_char.second);
13423   }
13424 
13425   if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false;
13426   return true;
13427 }
13428 
13429 } // namespace morphodita
13430 
13431 /////////
13432 // File: morphodita/tokenizer/ragel_tokenizer.cpp
13433 /////////
13434 
13435 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
13436 //
13437 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
13438 // Mathematics and Physics, Charles University in Prague, Czech Republic.
13439 //
13440 // This Source Code Form is subject to the terms of the Mozilla Public
13441 // License, v. 2.0. If a copy of the MPL was not distributed with this
13442 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
13443 
13444 namespace morphodita {
13445 
// Tables below were machine-generated by Ragel from the url/email state
// machine definition — do not edit by hand; regenerate from the .rl source.
static const char _ragel_url_email_cond_offsets[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 2, 3, 3, 4, 5,
	6, 7, 8, 9, 10, 11, 12, 13,
	14, 15, 16
};
13458 
// Ragel-generated: number of condition keys per state — do not edit by hand.
static const char _ragel_url_email_cond_lengths[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 1, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 1, 1, 0, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1
};
13471 
// Ragel-generated: condition key ranges and their condition-space indices —
// do not edit by hand.
static const short _ragel_url_email_cond_keys[] = {
	41u, 41u, 47u, 47u, 47u, 47u, 41u, 41u,
	47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u,
	47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u,
	47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u,
	47u, 47u, 0
};

static const char _ragel_url_email_cond_spaces[] = {
	1, 0, 0, 1, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0
};
13485 
// Ragel-generated: per-state offsets into the transition key table — do not
// edit by hand.
static const short _ragel_url_email_key_offsets[] = {
	0, 0, 15, 29, 41, 54, 63, 71,
	78, 86, 92, 100, 117, 145, 154, 162,
	171, 179, 188, 196, 204, 215, 225, 233,
	241, 252, 262, 270, 278, 289, 299, 315,
	330, 346, 360, 376, 393, 409, 426, 442,
	459, 475, 491, 510, 528, 544, 560, 579,
	597, 613, 629, 648, 666, 682, 698, 714,
	725, 726, 741, 752, 756, 773, 801, 812,
	823, 834, 848, 861, 879, 893, 908, 926,
	944, 962, 983
};
13498 
// Ragel-generated: transition keys (single keys first, then ranges, per
// state) — do not edit by hand.
static const short _ragel_url_email_trans_keys[] = {
	33u, 48u, 49u, 50u, 95u, 36u, 37u, 39u,
	46u, 51u, 57u, 65u, 90u, 97u, 122u, 33u,
	58u, 64u, 95u, 36u, 37u, 39u, 46u, 48u,
	57u, 65u, 90u, 97u, 122u, 33u, 95u, 36u,
	37u, 39u, 46u, 48u, 57u, 65u, 90u, 97u,
	122u, 33u, 64u, 95u, 36u, 37u, 39u, 46u,
	48u, 57u, 65u, 90u, 97u, 122u, 48u, 49u,
	50u, 51u, 57u, 65u, 90u, 97u, 122u, 45u,
	46u, 48u, 57u, 65u, 90u, 97u, 122u, 45u,
	48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
	48u, 57u, 65u, 90u, 97u, 122u, 48u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u,
	65u, 90u, 97u, 122u, 33u, 39u, 41u, 61u,
	95u, 36u, 47u, 48u, 57u, 58u, 59u, 63u,
	64u, 65u, 90u, 97u, 122u, 33u, 39u, 40u,
	44u, 46u, 61u, 63u, 95u, 129u, 131u, 135u,
	151u, 809u, 1065u, 36u, 38u, 42u, 57u, 58u,
	59u, 64u, 90u, 97u, 122u, 142u, 143u, 155u,
	159u, 48u, 49u, 50u, 51u, 57u, 65u, 90u,
	97u, 122u, 45u, 46u, 48u, 57u, 65u, 90u,
	97u, 122u, 48u, 49u, 50u, 51u, 57u, 65u,
	90u, 97u, 122u, 45u, 46u, 48u, 57u, 65u,
	90u, 97u, 122u, 48u, 49u, 50u, 51u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 53u, 48u,
	52u, 54u, 57u, 65u, 90u, 97u, 122u, 45u,
	46u, 48u, 53u, 54u, 57u, 65u, 90u, 97u,
	122u, 45u, 46u, 48u, 57u, 65u, 90u, 97u,
	122u, 45u, 46u, 48u, 57u, 65u, 90u, 97u,
	122u, 45u, 46u, 53u, 48u, 52u, 54u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 48u, 53u,
	54u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
	48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
	48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
	53u, 48u, 52u, 54u, 57u, 65u, 90u, 97u,
	122u, 45u, 46u, 48u, 53u, 54u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
	95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 58u, 64u, 95u,
	36u, 37u, 39u, 46u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 58u, 64u, 95u, 36u, 37u,
	39u, 46u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
	39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 48u, 49u, 50u, 58u, 64u, 95u, 36u,
	37u, 39u, 46u, 51u, 57u, 65u, 90u, 97u,
	122u, 33u, 45u, 46u, 58u, 64u, 95u, 36u,
	37u, 39u, 44u, 48u, 57u, 65u, 90u, 97u,
	122u, 33u, 48u, 49u, 50u, 58u, 64u, 95u,
	36u, 37u, 39u, 46u, 51u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 48u, 49u, 50u, 58u, 64u,
	95u, 36u, 37u, 39u, 46u, 51u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
	95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
	95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 53u, 58u,
	64u, 95u, 36u, 37u, 39u, 44u, 48u, 52u,
	54u, 57u, 65u, 90u, 97u, 122u, 33u, 45u,
	46u, 58u, 64u, 95u, 36u, 37u, 39u, 44u,
	48u, 53u, 54u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
	39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
	39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 53u, 58u, 64u, 95u, 36u,
	37u, 39u, 44u, 48u, 52u, 54u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
	95u, 36u, 37u, 39u, 44u, 48u, 53u, 54u,
	57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
	58u, 64u, 95u, 36u, 37u, 39u, 44u, 48u,
	57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
	58u, 64u, 95u, 36u, 37u, 39u, 44u, 48u,
	57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
	53u, 58u, 64u, 95u, 36u, 37u, 39u, 44u,
	48u, 52u, 54u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
	39u, 44u, 48u, 53u, 54u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 47u, 95u, 36u, 37u, 39u,
	57u, 65u, 90u, 97u, 122u, 47u, 33u, 48u,
	49u, 50u, 95u, 36u, 37u, 39u, 46u, 51u,
	57u, 65u, 90u, 97u, 122u, 45u, 46u, 58u,
	303u, 559u, 48u, 57u, 65u, 90u, 97u, 122u,
	303u, 559u, 48u, 57u, 33u, 39u, 41u, 61u,
	95u, 36u, 47u, 48u, 57u, 58u, 59u, 63u,
	64u, 65u, 90u, 97u, 122u, 33u, 39u, 40u,
	44u, 46u, 61u, 63u, 95u, 129u, 131u, 135u,
	151u, 809u, 1065u, 36u, 38u, 42u, 57u, 58u,
	59u, 64u, 90u, 97u, 122u, 142u, 143u, 155u,
	159u, 45u, 46u, 58u, 303u, 559u, 48u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 58u, 303u,
	559u, 48u, 57u, 65u, 90u, 97u, 122u, 45u,
	46u, 58u, 303u, 559u, 48u, 57u, 65u, 90u,
	97u, 122u, 45u, 46u, 53u, 58u, 303u, 559u,
	48u, 52u, 54u, 57u, 65u, 90u, 97u, 122u,
	45u, 46u, 58u, 303u, 559u, 48u, 53u, 54u,
	57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
	58u, 64u, 95u, 303u, 559u, 36u, 37u, 39u,
	44u, 48u, 57u, 65u, 90u, 97u, 122u, 33u,
	95u, 303u, 559u, 36u, 37u, 39u, 46u, 48u,
	57u, 65u, 90u, 97u, 122u, 33u, 64u, 95u,
	303u, 559u, 36u, 37u, 39u, 46u, 48u, 57u,
	65u, 90u, 97u, 122u, 33u, 45u, 46u, 58u,
	64u, 95u, 303u, 559u, 36u, 37u, 39u, 44u,
	48u, 57u, 65u, 90u, 97u, 122u, 33u, 45u,
	46u, 58u, 64u, 95u, 303u, 559u, 36u, 37u,
	39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 303u, 559u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 53u, 58u, 64u,
	95u, 303u, 559u, 36u, 37u, 39u, 44u, 48u,
	52u, 54u, 57u, 65u, 90u, 97u, 122u, 33u,
	45u, 46u, 58u, 64u, 95u, 303u, 559u, 36u,
	37u, 39u, 44u, 48u, 53u, 54u, 57u, 65u,
	90u, 97u, 122u, 0
};
13627 
// Ragel-generated: per-state counts of single transition keys — do not edit
// by hand.
static const char _ragel_url_email_single_lengths[] = {
	0, 5, 4, 2, 3, 3, 2, 1,
	2, 0, 2, 5, 14, 3, 2, 3,
	2, 3, 2, 2, 3, 2, 2, 2,
	3, 2, 2, 2, 3, 2, 6, 5,
	6, 4, 6, 7, 6, 7, 6, 7,
	6, 6, 7, 6, 6, 6, 7, 6,
	6, 6, 7, 6, 6, 6, 6, 3,
	1, 5, 5, 2, 5, 14, 5, 5,
	5, 6, 5, 8, 4, 5, 8, 8,
	8, 9, 8
};

// Ragel-generated: per-state counts of ranged transition keys — do not edit
// by hand.
static const char _ragel_url_email_range_lengths[] = {
	0, 5, 5, 5, 5, 3, 3, 3,
	3, 3, 3, 6, 7, 3, 3, 3,
	3, 3, 3, 3, 4, 4, 3, 3,
	4, 4, 3, 3, 4, 4, 5, 5,
	5, 5, 5, 5, 5, 5, 5, 5,
	5, 5, 6, 6, 5, 5, 6, 6,
	5, 5, 6, 6, 5, 5, 5, 4,
	0, 5, 3, 1, 6, 7, 3, 3,
	3, 4, 4, 5, 5, 5, 5, 5,
	5, 6, 6
};
13653 
// Ragel-generated: per-state offsets into the transition index table — do
// not edit by hand.
static const short _ragel_url_email_index_offsets[] = {
	0, 0, 11, 21, 29, 38, 45, 51,
	56, 62, 66, 72, 84, 106, 113, 119,
	126, 132, 139, 145, 151, 159, 166, 172,
	178, 186, 193, 199, 205, 213, 220, 232,
	243, 255, 265, 277, 290, 302, 315, 327,
	340, 352, 364, 378, 391, 403, 415, 429,
	442, 454, 466, 480, 493, 505, 517, 529,
	537, 539, 550, 559, 563, 575, 597, 606,
	615, 624, 635, 645, 659, 669, 680, 694,
	708, 722, 738
};
13666 
// Ragel-generated: transition indices mapping matched keys to entries of
// the target/action tables — do not edit by hand.
static const char _ragel_url_email_indicies[] = {
	0, 2, 3, 4, 0, 0, 0, 5,
	6, 6, 1, 0, 7, 8, 0, 0,
	0, 0, 0, 0, 1, 9, 9, 9,
	9, 9, 9, 9, 1, 9, 8, 9,
	9, 9, 9, 9, 9, 1, 10, 11,
	12, 13, 14, 14, 1, 15, 16, 14,
	14, 14, 1, 15, 14, 14, 14, 1,
	15, 17, 14, 14, 14, 1, 14, 18,
	18, 1, 15, 17, 14, 19, 19, 1,
	20, 21, 21, 20, 20, 20, 21, 20,
	20, 21, 21, 1, 22, 22, 24, 22,
	22, 23, 22, 23, 23, 23, 23, 23,
	25, 26, 23, 23, 22, 23, 23, 23,
	23, 1, 27, 28, 29, 30, 18, 18,
	1, 15, 31, 14, 14, 14, 1, 32,
	33, 34, 35, 18, 18, 1, 15, 36,
	14, 14, 14, 1, 37, 38, 39, 40,
	18, 18, 1, 15, 36, 35, 14, 14,
	1, 15, 36, 32, 14, 14, 1, 15,
	36, 41, 35, 32, 14, 14, 1, 15,
	36, 32, 14, 14, 14, 1, 15, 31,
	30, 14, 14, 1, 15, 31, 27, 14,
	14, 1, 15, 31, 42, 30, 27, 14,
	14, 1, 15, 31, 27, 14, 14, 14,
	1, 15, 16, 13, 14, 14, 1, 15,
	16, 10, 14, 14, 1, 15, 16, 43,
	13, 10, 14, 14, 1, 15, 16, 10,
	14, 14, 14, 1, 0, 44, 45, 7,
	8, 0, 0, 0, 46, 46, 46, 1,
	0, 44, 7, 8, 0, 0, 0, 46,
	46, 46, 1, 0, 44, 47, 7, 8,
	0, 0, 0, 46, 46, 46, 1, 0,
	7, 8, 0, 0, 0, 46, 48, 48,
	1, 0, 44, 47, 7, 8, 0, 0,
	0, 46, 49, 49, 1, 0, 50, 51,
	52, 7, 8, 0, 0, 0, 53, 48,
	48, 1, 0, 44, 54, 7, 8, 0,
	0, 0, 46, 46, 46, 1, 0, 55,
	56, 57, 7, 8, 0, 0, 0, 58,
	48, 48, 1, 0, 44, 59, 7, 8,
	0, 0, 0, 46, 46, 46, 1, 0,
	60, 61, 62, 7, 8, 0, 0, 0,
	63, 48, 48, 1, 0, 44, 59, 7,
	8, 0, 0, 0, 58, 46, 46, 1,
	0, 44, 59, 7, 8, 0, 0, 0,
	55, 46, 46, 1, 0, 44, 59, 64,
	7, 8, 0, 0, 0, 58, 55, 46,
	46, 1, 0, 44, 59, 7, 8, 0,
	0, 0, 55, 46, 46, 46, 1, 0,
	44, 54, 7, 8, 0, 0, 0, 53,
	46, 46, 1, 0, 44, 54, 7, 8,
	0, 0, 0, 50, 46, 46, 1, 0,
	44, 54, 65, 7, 8, 0, 0, 0,
	53, 50, 46, 46, 1, 0, 44, 54,
	7, 8, 0, 0, 0, 50, 46, 46,
	46, 1, 0, 44, 45, 7, 8, 0,
	0, 0, 5, 46, 46, 1, 0, 44,
	45, 7, 8, 0, 0, 0, 2, 46,
	46, 1, 0, 44, 45, 66, 7, 8,
	0, 0, 0, 5, 2, 46, 46, 1,
	0, 44, 45, 7, 8, 0, 0, 0,
	2, 46, 46, 46, 1, 0, 44, 47,
	7, 8, 0, 0, 0, 46, 67, 67,
	1, 0, 44, 47, 7, 8, 0, 0,
	0, 46, 68, 68, 1, 0, 44, 47,
	69, 8, 0, 0, 0, 46, 68, 68,
	1, 9, 70, 9, 9, 9, 9, 9,
	1, 71, 1, 0, 2, 3, 4, 0,
	0, 0, 5, 46, 46, 1, 15, 17,
	72, 21, 23, 14, 19, 19, 1, 21,
	23, 72, 1, 20, 21, 21, 20, 20,
	20, 21, 20, 20, 21, 21, 1, 22,
	22, 24, 22, 22, 23, 22, 23, 23,
	23, 23, 23, 25, 26, 23, 23, 22,
	23, 23, 23, 23, 1, 15, 17, 72,
	21, 23, 14, 14, 14, 1, 15, 17,
	72, 21, 23, 40, 14, 14, 1, 15,
	17, 72, 21, 23, 37, 14, 14, 1,
	15, 17, 73, 72, 21, 23, 40, 37,
	14, 14, 1, 15, 17, 72, 21, 23,
	37, 14, 14, 14, 1, 0, 44, 47,
	74, 8, 0, 21, 23, 0, 0, 46,
	49, 49, 1, 9, 9, 21, 23, 9,
	9, 75, 9, 9, 1, 9, 8, 9,
	21, 23, 9, 9, 75, 9, 9, 1,
	0, 44, 47, 74, 8, 0, 21, 23,
	0, 0, 46, 46, 46, 1, 0, 44,
	47, 74, 8, 0, 21, 23, 0, 0,
	63, 46, 46, 1, 0, 44, 47, 74,
	8, 0, 21, 23, 0, 0, 60, 46,
	46, 1, 0, 44, 47, 76, 74, 8,
	0, 21, 23, 0, 0, 63, 60, 46,
	46, 1, 0, 44, 47, 74, 8, 0,
	21, 23, 0, 0, 60, 46, 46, 46,
	1, 0
};
13764 
// Ragel-generated: target state for each transition index — do not edit by
// hand.
static const char _ragel_url_email_trans_targs[] = {
	2, 0, 30, 48, 50, 49, 52, 3,
	5, 4, 6, 26, 28, 27, 8, 7,
	13, 9, 10, 58, 11, 60, 12, 61,
	61, 12, 61, 14, 22, 24, 23, 15,
	16, 18, 20, 19, 17, 62, 63, 65,
	64, 21, 25, 29, 31, 35, 32, 33,
	34, 67, 36, 44, 46, 45, 37, 38,
	40, 42, 41, 39, 70, 71, 73, 72,
	43, 47, 51, 53, 54, 55, 56, 57,
	59, 66, 68, 69, 74
};

// Ragel-generated: action index for each transition — do not edit by hand.
static const char _ragel_url_email_trans_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 1, 0, 1, 0, 1,
	2, 3, 4, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 1, 1, 1,
	1, 0, 0, 0, 0, 0, 0, 0,
	0, 1, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 1, 1, 1, 1,
	0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1
};
13790 
// Ragel-generated: initial state of the url/email automaton.
static const int ragel_url_email_start = 1;

// Shared Unicode->Ragel alphabet mapping and the spin-lock flag guarding its
// lazy construction (see initialize_ragel_map).
vector<uint8_t> ragel_tokenizer::ragel_map;
atomic_flag ragel_tokenizer::ragel_map_flag = ATOMIC_FLAG_INIT;
13795 
// Construct a tokenizer with the given url/email recognition version
// (forwarded to unicode_tokenizer) and make sure the shared
// Unicode->Ragel alphabet mapping has been built.
ragel_tokenizer::ragel_tokenizer(unsigned url_email_tokenizer) : unicode_tokenizer(url_email_tokenizer) {
  initialize_ragel_map();
}
13799 
// Lazily build ragel_map, which maps Unicode code points to the byte
// alphabet of the generated Ragel automaton. Guarded by a spin lock on
// ragel_map_flag so concurrent tokenizer constructions are safe; the
// critical section is short (runs the fill at most once).
void ragel_tokenizer::initialize_ragel_map() {
  while (ragel_map_flag.test_and_set()) {}  // spin until we own the flag
  if (ragel_map.empty()) {
    // ASCII code points map to themselves.
    for (uint8_t ascii = 0; ascii < 128; ascii++)
      ragel_map.push_back(ascii);

    // Selected non-ASCII punctuation gets dedicated alphabet bytes.
    ragel_map_add(U'\u2026', 160); // horizontal ellipsis (TRIPLE DOT)
    ragel_map_add(U'\u2019', 161); // right single quotation mark
    ragel_map_add(U'\u2018', 162); // left single quotation mark
    ragel_map_add(U'\u2010', 163); // hyphen
  }
  ragel_map_flag.clear();
}
13813 
ragel_map_add(char32_t chr,uint8_t mapping)13814 void ragel_tokenizer::ragel_map_add(char32_t chr, uint8_t mapping) {
13815   if (chr >= ragel_map.size())
13816     ragel_map.resize(chr + 1, 128);
13817   ragel_map[chr] = mapping;
13818 }
13819 
// Try to recognize a URL or an e-mail address starting at chars[current].
// The body of this method is Ragel machine-generated code driving a
// table-based finite-state machine; do not edit it by hand.
// - version selects grammar variants through FSM conditions (condition 0),
// - parens tracks unmatched opening parentheses inside the match (condition 1),
// - end is advanced by FSM actions to just past the last accepted character.
// On success the matched range is appended to tokens, current is moved past
// the match and true is returned; otherwise current is restored and false
// is returned.
bool ragel_tokenizer::ragel_url_email(unsigned version, const vector<char_info>& chars, size_t& current, vector<token_range>& tokens) {
  int cs;  // current FSM state

  size_t start = current, end = current, parens = 0;

	{
	cs = ragel_url_email_start;
	}

	{
	int _klen;
	const short *_keys;
	int _trans;
	short _widec;

	// The last element of chars is a sentinel, so input ends at size() - 1.
	if ( ( current) == ( (chars.size() - 1)) )
		goto _test_eof;
	if ( cs == 0 )
		goto _out;
_resume:
	// Widen the current character and evaluate FSM conditions: when a
	// condition key range matches, the character is shifted into a separate
	// 256-wide band (plus another 256 when the condition holds).
	_widec = ( ragel_char(chars[current]));
	_klen = _ragel_url_email_cond_lengths[cs];
	_keys = _ragel_url_email_cond_keys + (_ragel_url_email_cond_offsets[cs]*2);
	if ( _klen > 0 ) {
		// Binary search over condition key ranges (pairs of shorts).
		const short *_lower = _keys;
		const short *_mid;
		const short *_upper = _keys + (_klen<<1) - 2;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
			if ( _widec < _mid[0] )
				_upper = _mid - 2;
			else if ( _widec > _mid[1] )
				_lower = _mid + 2;
			else {
				switch ( _ragel_url_email_cond_spaces[_ragel_url_email_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
	case 0: {
		// Condition 0: active when the tokenizer version is at least 2.
		_widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
		if (
 version >= 2  ) _widec += 256;
		break;
	}
	case 1: {
		// Condition 1: active while inside unmatched parentheses.
		_widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
		if (
parens ) _widec += 256;
		break;
	}
				}
				break;
			}
		}
	}

	// Locate the transition for (state, widened character): first a binary
	// search of the single keys, then of the key ranges.
	_keys = _ragel_url_email_trans_keys + _ragel_url_email_key_offsets[cs];
	_trans = _ragel_url_email_index_offsets[cs];

	_klen = _ragel_url_email_single_lengths[cs];
	if ( _klen > 0 ) {
		const short *_lower = _keys;
		const short *_mid;
		const short *_upper = _keys + _klen - 1;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + ((_upper-_lower) >> 1);
			if ( _widec < *_mid )
				_upper = _mid - 1;
			else if ( _widec > *_mid )
				_lower = _mid + 1;
			else {
				_trans += (unsigned int)(_mid - _keys);
				goto _match;
			}
		}
		_keys += _klen;
		_trans += _klen;
	}

	_klen = _ragel_url_email_range_lengths[cs];
	if ( _klen > 0 ) {
		const short *_lower = _keys;
		const short *_mid;
		const short *_upper = _keys + (_klen<<1) - 2;
		while (1) {
			if ( _upper < _lower )
				break;

			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
			if ( _widec < _mid[0] )
				_upper = _mid - 2;
			else if ( _widec > _mid[1] )
				_lower = _mid + 2;
			else {
				_trans += (unsigned int)((_mid - _keys)>>1);
				goto _match;
			}
		}
		_trans += _klen;
	}

_match:
	// Perform the transition and execute its actions, if any.
	_trans = _ragel_url_email_indicies[_trans];
	cs = _ragel_url_email_trans_targs[_trans];

	if ( _ragel_url_email_trans_actions[_trans] == 0 )
		goto _again;

	switch ( _ragel_url_email_trans_actions[_trans] ) {
	case 3:
	// Close a parenthesis (saturating at zero).
	{parens-=!!parens;}
	break;
	case 1:
	// Extend the accepted match past the current character.
	{ end = current + 1; }
	break;
	case 2:
	// Open a parenthesis and extend the accepted match.
	{parens++;}
	{ end = current + 1; }
	break;
	case 4:
	// Close a parenthesis and extend the accepted match.
	{parens-=!!parens;}
	{ end = current + 1; }
	break;
	}

_again:
	if ( cs == 0 )
		goto _out;
	if ( ++( current) != ( (chars.size() - 1)) )
		goto _resume;
	_test_eof: {}
	_out: {}
	}

  // Report the longest accepted match, if any; otherwise restore current.
  if (end > start) {
    tokens.emplace_back(start, end - start);
    current = end;
    return true;
  } else {
    current = start;
    return false;
  }
}
13966 
13967 } // namespace morphodita
13968 
13969 /////////
13970 // File: morphodita/tokenizer/vertical_tokenizer.h
13971 /////////
13972 
13973 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
13974 //
13975 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
13976 // Mathematics and Physics, Charles University in Prague, Czech Republic.
13977 //
13978 // This Source Code Form is subject to the terms of the Mozilla Public
13979 // License, v. 2.0. If a copy of the MPL was not distributed with this
13980 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
13981 
13982 namespace morphodita {
13983 
// Tokenizer for the "vertical" input format: one token per line, sentences
// separated by empty lines. URL/e-mail recognition is disabled (the
// unicode_tokenizer base is constructed with url_email_tokenizer == 0).
class vertical_tokenizer : public unicode_tokenizer {
 public:
  vertical_tokenizer() : unicode_tokenizer(0) {}

  // Read the next sentence, one token per input line; see the .cpp file.
  virtual bool next_sentence(vector<token_range>& tokens) override;
};
13990 
13991 } // namespace morphodita
13992 
13993 /////////
13994 // File: morphodita/tokenizer/tokenizer.cpp
13995 /////////
13996 
13997 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
13998 //
13999 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14000 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14001 //
14002 // This Source Code Form is subject to the terms of the Mozilla Public
14003 // License, v. 2.0. If a copy of the MPL was not distributed with this
14004 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14005 
14006 namespace morphodita {
14007 
// Factory method: create a tokenizer for the vertical format (one token per
// line, empty line ends a sentence). Caller owns the returned pointer.
tokenizer* tokenizer::new_vertical_tokenizer() {
  return new vertical_tokenizer();
}
14011 
// Factory method: create the latest-version Czech tokenizer. Caller owns the
// returned pointer.
tokenizer* tokenizer::new_czech_tokenizer() {
  return new czech_tokenizer(czech_tokenizer::CZECH, czech_tokenizer::LATEST);
}
14015 
// Factory method: create the latest-version English tokenizer. Caller owns
// the returned pointer.
tokenizer* tokenizer::new_english_tokenizer() {
  return new english_tokenizer(english_tokenizer::LATEST);
}
14019 
// Factory method: create the latest-version generic tokenizer. Caller owns
// the returned pointer.
tokenizer* tokenizer::new_generic_tokenizer() {
  return new generic_tokenizer(generic_tokenizer::LATEST);
}
14023 
14024 } // namespace morphodita
14025 
14026 /////////
14027 // File: morphodita/tokenizer/tokenizer_ids.h
14028 /////////
14029 
14030 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
14031 //
14032 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14033 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14034 //
14035 // This Source Code Form is subject to the terms of the Mozilla Public
14036 // License, v. 2.0. If a copy of the MPL was not distributed with this
14037 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14038 
14039 namespace morphodita {
14040 
// Numeric identifiers of the available tokenizers, used as the first byte of
// serialized tokenizer models, together with parsing of their textual names.
class tokenizer_ids {
 public:
  enum tokenizer_id {
    CZECH = 0,
    ENGLISH = 1,
    GENERIC = 2,
    GRU = 3,
  };

  // Map a textual tokenizer name to its identifier. Returns true and sets id
  // on success; returns false (leaving id untouched) for unknown names.
  static bool parse(const string& str, tokenizer_id& id) {
    if (str == "czech") { id = CZECH; return true; }
    if (str == "english") { id = ENGLISH; return true; }
    if (str == "generic") { id = GENERIC; return true; }
    if (str == "gru") { id = GRU; return true; }
    return false;
  }
};
14058 
14059 typedef tokenizer_ids::tokenizer_id tokenizer_id;
14060 
14061 } // namespace morphodita
14062 
14063 /////////
14064 // File: morphodita/tokenizer/tokenizer_factory.cpp
14065 /////////
14066 
14067 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
14068 //
14069 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14070 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14071 //
14072 // This Source Code Form is subject to the terms of the Mozilla Public
14073 // License, v. 2.0. If a copy of the MPL was not distributed with this
14074 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14075 
14076 namespace morphodita {
14077 
load(istream & is)14078 tokenizer_factory* tokenizer_factory::load(istream& is) {
14079   tokenizer_id id = tokenizer_id(is.get());
14080   switch (id) {
14081     case tokenizer_ids::GENERIC:
14082       {
14083         auto res = new_unique_ptr<generic_tokenizer_factory>();
14084         if (res->load(is)) return res.release();
14085         break;
14086       }
14087     case tokenizer_ids::GRU:
14088       {
14089         auto res = new_unique_ptr<gru_tokenizer_factory>();
14090         if (res->load(is)) return res.release();
14091         break;
14092       }
14093     case tokenizer_ids::CZECH:
14094       break;
14095     case tokenizer_ids::ENGLISH:
14096       break;
14097   }
14098 
14099   return nullptr;
14100 }
14101 
load(const char * fname)14102 tokenizer_factory* tokenizer_factory::load(const char* fname) {
14103   ifstream f(fname, ifstream::binary);
14104   if (!f) return nullptr;
14105 
14106   return load(f);
14107 }
14108 
14109 } // namespace morphodita
14110 
14111 /////////
14112 // File: morphodita/tokenizer/unicode_tokenizer.cpp
14113 /////////
14114 
14115 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
14116 //
14117 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14118 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14119 //
14120 // This Source Code Form is subject to the terms of the Mozilla Public
14121 // License, v. 2.0. If a copy of the MPL was not distributed with this
14122 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14123 
14124 namespace morphodita {
14125 
// Construct a tokenizer. url_email_tokenizer selects the URL/e-mail
// recognizer version (0 disables it). The shared Unicode->Ragel character
// map is initialized eagerly and the input text is reset to empty.
unicode_tokenizer::unicode_tokenizer(unsigned url_email_tokenizer) : url_email_tokenizer(url_email_tokenizer) {
  ragel_tokenizer::initialize_ragel_map();

  set_text(string_piece(nullptr, 0));
}
14131 
set_text(string_piece text,bool make_copy)14132 void unicode_tokenizer::set_text(string_piece text, bool make_copy /*= false*/) {
14133   using namespace unilib;
14134 
14135   if (make_copy && text.str) {
14136     text_buffer.assign(text.str, text.len);
14137     text.str = text_buffer.c_str();
14138   }
14139   current = 0;
14140 
14141   chars.clear();
14142   for (const char* curr_str = text.str; text.len; curr_str = text.str)
14143     chars.emplace_back(utf8::decode(text.str, text.len), curr_str);
14144   chars.emplace_back(0, text.str);
14145 }
14146 
next_sentence(vector<string_piece> * forms,vector<token_range> * tokens_ptr)14147 bool unicode_tokenizer::next_sentence(vector<string_piece>* forms, vector<token_range>* tokens_ptr) {
14148   vector<token_range>& tokens = tokens_ptr ? *tokens_ptr : tokens_buffer;
14149   tokens.clear();
14150   if (forms) forms->clear();
14151   if (current >= chars.size() - 1) return false;
14152 
14153   bool result = next_sentence(tokens);
14154   if (forms)
14155     for (auto&& token : tokens)
14156       forms->emplace_back(chars[token.start].str, chars[token.start + token.length].str - chars[token.start].str);
14157 
14158   return result;
14159 }
14160 
tokenize_url_email(vector<token_range> & tokens)14161 bool unicode_tokenizer::tokenize_url_email(vector<token_range>& tokens) {
14162   if (current >= chars.size() - 1) return false;
14163 
14164   return url_email_tokenizer ? ragel_tokenizer::ragel_url_email(url_email_tokenizer, chars, current, tokens) : false;
14165 }
14166 
emergency_sentence_split(const vector<token_range> & tokens)14167 bool unicode_tokenizer::emergency_sentence_split(const vector<token_range>& tokens) {
14168   using namespace unilib;
14169 
14170   // Implement emergency splitting for large sentences
14171   return tokens.size() >= 500 ||
14172          (tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) ||
14173          (tokens.size() >= 400 && chars[tokens.back().start].cat & unicode::Po);
14174 }
14175 
is_eos(const vector<token_range> & tokens,char32_t eos_chr,const unordered_set<string> * abbreviations)14176 bool unicode_tokenizer::is_eos(const vector<token_range>& tokens, char32_t eos_chr, const unordered_set<string>* abbreviations) {
14177   using namespace unilib;
14178 
14179   if (eos_chr == '.' && !tokens.empty()) {
14180     // Ignore one-letter capitals before dot
14181     if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut)
14182       return false;
14183 
14184     // Ignore specified abbreviations
14185     if (abbreviations) {
14186       eos_buffer.clear();
14187       for (size_t i = 0; i < tokens.back().length; i++)
14188         utf8::append(eos_buffer, unicode::lowercase(chars[tokens.back().start + i].chr));
14189       if (abbreviations->count(eos_buffer))
14190         return false;
14191     }
14192   }
14193   return true;
14194 }
14195 
14196 } // namespace morphodita
14197 
14198 /////////
14199 // File: morphodita/tokenizer/vertical_tokenizer.cpp
14200 /////////
14201 
14202 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
14203 //
14204 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14205 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14206 //
14207 // This Source Code Form is subject to the terms of the Mozilla Public
14208 // License, v. 2.0. If a copy of the MPL was not distributed with this
14209 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14210 
14211 namespace morphodita {
14212 
next_sentence(vector<token_range> & tokens)14213 bool vertical_tokenizer::next_sentence(vector<token_range>& tokens) {
14214   if (current >= chars.size() - 1) return false;
14215 
14216   while (true) {
14217     size_t line_start = current;
14218     while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++;
14219 
14220     size_t line_end = current;
14221     if (current < chars.size() - 1) {
14222       current++;
14223       if (current < chars.size() - 1 &&
14224           ((chars[current-1].chr == '\r' && chars[current].chr == '\n') ||
14225            (chars[current-1].chr == '\n' && chars[current].chr == '\r')))
14226         current++;
14227     }
14228 
14229     if (line_start < line_end)
14230       tokens.emplace_back(line_start, line_end - line_start);
14231     else
14232       break;
14233   }
14234 
14235   return true;
14236 }
14237 
14238 } // namespace morphodita
14239 
14240 /////////
14241 // File: unilib/version.h
14242 /////////
14243 
14244 // This file is part of UniLib <http://github.com/ufal/unilib/>.
14245 //
14246 // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
14247 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14248 //
14249 // This Source Code Form is subject to the terms of the Mozilla Public
14250 // License, v. 2.0. If a copy of the MPL was not distributed with this
14251 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14252 //
14253 // UniLib version: 3.1.1
14254 // Unicode version: 8.0.0
14255 
14256 namespace unilib {
14257 
// Version of the bundled UniLib library (major.minor.patch[-prerelease]).
struct version {
  unsigned major;
  unsigned minor;
  unsigned patch;
  std::string prerelease; // pre-release suffix, if any

  // Returns current version.
  static version current();
};
14267 
14268 } // namespace unilib
14269 
14270 /////////
14271 // File: morphodita/version/version.h
14272 /////////
14273 
14274 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
14275 //
14276 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14277 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14278 //
14279 // This Source Code Form is subject to the terms of the Mozilla Public
14280 // License, v. 2.0. If a copy of the MPL was not distributed with this
14281 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14282 
14283 namespace morphodita {
14284 
// Version of the bundled MorphoDiTa library (major.minor.patch[-prerelease]).
class version {
 public:
  unsigned major;
  unsigned minor;
  unsigned patch;
  string prerelease; // pre-release suffix, if any

  // Returns current MorphoDiTa version.
  static version current();

  // Returns multi-line formatted version and copyright string.
  static string version_and_copyright(const string& other_libraries = string());
};
14298 
14299 } // namespace morphodita
14300 
14301 /////////
14302 // File: morphodita/version/version.cpp
14303 /////////
14304 
14305 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
14306 //
14307 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14308 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14309 //
14310 // This Source Code Form is subject to the terms of the Mozilla Public
14311 // License, v. 2.0. If a copy of the MPL was not distributed with this
14312 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14313 
14314 namespace morphodita {
14315 
current()14316 version version::current() {
14317   return {1, 9, 3, "devel"};
14318 }
14319 
14320 // Returns multi-line formated version and copyright string.
version_and_copyright(const string & other_libraries)14321 string version::version_and_copyright(const string& other_libraries) {
14322   ostringstream info;
14323 
14324   auto morphodita = version::current();
14325   auto unilib = unilib::version::current();
14326 
14327   info << "MorphoDiTa version " << morphodita.major << '.' << morphodita.minor << '.' << morphodita.patch
14328        << (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease
14329        << " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch
14330        << (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
14331           "Copyright 2015 by Institute of Formal and Applied Linguistics, Faculty of\n"
14332           "Mathematics and Physics, Charles University in Prague, Czech Republic.";
14333 
14334   return info.str();
14335 }
14336 
14337 } // namespace morphodita
14338 
14339 /////////
14340 // File: parsito/configuration/configuration.cpp
14341 /////////
14342 
14343 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14344 //
14345 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14346 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14347 //
14348 // This Source Code Form is subject to the terms of the Mozilla Public
14349 // License, v. 2.0. If a copy of the MPL was not distributed with this
14350 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14351 
14352 namespace parsito {
14353 
init(tree * t)14354 void configuration::init(tree* t) {
14355   assert(t);
14356 
14357   t->unlink_all_nodes();
14358   this->t = t;
14359 
14360   stack.clear();
14361   if (!t->nodes.empty()) stack.push_back(0);
14362 
14363   buffer.clear();
14364   buffer.reserve(t->nodes.size());
14365   for (size_t i = t->nodes.size(); i > 1; i--)
14366     buffer.push_back(i - 1);
14367 }
14368 
final()14369 bool configuration::final() {
14370   return buffer.empty() && stack.size() <= 1;
14371 }
14372 
14373 } // namespace parsito
14374 
14375 /////////
14376 // File: parsito/configuration/node_extractor.h
14377 /////////
14378 
14379 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14380 //
14381 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14382 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14383 //
14384 // This Source Code Form is subject to the terms of the Mozilla Public
14385 // License, v. 2.0. If a copy of the MPL was not distributed with this
14386 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14387 
14388 namespace parsito {
14389 
// Extracts nodes of a configuration according to a textual description:
// each selector starts at a given depth on the stack or in the buffer and
// optionally follows parent/child links (see create for the exact syntax).
class node_extractor {
 public:
  // Number of selectors, i.e. number of nodes filled in by extract.
  unsigned node_count() const;
  // Fill nodes with one node index per selector; -1 marks a missing node.
  void extract(const configuration& conf, vector<int>& nodes) const;

  // Parse the selector description; on failure sets error and returns false.
  bool create(string_piece description, string& error);

 private:
  enum start_t { STACK = 0, BUFFER = 1 };
  enum direction_t { PARENT = 0, CHILD = 1 };
  // One selector: a starting position plus a sequence of links to follow.
  struct node_selector {
    pair<start_t, int> start;
    vector<pair<direction_t, int>> directions;

    node_selector(start_t start, int start_index) : start(start, start_index) {}
  };

  vector<node_selector> selectors;
};
14409 
14410 } // namespace parsito
14411 
14412 /////////
14413 // File: parsito/configuration/node_extractor.cpp
14414 /////////
14415 
14416 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14417 //
14418 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14419 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14420 //
14421 // This Source Code Form is subject to the terms of the Mozilla Public
14422 // License, v. 2.0. If a copy of the MPL was not distributed with this
14423 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14424 
14425 namespace parsito {
14426 
// Number of nodes produced by extract (one per parsed selector).
unsigned node_extractor::node_count() const {
  return selectors.size();
}
14430 
extract(const configuration & conf,vector<int> & nodes) const14431 void node_extractor::extract(const configuration& conf, vector<int>& nodes) const {
14432   nodes.clear();
14433   for (auto&& selector : selectors) {
14434     // Start by locating starting node
14435     int current = -1;
14436     switch (selector.start.first) {
14437       case STACK:
14438         if (selector.start.second < int(conf.stack.size()))
14439           current = conf.stack[conf.stack.size() - 1 - selector.start.second];
14440         break;
14441       case BUFFER:
14442         if (selector.start.second < int(conf.buffer.size()))
14443           current = conf.buffer[conf.buffer.size() - 1 - selector.start.second];
14444         break;
14445     }
14446 
14447     // Follow directions to the final node
14448     if (current >= 0)
14449       for (auto&& direction : selector.directions) {
14450         const node& node = conf.t->nodes[current];
14451         switch (direction.first) {
14452           case PARENT:
14453             current = node.head ? node.head : -1;
14454             break;
14455           case CHILD:
14456             current = direction.second >= 0 && direction.second < int(node.children.size()) ?
14457                         node.children[direction.second] :
14458                       direction.second < 0 && -direction.second <= int(node.children.size()) ?
14459                         node.children[node.children.size() + direction.second] :
14460                         -1;
14461             break;
14462         }
14463         if (current <= 0) break;
14464       }
14465 
14466     // Add the selected node
14467     nodes.push_back(current);
14468   }
14469 }
14470 
// Parse a textual description of node selectors, one selector per line.
// Line syntax: "(stack|buffer) INDEX[,parent|,child INDEX]...". Empty lines
// and lines starting with '#' are skipped. On failure, error is filled and
// false is returned (selectors parsed so far are kept in this object).
bool node_extractor::create(string_piece description, string& error) {
  selectors.clear();
  error.clear();

  vector<string_piece> lines, parts, words;
  split(description, '\n', lines);
  for (auto&& line : lines) {
    // Skip empty lines and comments.
    if (!line.len || line.str[0] == '#') continue;

    // Separate start and directions
    split(line, ',', parts);

    // Parse start
    split(parts[0], ' ', words);
    if (words.size() != 2)
      return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;

    start_t start;
    if (words[0] == "stack")
      start = STACK;
    else if (words[0] == "buffer")
      start = BUFFER;
    else
      return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;

    int start_index;
    if (!parse_int(words[1], "starting index", start_index, error)) return false;

    selectors.emplace_back(start, start_index);

    // Parse directions; "parent" takes no argument, "child" takes an index
    // (negative values count from the last child).
    for (size_t i = 1; i < parts.size(); i++) {
      split(parts[i], ' ', words);
      if (words.empty())
        return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false;

      if (words[0] == "parent") {
        if (words.size() != 1)
          return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false;
        selectors.back().directions.emplace_back(PARENT, 0);
      } else if (words[0] == "child") {
        if (words.size() != 2)
          return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
        int child_index;
        if (!parse_int(words[1], "child index", child_index, error)) return false;
        selectors.back().directions.emplace_back(CHILD, child_index);
      } else {
        return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
      }
    }
  }

  return true;
}
14525 
14526 } // namespace parsito
14527 
14528 /////////
14529 // File: parsito/configuration/value_extractor.h
14530 /////////
14531 
14532 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14533 //
14534 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14535 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14536 //
14537 // This Source Code Form is subject to the terms of the Mozilla Public
14538 // License, v. 2.0. If a copy of the MPL was not distributed with this
14539 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14540 
14541 namespace parsito {
14542 
// Extracts one string-valued field (form, lemma, tag, ...) from a tree node.
class value_extractor {
 public:
  // Store the selected value of node n into value.
  void extract(const node& n, string& value) const;

  // Parse a value selector description — one of "form", "lemma", "lemma_id",
  // "tag", "universal_tag", "feats", "universal_tag_feats" or "deprel";
  // returns false and sets error for anything else.
  bool create(string_piece description, string& error);

 private:
  enum value_t { FORM = 0, LEMMA = 1, LEMMA_ID = 2, TAG = 3, UNIVERSAL_TAG = 4,
    FEATS = 5, UNIVERSAL_TAG_FEATS = 6, DEPREL = 7 };
  value_t selector;
};
14554 
14555 } // namespace parsito
14556 
14557 /////////
14558 // File: parsito/configuration/value_extractor.cpp
14559 /////////
14560 
14561 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14562 //
14563 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14564 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14565 //
14566 // This Source Code Form is subject to the terms of the Mozilla Public
14567 // License, v. 2.0. If a copy of the MPL was not distributed with this
14568 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14569 
14570 namespace parsito {
14571 
extract(const node & n,string & value) const14572 void value_extractor::extract(const node& n, string& value) const {
14573   switch (selector) {
14574     case FORM:
14575       value.assign(n.form);
14576       break;
14577     case LEMMA:
14578       value.assign(n.lemma);
14579       break;
14580     case LEMMA_ID:
14581       if (!n.misc.empty()) {
14582         // Try finding LId= in misc column
14583         auto lid = n.misc.find("LId=");
14584         if (lid != string::npos) {
14585           lid += 4;
14586 
14587           // Find optional | ending the lemma_id
14588           auto lid_end = n.misc.find('|', lid);
14589           if (lid_end == string::npos) lid_end = n.misc.size();
14590 
14591           // Store the lemma_id
14592           value.assign(n.misc, lid, lid_end - lid);
14593           break;
14594         }
14595       }
14596       value.assign(n.lemma);
14597       break;
14598     case TAG:
14599       value.assign(n.xpostag);
14600       break;
14601     case UNIVERSAL_TAG:
14602       value.assign(n.upostag);
14603       break;
14604     case FEATS:
14605       value.assign(n.feats);
14606       break;
14607     case UNIVERSAL_TAG_FEATS:
14608       value.assign(n.upostag).append(n.feats);
14609       break;
14610     case DEPREL:
14611       value.assign(n.deprel);
14612       break;
14613   }
14614 }
14615 
create(string_piece description,string & error)14616 bool value_extractor::create(string_piece description, string& error) {
14617   error.clear();
14618 
14619   if (description == "form")
14620     selector = FORM;
14621   else if (description == "lemma")
14622     selector = LEMMA;
14623   else if (description == "lemma_id")
14624     selector = LEMMA_ID;
14625   else if (description == "tag")
14626     selector = TAG;
14627   else if (description == "universal_tag")
14628     selector = UNIVERSAL_TAG;
14629   else if (description == "feats")
14630     selector = FEATS;
14631   else if (description == "universal_tag_feats")
14632     selector = UNIVERSAL_TAG_FEATS;
14633   else if (description == "deprel")
14634     selector = DEPREL;
14635   else
14636     return error.assign("Cannot parse value selector '").append(description.str, description.len).append("'!"), false;
14637 
14638   return true;
14639 }
14640 
14641 } // namespace parsito
14642 
14643 /////////
14644 // File: parsito/embedding/embedding.h
14645 /////////
14646 
14647 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14648 //
14649 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14650 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14651 //
14652 // This Source Code Form is subject to the terms of the Mozilla Public
14653 // License, v. 2.0. If a copy of the MPL was not distributed with this
14654 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14655 
14656 namespace parsito {
14657 
// Word embedding table: maps words to integer ids and ids to weight vectors
// of a fixed dimension, with normalization heuristics for unknown words.
class embedding {
 public:
  unsigned dimension; // size of one embedding vector

  // Id of the given word, trying several case/number normalizations before
  // giving up; buffer is scratch space. Returns unknown_word() when nothing
  // matches.
  int lookup_word(const string& word, string& buffer) const;
  // Id reserved for unknown words (-1 when no unknown vector exists).
  int unknown_word() const;
  float* weight(int id); // nullptr for wrong id
  const float* weight(int id) const; // nullptr for wrong id

  // NOTE(review): presumably decided by comparing id with updatable_index —
  // confirm against the implementation, which is outside this excerpt.
  bool can_update_weights(int id) const;

  void load(binary_decoder& data);
  void save(binary_encoder& enc) const;

  // Build the embedding from explicit word/weight pairs.
  void create(unsigned dimension, int updatable_index, const vector<pair<string, vector<float>>>& words, const vector<float>& unknown_weights);
  // Dump the dictionary words with their weights and the unknown vector.
  void export_embeddings(vector<pair<string, vector<float>>>& words, vector<float>& unknown_weights) const;
 private:
  int updatable_index, unknown_index;

  unordered_map<string, int> dictionary;
  vector<float> weights; // dictionary.size() (+1 for unknown) rows of `dimension` floats
};
14680 
14681 } // namespace parsito
14682 
14683 /////////
14684 // File: parsito/embedding/embedding.cpp
14685 /////////
14686 
14687 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14688 //
14689 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14690 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14691 //
14692 // This Source Code Form is subject to the terms of the Mozilla Public
14693 // License, v. 2.0. If a copy of the MPL was not distributed with this
14694 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14695 
14696 namespace parsito {
14697 
// Look up the id of the given word in the dictionary. When the exact word is
// not present, several case- and digit-normalization heuristics are tried in
// turn (using `buffer` as scratch space for the normalized variants); if all
// of them fail, unknown_index is returned.
int embedding::lookup_word(const string& word, string& buffer) const {
  using namespace unilib;

  auto it = dictionary.find(word);
  if (it != dictionary.end()) return it->second;

  // We now apply several heuristics to find a match

  // Try locating uppercase/titlecase characters which we could lowercase
  bool first = true;
  unicode::category_t first_category = 0, other_categories = 0;
  for (auto&& chr : utf8::decoder(word)) {
    (first ? first_category : other_categories) |= unicode::category(chr);
    first = false;
  }

  if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) {
    // Lowercase all characters but the first
    buffer.clear();
    first = true;
    for (auto&& chr : utf8::decoder(word)) {
      utf8::append(buffer, first ? chr : unicode::lowercase(chr));
      first = false;
    }

    it = dictionary.find(buffer);
    if (it != dictionary.end()) return it->second;
  }

  // Otherwise (or additionally), try lowercasing the whole word.
  if ((first_category & unicode::Lut) || (other_categories & unicode::Lut)) {
    utf8::map(unicode::lowercase, word, buffer);

    it = dictionary.find(buffer);
    if (it != dictionary.end()) return it->second;
  }

  // If the word starts with digit and contain only digits and non-letter characters
  // i.e. large number, date, time, try replacing it with first digit only.
  if ((first_category & unicode::N) && !(other_categories & unicode::L)) {
    buffer.clear();
    utf8::append(buffer, utf8::first(word));

    it = dictionary.find(buffer);
    if (it != dictionary.end()) return it->second;
  }

  // No heuristic matched -- report the unknown word id.
  return unknown_index;
}
14746 
// Return the id of the unknown-word embedding, or -1 when none exists.
int embedding::unknown_word() const {
  return unknown_index;
}
14750 
weight(int id)14751 float* embedding::weight(int id) {
14752   if (id < 0 || id * dimension >= weights.size()) return nullptr;
14753   return weights.data() + id * dimension;
14754 }
14755 
weight(int id) const14756 const float* embedding::weight(int id) const {
14757   if (id < 0 || id * dimension >= weights.size()) return nullptr;
14758   return weights.data() + id * dimension;
14759 }
14760 
// Deserialize the embedding written by embedding::save: dimension, the
// dictionary words (in id order), an unknown-word flag, and the raw weight
// matrix with one row per word plus an optional unknown-word row.
void embedding::load(binary_decoder& data) {
  // Load dimension
  dimension = data.next_4B();

  // Loaded embeddings are not updatable: with updatable_index at INT_MAX,
  // can_update_weights is false for every valid id.
  updatable_index = numeric_limits<decltype(updatable_index)>::max();

  // Load dictionary; ids are assigned in the order the words were stored.
  dictionary.clear();
  string word;
  for (unsigned size = data.next_4B(); size; size--) {
    data.next_str(word);
    dictionary.emplace(word, dictionary.size());
  }

  // One-byte flag: nonzero iff an unknown-word row follows the regular rows.
  unknown_index = data.next_1B() ? dictionary.size() : -1;

  // Load weights
  const float* weights_ptr = data.next<float>(dimension * (dictionary.size() + (unknown_index >= 0)));
  weights.assign(weights_ptr, weights_ptr + dimension * (dictionary.size() + (unknown_index >= 0)));
}
14781 
14782 } // namespace parsito
14783 
14784 /////////
14785 // File: parsito/embedding/embedding_encode.cpp
14786 /////////
14787 
14788 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14789 //
14790 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14791 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14792 //
14793 // This Source Code Form is subject to the terms of the Mozilla Public
14794 // License, v. 2.0. If a copy of the MPL was not distributed with this
14795 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14796 
14797 namespace parsito {
14798 
save(binary_encoder & enc) const14799 void embedding::save(binary_encoder& enc) const {
14800   // Save dimension and update_weight
14801   enc.add_4B(dimension);
14802 
14803   // Save the dictionary
14804   vector<string_piece> words(dictionary.size());
14805   for (auto&& entry : dictionary) {
14806     assert(entry.second >= 0 && entry.second < int(dictionary.size()));
14807     words[entry.second] = entry.first;
14808   }
14809   enc.add_4B(dictionary.size());
14810   for (auto&& word : words)
14811     enc.add_str(word);
14812 
14813   enc.add_1B(unknown_index >= 0);
14814 
14815   // Save the weights
14816   enc.add_data(weights);
14817 }
14818 
// Return whether the weights of the given embedding id may be updated during
// training, i.e., whether the id is at least updatable_index.
bool embedding::can_update_weights(int id) const {
  return id >= int(updatable_index);
}
14822 
create(unsigned dimension,int updatable_index,const vector<pair<string,vector<float>>> & words,const vector<float> & unknown_weights)14823 void embedding::create(unsigned dimension, int updatable_index, const vector<pair<string, vector<float>>>& words, const vector<float>& unknown_weights) {
14824   this->dimension = dimension;
14825   this->updatable_index = updatable_index;
14826 
14827   dictionary.clear();
14828   weights.clear();
14829   for (auto&& word : words) {
14830     assert(word.second.size() == dimension);
14831     dictionary.emplace(word.first, dictionary.size());
14832     weights.insert(weights.end(), word.second.begin(), word.second.end());
14833   }
14834 
14835   if (unknown_weights.empty()) {
14836     this->unknown_index = -1;
14837   } else {
14838     this->unknown_index = dictionary.size();
14839     weights.insert(weights.end(), unknown_weights.begin(), unknown_weights.end());
14840   }
14841 }
14842 
export_embeddings(vector<pair<string,vector<float>>> & words,vector<float> & unknown_weights) const14843 void embedding::export_embeddings(vector<pair<string, vector<float>>>& words, vector<float>& unknown_weights) const {
14844   words.clear();
14845   unknown_weights.clear();
14846 
14847   if (dictionary.empty()) return;
14848 
14849   assert(unknown_index < 0 || unknown_index == int(dictionary.size()));
14850 
14851   words.resize(dictionary.size());
14852   for (auto&& entry : dictionary) {
14853     words[entry.second].first = entry.first;
14854     words[entry.second].second.assign(weights.data() + entry.second * dimension, weights.data() + entry.second * dimension + dimension);
14855   }
14856   if (unknown_index >= 0)
14857     unknown_weights.assign(weights.data() + unknown_index * dimension, weights.data() + unknown_index * dimension + dimension);
14858 }
14859 
14860 } // namespace parsito
14861 
14862 /////////
14863 // File: parsito/network/activation_function.h
14864 /////////
14865 
14866 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14867 //
14868 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14869 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14870 //
14871 // This Source Code Form is subject to the terms of the Mozilla Public
14872 // License, v. 2.0. If a copy of the MPL was not distributed with this
14873 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14874 
14875 namespace parsito {
14876 
14877 struct activation_function {
14878   enum type { TANH = 0, CUBIC = 1, RELU = 2 };
14879 
createufal::udpipe::parsito::activation_function14880   static bool create(string_piece name, type& activation) {
14881     if (name == "tanh") return activation = TANH, true;
14882     if (name == "cubic") return activation = CUBIC, true;
14883     if (name == "relu") return activation = RELU, true;
14884     return false;
14885   }
14886 };
14887 
14888 } // namespace parsito
14889 
14890 /////////
14891 // File: parsito/network/neural_network.h
14892 /////////
14893 
14894 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14895 //
14896 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14897 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14898 //
14899 // This Source Code Form is subject to the terms of the Mozilla Public
14900 // License, v. 2.0. If a copy of the MPL was not distributed with this
14901 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14902 
14903 namespace parsito {
14904 
// Feed-forward neural network with a single hidden layer, used to score
// parser transitions.
class neural_network {
 public:
  // cache[embedding][word] holds precomputed input->hidden products for that
  // word, concatenated over all input sequence positions.
  typedef vector<vector<vector<float>>> embeddings_cache;

  // Compute hidden_layer and outcomes for the given embedding id sequences;
  // `cache` optionally supplies precomputed products, `softmax` normalizes
  // the outcomes into a probability distribution.
  void propagate(const vector<embedding>& embeddings, const vector<const vector<int>*>& embedding_ids_sequences,
                 vector<float>& hidden_layer, vector<float>& outcomes, const embeddings_cache* cache = nullptr, bool softmax = true) const;

  void load(binary_decoder& data);
  void generate_tanh_cache();
  void generate_embeddings_cache(const vector<embedding>& embeddings, embeddings_cache& cache, unsigned max_words) const;

 private:
  friend class neural_network_trainer;

  // Read one rows x columns float matrix from the decoder.
  void load_matrix(binary_decoder& data, vector<vector<float>>& m);

  activation_function::type hidden_layer_activation;
  // weights[0]: input->hidden matrix, weights[1]: hidden->output matrix;
  // the last row of each holds the bias (see propagate).
  vector<vector<float>> weights[2];

  // Optional tanh lookup table filled by generate_tanh_cache.
  vector<float> tanh_cache;
};
14926 
14927 } // namespace parsito
14928 
14929 /////////
14930 // File: parsito/network/neural_network.cpp
14931 /////////
14932 
14933 // This file is part of Parsito <http://github.com/ufal/parsito/>.
14934 //
14935 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14936 // Mathematics and Physics, Charles University in Prague, Czech Republic.
14937 //
14938 // This Source Code Form is subject to the terms of the Mozilla Public
14939 // License, v. 2.0. If a copy of the MPL was not distributed with this
14940 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
14941 
14942 namespace parsito {
14943 
load_matrix(binary_decoder & data,vector<vector<float>> & m)14944 void neural_network::load_matrix(binary_decoder& data, vector<vector<float>>& m) {
14945   unsigned rows = data.next_4B();
14946   unsigned columns = data.next_4B();
14947 
14948   m.resize(rows);
14949   for (auto&& row : m) {
14950     const float* row_ptr = data.next<float>(columns);
14951     row.assign(row_ptr, row_ptr + columns);
14952   }
14953 }
14954 
load(binary_decoder & data)14955 void neural_network::load(binary_decoder& data) {
14956   hidden_layer_activation = activation_function::type(data.next_1B());
14957   load_matrix(data, weights[0]);
14958   load_matrix(data, weights[1]);
14959 }
14960 
// Forward pass of the network. Every non-null element of
// embedding_ids_sequences supplies one embedding id per embedding table
// (negative ids are skipped); their embeddings are multiplied by the
// input->hidden matrix (or fetched from `cache`), passed through the
// activation function, multiplied by the hidden->output matrix, and
// optionally softmax-normalized into `outcomes`.
void neural_network::propagate(const vector<embedding>& embeddings, const vector<const vector<int>*>& embedding_ids_sequences,
                               vector<float>& hidden_layer, vector<float>& outcomes, const embeddings_cache* cache, bool softmax) const {
  assert(!weights[0].empty());
  assert(!weights[1].empty());
  for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size());

  unsigned hidden_layer_size = weights[0].front().size();
  unsigned outcomes_size = weights[1].front().size();

  outcomes.assign(outcomes_size, 0);

  // Hidden layer
  hidden_layer.assign(hidden_layer_size, 0);

  // `index` walks the rows of weights[0]: each (sequence, embedding) pair
  // owns a consecutive run of embeddings[i].dimension rows, and the one row
  // left over at the end is the bias (added below).
  unsigned index = 0;
  for (unsigned sequence = 0; sequence < embedding_ids_sequences.size(); sequence++)
    for (unsigned i = 0; i < embeddings.size(); index += embeddings[i].dimension, i++)
      if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) {
        unsigned word = embedding_ids_sequences[sequence]->at(i);
        if (cache && i < cache->size() && word < cache->at(i).size()) {
          // Use cache
          const float* precomputed = cache->at(i)[word].data() + sequence * hidden_layer_size;
          for (unsigned j = 0; j < hidden_layer_size; j++)
            hidden_layer[j] += precomputed[j];
        } else {
          // Compute directly
          const float* embedding = embeddings[i].weight(word);
          for (unsigned j = 0; j < embeddings[i].dimension; j++)
            for (unsigned k = 0; k < hidden_layer_size; k++)
              hidden_layer[k] += embedding[j] * weights[0][index + j][k];
        }
      }
  for (unsigned i = 0; i < hidden_layer_size; i++) // Bias
    hidden_layer[i] += weights[0][index][i];

  // Activation function
  switch (hidden_layer_activation) {
    case activation_function::TANH:
      // When generate_tanh_cache has been called, tanh is looked up in a
      // table with step 1/32768 over (-10, 10), saturating to +-1 outside.
      if (!tanh_cache.empty())
        for (auto&& weight : hidden_layer)
          weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)];
      else
        for (auto&& weight : hidden_layer)
          weight = tanh(weight);
      break;
    case activation_function::CUBIC:
      for (auto&& weight : hidden_layer)
        weight = weight * weight * weight;
      break;
    case activation_function::RELU:
      for (auto&& weight : hidden_layer)
        if (weight < 0) weight = 0;
      break;
  }

  // Output layer: hidden activations times weights[1], plus its bias row.
  for (unsigned i = 0; i < hidden_layer_size; i++)
    for (unsigned j = 0; j < outcomes_size; j++)
      outcomes[j] += hidden_layer[i] * weights[1][i][j];
  for (unsigned i = 0; i < outcomes_size; i++) // Bias
    outcomes[i] += weights[1][hidden_layer_size][i];

  // Softmax if requested
  if (softmax) {
    // Subtract the maximum before exponentiation for numerical stability.
    float max = outcomes[0];
    for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i];

    float sum = 0;
    for (unsigned i = 0; i < outcomes_size; i++) sum += (outcomes[i] = exp(outcomes[i] - max));
    sum = 1 / sum;

    for (unsigned i = 0; i < outcomes_size; i++) outcomes[i] *= sum;
  }
}
15034 
generate_tanh_cache()15035 void neural_network::generate_tanh_cache() {
15036   tanh_cache.resize(2 * 10 * 32768);
15037   for (unsigned i = 0; i < tanh_cache.size(); i++)
15038     tanh_cache[i] = tanh(i / 32768.0 - 10);
15039 }
15040 
// Precompute, for up to max_words ids of every embedding, the products of
// the embedding weights with the input->hidden matrix, so that propagate can
// add cached rows instead of multiplying on the fly.
void neural_network::generate_embeddings_cache(const vector<embedding>& embeddings, embeddings_cache& cache, unsigned max_words) const {
  // Total dimension of all embeddings, i.e. one sequence position's input.
  unsigned embeddings_dim = 0;
  for (auto&& embedding : embeddings) embeddings_dim += embedding.dimension;

  // weights[0] has embeddings_dim rows per sequence position plus one bias
  // row at the end.
  unsigned sequences = weights[0].size() / embeddings_dim;
  assert(sequences * embeddings_dim + 1 == weights[0].size());

  unsigned hidden_layer_size = weights[0].front().size();

  cache.resize(embeddings.size());
  for (unsigned i = 0, weight_index = 0; i < embeddings.size(); weight_index += embeddings[i].dimension, i++) {
    // Cache only ids that actually exist in this embedding.
    unsigned words = 0;
    while (words < max_words && embeddings[i].weight(words)) words++;

    cache[i].resize(words);
    for (unsigned word = 0; word < words; word++) {
      const float* embedding = embeddings[i].weight(word);

      // For every sequence position, precompute this word's contribution to
      // the whole hidden layer.
      cache[i][word].assign(sequences * hidden_layer_size, 0);
      for (unsigned sequence = 0, index = weight_index; sequence < sequences; index += embeddings_dim, sequence++)
        for (unsigned j = 0; j < embeddings[i].dimension; j++)
          for (unsigned k = 0; k < hidden_layer_size; k++)
            cache[i][word][sequence * hidden_layer_size + k] += embedding[j] * weights[0][index + j][k];
    }
  }
}
15067 
15068 } // namespace parsito
15069 
15070 /////////
15071 // File: parsito/network/network_parameters.h
15072 /////////
15073 
15074 // This file is part of Parsito <http://github.com/ufal/parsito/>.
15075 //
15076 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15077 // Mathematics and Physics, Charles University in Prague, Czech Republic.
15078 //
15079 // This Source Code Form is subject to the terms of the Mozilla Public
15080 // License, v. 2.0. If a copy of the MPL was not distributed with this
15081 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
15082 
15083 namespace parsito {
15084 
// Hyperparameters of the weight-update algorithm used during training.
struct network_trainer {
  enum network_trainer_algorithm {
    SGD,
    SGD_MOMENTUM,
    ADAGRAD,
    ADADELTA,
    ADAM,
  };

  network_trainer_algorithm algorithm;
  // Initial and final learning rate; next_iteration anneals between them
  // (ignored by ADADELTA).
  float learning_rate, learning_rate_final;
  // Decay rates: momentum for SGD_MOMENTUM/ADADELTA and Adam's first moment,
  // momentum2 for Adam's second moment.
  float momentum, momentum2;
  // Numerical-stability constant used by ADAGRAD, ADADELTA and ADAM.
  float epsilon;
};
15099 
// Training configuration of the neural network.
struct network_parameters {
  unsigned iterations;                          // number of training iterations
  int structured_interval;                      // NOTE(review): consumed by the parser trainer; semantics not visible here -- confirm at caller
  unsigned hidden_layer;                        // hidden layer size; 0 disables weight allocation
  activation_function::type hidden_layer_type;  // hidden-layer activation function
  network_trainer trainer;                      // update-algorithm hyperparameters
  unsigned batch_size;
  float initialization_range;                   // positive: uniform init bound; negative: |value| * sqrt(6/(fan_in+fan_out))
  float l1_regularization;
  float l2_regularization;
  float maxnorm_regularization;
  float dropout_hidden, dropout_input;          // dropout probabilities for hidden/input units
  bool early_stopping;
};
15114 
15115 } // namespace parsito
15116 
15117 /////////
15118 // File: parsito/network/neural_network_trainer.h
15119 /////////
15120 
15121 // This file is part of Parsito <http://github.com/ufal/parsito/>.
15122 //
15123 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15124 // Mathematics and Physics, Charles University in Prague, Czech Republic.
15125 //
15126 // This Source Code Form is subject to the terms of the Mozilla Public
15127 // License, v. 2.0. If a copy of the MPL was not distributed with this
15128 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
15129 
15130 namespace parsito {
15131 
// Trainer of a neural_network: random weight initialization, forward
// propagation with dropout, backpropagation with a pluggable update
// algorithm, and regularization.
class neural_network_trainer {
 public:
  // Initializes the weights of `network` (when parameters.hidden_layer is
  // nonzero) and stores the training hyperparameters.
  neural_network_trainer(neural_network& network, unsigned input_size, unsigned output_size,
                         const network_parameters& parameters, mt19937& generator);

  // Advance to the next iteration (annealing the learning rate); returns
  // false once all iterations have been performed.
  bool next_iteration();

  // Per-training-run scratch space and accumulated state.
  struct workspace {
    unsigned batch = 0;
    vector<float> outcomes;
    vector<float> hidden_layer;
    vector<float> error_outcomes;
    vector<float> error_hidden;

    // Delta accumulators
    vector<vector<float>> weights_batch[2];
    vector<vector<vector<float>>> error_embedding;
    vector<vector<unsigned>> error_embedding_nonempty;

    // Trainer data
    // Per-weight state of the update algorithms (moving averages etc.).
    struct trainer_data {
      float delta = 0;
      float gradient = 0;
    };
    vector<vector<trainer_data>> weights_trainer[2];
    vector<vector<vector<trainer_data>>> embedding_trainer;

    // Dropout vectors
    vector<bool> input_dropout;
    vector<bool> hidden_dropout;
    vector<unsigned> hidden_kept;  // indices of hidden units not dropped out
  };
  void propagate(const vector<embedding>& embeddings, const vector<const vector<int>*>& embedding_ids_sequences, workspace& w) const;
  void backpropagate(vector<embedding>& embeddings, const vector<const vector<int>*>& embedding_ids_sequences, unsigned required_outcome, workspace& w);

  void finalize_sentence();

  void save_network(binary_encoder& enc) const;

 private:
  // One policy struct per update algorithm; delta computes a weight update
  // from a gradient, optionally using per-weight trainer_data state.
  struct trainer_sgd {
    static bool need_trainer_data;
    static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
  };
  struct trainer_sgd_momentum {
    static bool need_trainer_data;
    static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
  };
  struct trainer_adagrad {
    static bool need_trainer_data;
    static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
  };
  struct trainer_adadelta {
    static bool need_trainer_data;
    static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
  };
  struct trainer_adam {
    static bool need_trainer_data;
    static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
  };
  template <class TRAINER> void backpropagate_template(vector<embedding>& embeddings, const vector<const vector<int>*>& embedding_ids_sequences, unsigned required_outcome, workspace& w);

  void l1_regularize();
  void maxnorm_regularize();

  void save_matrix(const vector<vector<float>>& m, binary_encoder& enc) const;

  neural_network& network;
  mt19937& generator;
  unsigned iteration, iterations, steps;
  network_trainer trainer;
  unsigned batch_size;
  float l1_regularization, l2_regularization, maxnorm_regularization;
  float dropout_hidden, dropout_input;
};
15207 
15208 } // namespace parsito
15209 
15210 /////////
15211 // File: parsito/network/neural_network_trainer.cpp
15212 /////////
15213 
15214 // This file is part of Parsito <http://github.com/ufal/parsito/>.
15215 //
15216 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15217 // Mathematics and Physics, Charles University in Prague, Czech Republic.
15218 //
15219 // This Source Code Form is subject to the terms of the Mozilla Public
15220 // License, v. 2.0. If a copy of the MPL was not distributed with this
15221 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
15222 
15223 namespace parsito {
15224 
// Construct a trainer for the given network: randomly initialize both weight
// matrices (unless parameters.hidden_layer is 0) and copy the training
// hyperparameters.
neural_network_trainer::neural_network_trainer(neural_network& network, unsigned input_size, unsigned output_size,
                                               const network_parameters& parameters, mt19937& generator) : network(network), generator(generator) {
  // Initialize hidden layer
  network.hidden_layer_activation = parameters.hidden_layer_type;
  if (parameters.hidden_layer) {
    // A positive initialization_range is used directly as the uniform bound;
    // a negative one selects a sqrt(6 / (fan_in + fan_out)) bound scaled by
    // its magnitude (Glorot-style initialization).
    float uniform_pre_hidden_range = parameters.initialization_range > 0 ? parameters.initialization_range :
        -parameters.initialization_range * sqrt(6.0 / float(input_size + parameters.hidden_layer));
    uniform_real_distribution<float> uniform_pre_hidden(-uniform_pre_hidden_range, uniform_pre_hidden_range);

    network.weights[0].resize(input_size + 1/*bias*/);
    for (auto&& row : network.weights[0]) {
      row.resize(parameters.hidden_layer);
      for (auto&& weight : row)
        weight = uniform_pre_hidden(generator);
    }

    float uniform_post_hidden_range = parameters.initialization_range > 0 ? parameters.initialization_range :
        -parameters.initialization_range * sqrt(6.0 / float(output_size + parameters.hidden_layer));
    uniform_real_distribution<float> uniform_post_hidden(-uniform_post_hidden_range, uniform_post_hidden_range);

    network.weights[1].resize(parameters.hidden_layer + 1/*bias*/);
    for (auto&& row : network.weights[1]) {
      row.resize(output_size);
      for (auto&& weight : row)
        weight = uniform_post_hidden(generator);
    }
  }

  // Store the network_parameters
  iteration = steps = 0;
  iterations = parameters.iterations;
  trainer = parameters.trainer;
  batch_size = parameters.batch_size;
  l1_regularization = parameters.l1_regularization;
  l2_regularization = parameters.l2_regularization;
  maxnorm_regularization = parameters.maxnorm_regularization;
  dropout_hidden = parameters.dropout_hidden;
  dropout_input = parameters.dropout_input;

  // Maxnorm regularize the created weights
  if (maxnorm_regularization) maxnorm_regularize();
}
15267 
// Advance to the next training iteration; returns false once `iterations`
// iterations have been performed. For every algorithm except ADADELTA (which
// uses no global learning rate), the learning rate is exponentially annealed
// from learning_rate towards learning_rate_final over the remaining
// iterations.
bool neural_network_trainer::next_iteration() {
  if (iteration++ >= iterations) return false;

  if (trainer.algorithm != network_trainer::ADADELTA)
    if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1)
      trainer.learning_rate =
          exp(((iterations - iteration) * log(trainer.learning_rate) + log(trainer.learning_rate_final)) / (iterations - iteration + 1));

  return true;
}
15278 
// Forward pass used during training: like neural_network::propagate, but it
// samples fresh input/hidden dropout masks into `w`, rescales the kept units
// by 1/(1-p) (inverted dropout), and always finishes with a softmax.
void neural_network_trainer::propagate(const vector<embedding>& embeddings, const vector<const vector<int>*>& embedding_ids_sequences, workspace& w) const {
  // Initialize dropout if requested
  if (dropout_input) {
    w.input_dropout.resize(network.weights[0].size());
    bernoulli_distribution dropout(dropout_input);
    for (auto&& flag : w.input_dropout)
      flag = dropout(generator);
  }

  if (dropout_hidden) {
    w.hidden_dropout.resize(network.weights[1].size());
    bernoulli_distribution dropout(dropout_hidden);
    for (auto&& flag : w.hidden_dropout)
      flag = dropout(generator);
  }
  // Collect the indices of hidden units that survived dropout; subsequent
  // loops (and backpropagation) only touch these.
  w.hidden_kept.clear();
  for (unsigned i = 0; i < network.weights[0].front().size(); i++)
    if (w.hidden_dropout.empty() || !w.hidden_dropout[i])
      w.hidden_kept.push_back(i);

  // Propagate
  unsigned hidden_layer_size = network.weights[0].front().size();
  unsigned outcomes_size = network.weights[1].front().size();

  w.outcomes.assign(outcomes_size, 0);

  // Hidden layer
  w.hidden_layer.assign(hidden_layer_size, 0);

  // `index` walks the rows of weights[0]; each (sequence, embedding) pair
  // owns a consecutive run of embeddings[i].dimension rows.
  unsigned index = 0;
  for (auto&& embedding_ids : embedding_ids_sequences)
    // Note: The unnecessary brackets on the following for cycle are needed
    // to compile on VS 2015 Update 3, which otherwise fail to compile it.
    for (unsigned i = 0; i < embeddings.size(); i++) {
      if (embedding_ids && (*embedding_ids)[i] >= 0) {
        const float* embedding = embeddings[i].weight((*embedding_ids)[i]);
        for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, embedding++, index++)
          if (w.input_dropout.empty() || !w.input_dropout[index])
            for (auto&& j : w.hidden_kept)
              w.hidden_layer[j] += *embedding * network.weights[0][index][j];
      } else {
        // Negative id: skip this embedding but keep the row index aligned.
        index += embeddings[i].dimension;
      }
    }
  if (dropout_input) { // Dropout normalization
    float dropout_factor = 1. / (1. - dropout_input);
    for (auto&& i : w.hidden_kept)
      w.hidden_layer[i] *= dropout_factor;
  }
  for (auto&& i : w.hidden_kept) // Bias
    w.hidden_layer[i] += network.weights[0][index][i];

  // Activation function
  switch (network.hidden_layer_activation) {
    case activation_function::TANH:
      for (auto&& weight : w.hidden_layer)
        weight = tanh(weight);
      break;
    case activation_function::CUBIC:
      for (auto&& weight : w.hidden_layer)
        weight = weight * weight * weight;
      break;
    case activation_function::RELU:
      for (auto&& weight : w.hidden_layer)
        if (weight < 0) weight = 0;
      break;
  }
  if (dropout_hidden) { // Dropout normalization
    float dropout_factor = 1. / (1. - dropout_hidden);
    for (auto&& i : w.hidden_kept)
      w.hidden_layer[i] *= dropout_factor;
  }

  // Output layer, accumulating only from the kept hidden units.
  for (auto&& i : w.hidden_kept)
    for (unsigned j = 0; j < outcomes_size; j++)
      w.outcomes[j] += w.hidden_layer[i] * network.weights[1][i][j];
  for (unsigned i = 0; i < outcomes_size; i++) // Bias
    w.outcomes[i] += network.weights[1][hidden_layer_size][i];

  // Softmax (maximum subtracted for numerical stability)
  float max = w.outcomes[0];
  for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i];

  float sum = 0;
  for (unsigned i = 0; i < outcomes_size; i++) sum += (w.outcomes[i] = exp(w.outcomes[i] - max));
  sum = 1 / sum;

  for (unsigned i = 0; i < outcomes_size; i++) w.outcomes[i] *= sum;
}
15368 
15369 // SGD
15370 bool neural_network_trainer::trainer_sgd::need_trainer_data = false;
delta(float gradient,const network_trainer & trainer,workspace::trainer_data &)15371 float neural_network_trainer::trainer_sgd::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& /*data*/) {
15372   return trainer.learning_rate * gradient;
15373 }
15374 
// SGD with momentum: the update is an exponential moving average (decay
// trainer.momentum) of the learning-rate-scaled gradients, kept per weight
// in data.delta.
bool neural_network_trainer::trainer_sgd_momentum::need_trainer_data = true;
float neural_network_trainer::trainer_sgd_momentum::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) {
  data.delta = trainer.momentum * data.delta + trainer.learning_rate * gradient;
  return data.delta;
}
15381 
15382 // AdaGrad
15383 bool neural_network_trainer::trainer_adagrad::need_trainer_data = true;
delta(float gradient,const network_trainer & trainer,workspace::trainer_data & data)15384 float neural_network_trainer::trainer_adagrad::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) {
15385   data.gradient += gradient * gradient;
15386   return trainer.learning_rate / sqrt(data.gradient + trainer.epsilon) * gradient;
15387 }
15388 
// AdaDelta: keeps decaying averages of squared gradients (data.gradient) and
// of squared updates (data.delta); the ratio of their square roots gives a
// per-weight step size, so no global learning rate is used. Note that
// data.delta is updated only after the step has been computed.
bool neural_network_trainer::trainer_adadelta::need_trainer_data = true;
float neural_network_trainer::trainer_adadelta::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) {
  data.gradient = trainer.momentum * data.gradient + (1 - trainer.momentum) * gradient * gradient;
  float delta = sqrt(data.delta + trainer.epsilon) / sqrt(data.gradient + trainer.epsilon) * gradient;
  data.delta = trainer.momentum * data.delta + (1 - trainer.momentum) * delta * delta;
  return delta;
}
15397 
15398 // Adam
15399 bool neural_network_trainer::trainer_adam::need_trainer_data = true;
delta(float gradient,const network_trainer & trainer,workspace::trainer_data & data)15400 float neural_network_trainer::trainer_adam::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) {
15401   data.gradient = trainer.momentum * data.gradient + (1 - trainer.momentum) * gradient;
15402   data.delta = trainer.momentum2 * data.delta + (1 - trainer.momentum2) * gradient * gradient;
15403   return trainer.learning_rate * data.gradient / sqrt(data.delta + trainer.epsilon);
15404 }
15405 
// Backpropagation
//
// Performs one backpropagation step of the single-hidden-layer network for a
// single training example. All gradients are only accumulated into the
// workspace; the network weights, embedding tables and optional optimizer
// state are actually updated once batch_size examples have been processed.
//
// TRAINER supplies the concrete weight-update rule through TRAINER::delta()
// and signals via TRAINER::need_trainer_data whether per-weight optimizer
// state (momentum / squared-gradient accumulators) must be allocated.
//
// Parameters:
//   embeddings              - embedding tables, updated in place at batch end
//   embedding_ids_sequences - per extracted node, pointer to the embedding ids
//                             of its values, or nullptr for a missing node
//   required_outcome        - index of the gold outcome (training target)
//   w                       - workspace holding forward-pass results and
//                             gradient accumulators
template <class TRAINER>
void neural_network_trainer::backpropagate_template(vector<embedding>& embeddings, const vector<const vector<int>*>& embedding_ids_sequences, unsigned required_outcome, workspace& w) {
  size_t hidden_layer_size = network.weights[0].front().size();
  size_t outcomes_size = network.weights[1].front().size();

  // Allocate space for delta accumulators
  if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size());
  if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size());
  if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size());
  if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size());

  // Allocate space for trainer_data if required
  workspace::trainer_data none_trainer_data;
  if (TRAINER::need_trainer_data) {
    while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size());
    while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size);
  }

  // Compute error vector: gold one-hot vector minus the predicted
  // distribution (softmax + cross-entropy gradient).
  w.error_outcomes.resize(outcomes_size);
  for (unsigned i = 0; i < outcomes_size; i++)
    w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i];

  // Backpropagate error_outcomes to error_hidden; only hidden units kept by
  // dropout (w.hidden_kept) participate.
  w.error_hidden.assign(hidden_layer_size, 0);
  for (auto&& i : w.hidden_kept)
    for (unsigned j = 0; j < outcomes_size; j++)
      w.error_hidden[i] += network.weights[1][i][j] * w.error_outcomes[j];
  // Dropout normalization (inverted-dropout scaling)
  if (dropout_hidden) {
    float dropout_factor = 1. / (1. - dropout_hidden);
    for (auto&& i : w.hidden_kept)
      w.error_hidden[i] *= dropout_factor;
  }

  // Perform activation function derivation: multiply by the derivative of the
  // hidden activation, expressed through the activation's output value.
  switch (network.hidden_layer_activation) {
    case activation_function::TANH:
      // tanh'(x) = 1 - tanh(x)^2
      for (auto&& i : w.hidden_kept)
        w.error_hidden[i] *= 1 - w.hidden_layer[i] * w.hidden_layer[i];
      break;
    case activation_function::CUBIC:
      // For y = x^3, dy/dx = 3 x^2 = 3 cbrt(y)^2
      for (auto&& i : w.hidden_kept) {
        float hidden_layer = cbrt(w.hidden_layer[i]);
        w.error_hidden[i] *= 3 * hidden_layer * hidden_layer;
      }
      break;
    case activation_function::RELU:
      // ReLU passes gradient only where the activation was positive.
      for (auto&& i : w.hidden_kept)
        if (w.hidden_layer[i] <= 0)
          w.error_hidden[i] = 0;
      break;
  }

  // Update weights[1] (hidden -> outcomes), accumulating into the batch buffer
  for (auto&& i : w.hidden_kept) {
    if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size);
    for (unsigned j = 0; j < outcomes_size; j++)
      w.weights_batch[1][i][j] += w.hidden_layer[i] * w.error_outcomes[j];
  }
  // Bias (stored as the extra row at index hidden_layer_size)
  if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size);
  for (unsigned i = 0; i < outcomes_size; i++)
    w.weights_batch[1][hidden_layer_size][i] += w.error_outcomes[i];

  // Dropout normalization
  if (dropout_input) {
    float dropout_factor = 1. / (1. - dropout_input);
    for (auto&& i : w.hidden_kept)
      w.error_hidden[i] *= dropout_factor;
  }
  // Update weights[0] and backpropagate to error_embedding
  unsigned index = 0;
  for (auto&& embedding_ids : embedding_ids_sequences)
    // Note: The unnecessary brackets on the following for cycle are needed
    // to compile on VS 2015 Update 3, which otherwise fails to compile it.
    for (unsigned i = 0; i < embeddings.size(); i++) {
      if (embedding_ids && (*embedding_ids)[i] >= 0) {
        int embedding_id = (*embedding_ids)[i];

        float* error_embedding = nullptr; // Accumulate embedding error if required
        if (embeddings[i].can_update_weights(embedding_id)) {
          if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
          if (w.error_embedding[i][embedding_id].empty()) {
            w.error_embedding[i][embedding_id].assign(embeddings[i].dimension, 0);
            w.error_embedding_nonempty[i].emplace_back(embedding_id);
          }
          error_embedding = w.error_embedding[i][embedding_id].data();
        }

        const float* embedding = embeddings[i].weight(embedding_id);
        // The `error_embedding += !!error_embedding` trick keeps the pointer
        // at nullptr when no error is accumulated and advances it otherwise.
        for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
          if (w.input_dropout.empty() || !w.input_dropout[index]) {
            if (error_embedding)
              for (auto&& j : w.hidden_kept)
                *error_embedding += network.weights[0][index][j] * w.error_hidden[j];
            if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
            for (auto&& j : w.hidden_kept)
              w.weights_batch[0][index][j] += *embedding * w.error_hidden[j];
          }
      } else {
        index += embeddings[i].dimension;
      }
    }
  // Bias
  {
    // NOTE(review): the variable name suggests the input dropout rate, but
    // dropout_hidden is used here -- verify against upstream sources.
    float negate_input_dropout = 1. - dropout_hidden;
    if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
    for (auto&& i : w.hidden_kept)
      w.weights_batch[0][index][i] += w.error_hidden[i] * negate_input_dropout;
  }

  // End if not at the end of the batch
  if (++w.batch < batch_size) return;
  w.batch = 0;

  // Update hidden weights. The last row of each matrix is the bias and is
  // excluded from L2 regularization.
  if (!network.weights[0].empty())
    for (int i = 0; i < 2; i++) {
      for (unsigned j = 0; j < w.weights_batch[i].size(); j++)
        if (!w.weights_batch[i][j].empty()) {
          for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++)
            network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
          w.weights_batch[i][j].clear();
        }
    }

  // Update embedding weights using error_embedding
  for (unsigned i = 0; i < embeddings.size(); i++) {
    for (auto&& id : w.error_embedding_nonempty[i]) {
      if (TRAINER::need_trainer_data) {
        // Lazily grow the per-embedding optimizer state.
        if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1);
        if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1);
        if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension);
      }
      float* embedding = embeddings[i].weight(id);
      for (unsigned j = 0; j < embeddings[i].dimension; j++)
        embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j];
      w.error_embedding[i][id].clear();
    }
    w.error_embedding_nonempty[i].clear();
  }

  // Maxnorm regularize the updated weights
  if (maxnorm_regularization) maxnorm_regularize();
}
15553 
backpropagate(vector<embedding> & embeddings,const vector<const vector<int> * > & embedding_ids_sequences,unsigned required_outcome,workspace & w)15554 void neural_network_trainer::backpropagate(vector<embedding>& embeddings, const vector<const vector<int>*>& embedding_ids_sequences, unsigned required_outcome, workspace& w) {
15555   steps++;
15556 
15557   switch (trainer.algorithm) {
15558     case network_trainer::SGD:
15559       backpropagate_template<trainer_sgd>(embeddings, embedding_ids_sequences, required_outcome, w);
15560       return;
15561     case network_trainer::SGD_MOMENTUM:
15562       backpropagate_template<trainer_sgd_momentum>(embeddings, embedding_ids_sequences, required_outcome, w);
15563       return;
15564     case network_trainer::ADAGRAD:
15565       backpropagate_template<trainer_adagrad>(embeddings, embedding_ids_sequences, required_outcome, w);
15566       return;
15567     case network_trainer::ADADELTA:
15568       backpropagate_template<trainer_adadelta>(embeddings, embedding_ids_sequences, required_outcome, w);
15569       return;
15570     case network_trainer::ADAM:
15571       float original_learning_rate = trainer.learning_rate;
15572       trainer.learning_rate *= sqrt(1-pow(trainer.momentum2, steps)) / (1-pow(trainer.momentum, steps));
15573       backpropagate_template<trainer_adam>(embeddings, embedding_ids_sequences, required_outcome, w);
15574       trainer.learning_rate = original_learning_rate;
15575       return;
15576   }
15577 
15578   training_failure("Internal error, unsupported trainer!");
15579 }
15580 
l1_regularize()15581 void neural_network_trainer::l1_regularize() {
15582   if (!l1_regularization) return;
15583 
15584   for (auto&& weights : network.weights)
15585     for (unsigned i = 0; i + 1 /*ignore biases*/ < weights.size(); i++) {
15586       auto& row = weights[i];
15587       for (auto&& weight : row)
15588         if (weight < l1_regularization) weight += l1_regularization;
15589         else if (weight > l1_regularization) weight -= l1_regularization;
15590         else weight = 0;
15591     }
15592 }
15593 
maxnorm_regularize()15594 void neural_network_trainer::maxnorm_regularize() {
15595   if (!maxnorm_regularization) return;
15596 
15597   for (unsigned i = 0; i < 2; i++)
15598     for (unsigned j = 0; j < network.weights[i].front().size(); j++) {
15599       float length = 0;
15600       for (auto&& row : network.weights[i])
15601         length += row[j] * row[j];
15602 
15603       if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) {
15604         float factor = 1 / sqrt(length / (maxnorm_regularization * maxnorm_regularization));
15605         for (auto&& row : network.weights[i])
15606           row[j] *= factor;
15607       }
15608     }
15609 }
15610 
finalize_sentence()15611 void neural_network_trainer::finalize_sentence() {
15612   if (l1_regularization) l1_regularize();
15613 }
15614 
save_matrix(const vector<vector<float>> & m,binary_encoder & enc) const15615 void neural_network_trainer::save_matrix(const vector<vector<float>>& m, binary_encoder& enc) const {
15616   enc.add_4B(m.size());
15617   enc.add_4B(m.empty() ? 0 : m.front().size());
15618 
15619   for (auto&& row : m) {
15620     assert(row.size() == m.front().size());
15621     enc.add_data(row);
15622   }
15623 }
15624 
save_network(binary_encoder & enc) const15625 void neural_network_trainer::save_network(binary_encoder& enc) const {
15626   enc.add_1B(network.hidden_layer_activation);
15627   save_matrix(network.weights[0], enc);
15628   save_matrix(network.weights[1], enc);
15629 }
15630 
15631 } // namespace parsito
15632 
15633 /////////
15634 // File: parsito/transition/transition.h
15635 /////////
15636 
15637 // This file is part of Parsito <http://github.com/ufal/parsito/>.
15638 //
15639 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15640 // Mathematics and Physics, Charles University in Prague, Czech Republic.
15641 //
15642 // This Source Code Form is subject to the terms of the Mozilla Public
15643 // License, v. 2.0. If a copy of the MPL was not distributed with this
15644 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
15645 
15646 namespace parsito {
15647 
// Abstract transition class
//
// A transition is one elementary parser action transforming a parsing
// configuration (e.g. shift, left-arc, right-arc, swap).
class transition {
 public:
  virtual ~transition() {}

  // Returns true iff this transition may be performed in conf.
  virtual bool applicable(const configuration& conf) const = 0;
  // Performs the transition. Returns the index of the node that was linked
  // (received a head), or a negative value when no node was linked -- see the
  // usage in parser_nn::parse_greedy.
  virtual int perform(configuration& conf) const = 0;
};
15656 
// Specific transition classes

// Left-arc transition parameterized by a dependency label.
// NOTE(review): which configuration positions form the arc is defined in the
// implementation file, not visible here.
class transition_left_arc : public transition {
 public:
  transition_left_arc(const string& label) : label(label), label_is_root(label == "root") {}

  virtual bool applicable(const configuration& conf) const override;
  virtual int perform(configuration& conf) const override;
 private:
  string label;
  // Cached (label == "root") -- presumably used by applicability checks in
  // single-root mode; confirm in the implementation.
  bool label_is_root;
};
15668 
// Right-arc transition parameterized by a dependency label.
class transition_right_arc : public transition {
 public:
  transition_right_arc(const string& label) : label(label), label_is_root(label == "root") {}

  virtual bool applicable(const configuration& conf) const override;
  virtual int perform(configuration& conf) const override;
 private:
  string label;
  // Cached (label == "root") comparison, mirroring transition_left_arc.
  bool label_is_root;
};
15679 
// Shift transition (unlabeled; creates no arc, so perform presumably returns
// a negative value -- confirm in the implementation).
class transition_shift : public transition {
 public:
  virtual bool applicable(const configuration& conf) const override;
  virtual int perform(configuration& conf) const override;
};
15685 
// Swap transition (unlabeled), used by transition systems that can handle
// non-projective trees.
class transition_swap : public transition {
 public:
  virtual bool applicable(const configuration& conf) const override;
  virtual int perform(configuration& conf) const override;
};
15691 
// Left-arc-2 transition (a "link2" variant of left-arc), parameterized by a
// dependency label.
class transition_left_arc_2 : public transition {
 public:
  transition_left_arc_2(const string& label) : label(label), label_is_root(label == "root") {}

  virtual bool applicable(const configuration& conf) const override;
  virtual int perform(configuration& conf) const override;
 private:
  string label;
  // Cached (label == "root") comparison, mirroring transition_left_arc.
  bool label_is_root;
};
15702 
// Right-arc-2 transition (a "link2" variant of right-arc), parameterized by a
// dependency label.
class transition_right_arc_2 : public transition {
 public:
  transition_right_arc_2(const string& label) : label(label), label_is_root(label == "root") {}

  virtual bool applicable(const configuration& conf) const override;
  virtual int perform(configuration& conf) const override;
 private:
  string label;
  // Cached (label == "root") comparison, mirroring transition_left_arc.
  bool label_is_root;
};
15713 
15714 } // namespace parsito
15715 
15716 /////////
15717 // File: parsito/transition/transition_oracle.h
15718 /////////
15719 
15720 // This file is part of Parsito <http://github.com/ufal/parsito/>.
15721 //
15722 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15723 // Mathematics and Physics, Charles University in Prague, Czech Republic.
15724 //
15725 // This Source Code Form is subject to the terms of the Mozilla Public
15726 // License, v. 2.0. If a copy of the MPL was not distributed with this
15727 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
15728 
15729 namespace parsito {
15730 
// Abstract training oracle. Given a gold tree it produces a tree_oracle,
// which for a parsing configuration tells the trainer which transition is
// correct and which one to actually follow (these may differ, e.g. for
// exploration-based oracles -- confirm in the concrete oracles).
class transition_oracle {
 public:
  virtual ~transition_oracle() {}

  struct predicted_transition {
    unsigned best;       // the best (gold) transition according to the oracle
    unsigned to_follow;  // the transition the trainer should actually perform

    predicted_transition(unsigned best, unsigned to_follow) : best(best), to_follow(to_follow) {}
  };

  // Oracle bound to one gold tree.
  class tree_oracle {
   public:
    virtual ~tree_oracle() {}

    // Predicts the transition for conf; network_outcome is the network's
    // suggestion and iteration the current training pass.
    virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const = 0;
    // Fills 'transitions' with the transitions worth scoring in conf.
    virtual void interesting_transitions(const configuration& conf, vector<unsigned>& transitions) const = 0;
  };

  virtual unique_ptr<tree_oracle> create_tree_oracle(const tree& gold) const = 0;
};
15752 
15753 } // namespace parsito
15754 
15755 /////////
15756 // File: parsito/transition/transition_system.h
15757 /////////
15758 
15759 // This file is part of Parsito <http://github.com/ufal/parsito/>.
15760 //
15761 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15762 // Mathematics and Physics, Charles University in Prague, Czech Republic.
15763 //
15764 // This Source Code Form is subject to the terms of the Mozilla Public
15765 // License, v. 2.0. If a copy of the MPL was not distributed with this
15766 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
15767 
15768 namespace parsito {
15769 
// A transition system: the inventory of transitions available to the parser,
// together with a factory for its training oracles.
class transition_system {
 public:
  virtual ~transition_system() {}

  // Number of distinct transitions in this system.
  virtual unsigned transition_count() const;
  // Whether the given transition index may be performed in conf.
  virtual bool applicable(const configuration& conf, unsigned transition) const;
  // Performs the transition; returns the linked node or a negative value
  // (see transition::perform).
  virtual int perform(configuration& conf, unsigned transition) const;
  // Creates the named oracle for this system (caller owns the result).
  virtual transition_oracle* oracle(const string& name) const = 0;

  // Factory creating a transition system by name; parser_nn::load treats a
  // null result as failure.
  static transition_system* create(const string& name, const vector<string>& labels);

 protected:
  transition_system(const vector<string>& labels) : labels(labels) {}

  // NOTE: stored by reference -- the labels must outlive the system.
  const vector<string>& labels;
  vector<unique_ptr<transition>> transitions;
};
15787 
15788 } // namespace parsito
15789 
15790 /////////
15791 // File: parsito/parser/parser_nn.h
15792 /////////
15793 
15794 // This file is part of Parsito <http://github.com/ufal/parsito/>.
15795 //
15796 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15797 // Mathematics and Physics, Charles University in Prague, Czech Republic.
15798 //
15799 // This Source Code Form is subject to the terms of the Mozilla Public
15800 // License, v. 2.0. If a copy of the MPL was not distributed with this
15801 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
15802 
15803 namespace parsito {
15804 
// Transition-based dependency parser scored by a feed-forward neural network.
class parser_nn : public parser {
 public:
  // 'versioned' selects whether the serialized model begins with a version
  // byte (model name "nn_versioned") or uses the legacy format ("nn").
  parser_nn(bool versioned);

  // Parses tree t in place; beam_size <= 1 selects greedy decoding. When
  // cost is non-null it receives the decoder's log-probability based score.
  virtual void parse(tree& t, unsigned beam_size = 0, double* cost = nullptr) const override;

 protected:
  // Deserializes the model; 'cache' bounds the embeddings cache size.
  virtual void load(binary_decoder& data, unsigned cache) override;

 private:
  friend class parser_nn_trainer;
  void parse_greedy(tree& t, double* cost) const;
  void parse_beam_search(tree& t, unsigned beam_size, double* cost) const;

  bool versioned;    // model format contains a leading version byte
  unsigned version;  // loaded model version, 1..VERSION_LATEST
  bool single_root;  // only one root allowed (read from models of version >= 2)
  enum { VERSION_LATEST = 2 };

  vector<string> labels;                 // dependency relation labels
  unique_ptr<transition_system> system;  // transition system over 'labels'

  node_extractor nodes;  // selects configuration nodes fed to the network

  vector<value_extractor> values;  // extract string values from a node
  vector<embedding> embeddings;    // one embedding table per value extractor

  neural_network network;
  neural_network::embeddings_cache embeddings_cache;  // precomputed lookup cache

  // Per-call scratch data, pooled in 'workspaces' so parsing reuses buffers.
  struct workspace {
    workspace(bool single_root) : conf(single_root) {}

    configuration conf;

    string word, word_buffer;
    vector<vector<int>> embeddings;            // embedding ids per node per value
    vector<vector<string>> embeddings_values;  // extracted values, used to detect changes

    vector<int> extracted_nodes;
    vector<const vector<int>*> extracted_embeddings;

    vector<float> outcomes, network_buffer;

    // Beam-size structures
    struct beam_size_configuration {
      beam_size_configuration(bool single_root) : conf(single_root) {}

      configuration conf;
      vector<int> heads;       // snapshot of per-node heads of this hypothesis
      vector<string> deprels;  // snapshot of per-node deprels of this hypothesis
      double cost;             // average log-probability of performed transitions

      void refresh_tree();  // restore heads/deprels into the shared tree
      void save_tree();     // snapshot the shared tree into heads/deprels
    };
    struct beam_size_alternative {
      const beam_size_configuration* bs_conf;
      int transition;  // transition to perform, or -1 for an already-final hypothesis
      double cost;
      // Inverted comparison: the heap keeps the LOWEST cost on top so the
      // worst alternative can be evicted when the beam is full.
      bool operator<(const beam_size_alternative& other) const { return cost > other.cost; }

      beam_size_alternative(const beam_size_configuration* bs_conf, int transition, double cost)
          : bs_conf(bs_conf), transition(transition), cost(cost) {}
    };
    vector<beam_size_configuration> bs_confs[2]; size_t bs_confs_size[2];  // double-buffered beams
    vector<beam_size_alternative> bs_alternatives;
  };
  mutable threadsafe_stack<workspace> workspaces;
};
15875 
15876 } // namespace parsito
15877 
15878 /////////
15879 // File: parsito/parser/parser.cpp
15880 /////////
15881 
15882 // This file is part of Parsito <http://github.com/ufal/parsito/>.
15883 //
15884 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15885 // Mathematics and Physics, Charles University in Prague, Czech Republic.
15886 //
15887 // This Source Code Form is subject to the terms of the Mozilla Public
15888 // License, v. 2.0. If a copy of the MPL was not distributed with this
15889 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
15890 
15891 namespace parsito {
15892 
load(const char * file,unsigned cache)15893 parser* parser::load(const char* file, unsigned cache) {
15894   ifstream in(file, ifstream::in | ifstream::binary);
15895   if (!in.is_open()) return nullptr;
15896   return load(in, cache);
15897 }
15898 
load(istream & in,unsigned cache)15899 parser* parser::load(istream& in, unsigned cache) {
15900   unique_ptr<parser> result;
15901 
15902   binary_decoder data;
15903   if (!compressor::load(in, data)) return nullptr;
15904 
15905   try {
15906     string name;
15907     data.next_str(name);
15908 
15909     result.reset(create(name));
15910     if (!result) return nullptr;
15911 
15912     result->load(data, cache);
15913   } catch (binary_decoder_error&) {
15914     return nullptr;
15915   }
15916 
15917   return result && data.is_end() ? result.release() : nullptr;
15918 }
15919 
create(const string & name)15920 parser* parser::create(const string& name) {
15921   if (name == "nn") return new parser_nn(false);
15922   if (name == "nn_versioned") return new parser_nn(true);
15923   return nullptr;
15924 }
15925 
15926 } // namespace parsito
15927 
15928 /////////
15929 // File: parsito/parser/parser_nn.cpp
15930 /////////
15931 
15932 // This file is part of Parsito <http://github.com/ufal/parsito/>.
15933 //
15934 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15935 // Mathematics and Physics, Charles University in Prague, Czech Republic.
15936 //
15937 // This Source Code Form is subject to the terms of the Mozilla Public
15938 // License, v. 2.0. If a copy of the MPL was not distributed with this
15939 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
15940 
15941 namespace parsito {
15942 
// Versions:
// 1: initial version
// 2: add ReLU activation function

// Constructs the parser. 'versioned' records whether the serialized model
// begins with a version byte (model "nn_versioned") or uses the legacy
// version-less format (model "nn"); see parser_nn::load.
parser_nn::parser_nn(bool versioned) : versioned(versioned) {}
15948 
parse(tree & t,unsigned beam_size,double * cost) const15949 void parser_nn::parse(tree& t, unsigned beam_size, double* cost) const {
15950   if (beam_size > 1)
15951     parse_beam_search(t, beam_size, cost);
15952   else
15953     parse_greedy(t, cost);
15954 }
15955 
// Greedy decoding: repeatedly choose the highest-scoring applicable
// transition until the configuration is final.
//
// Parameters:
//   t    - tree to parse; heads/deprels are set by the performed transitions
//   cost - if non-null, receives the sum of log-probabilities of the chosen
//          transitions, normalized per word (see the end of the function)
void parser_nn::parse_greedy(tree& t, double* cost) const {
  assert(system);
  if (cost) *cost = 0.;

  // Retrieve or create workspace (pooled via threadsafe_stack so repeated
  // calls reuse allocated buffers)
  workspace* w = workspaces.pop();
  if (!w) w = new workspace(single_root);

  // Create configuration
  w->conf.init(&t);

  // Compute embeddings of all nodes (one embedding id per value extractor)
  if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size());
  for (size_t i = 0; i < t.nodes.size(); i++) {
    if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size());
    for (size_t j = 0; j < embeddings.size(); j++) {
      values[j].extract(t.nodes[i], w->word);
      w->embeddings[i][j] = embeddings[j].lookup_word(w->word, w->word_buffer);
    }
  }

  // Compute which transitions to perform and perform them
  int transitions = 0;
  for (; !w->conf.final(); transitions++) {
    // Extract nodes from the configuration (negative index = missing node)
    nodes.extract(w->conf, w->extracted_nodes);
    w->extracted_embeddings.resize(w->extracted_nodes.size());
    for (size_t i = 0; i < w->extracted_nodes.size(); i++)
      w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr;

    // Classify using neural network. NOTE(review): the final flag appears to
    // request proper probabilities only when a cost is wanted -- confirm in
    // neural_network::propagate.
    network.propagate(embeddings, w->extracted_embeddings, w->network_buffer, w->outcomes, &embeddings_cache, cost ? true : false);

    // Find most probable applicable transition
    int best = -1;
    for (unsigned i = 0; i < w->outcomes.size(); i++)
      if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best]))
        best = i;

    // Perform the best transition
    int child = system->perform(w->conf, best);
    if (cost) *cost += log(w->outcomes[best]);

    // If a node was linked, recompute its embeddings as deprel has changed
    if (child >= 0)
      for (size_t i = 0; i < embeddings.size(); i++) {
        values[i].extract(t.nodes[child], w->word);
        w->embeddings[child][i] = embeddings[i].lookup_word(w->word, w->word_buffer);
      }
  }

  // Rescale the total log-probability from per-transition to per-word
  // (t.nodes.size() - 1 excludes the artificial root node).
  if (cost && transitions)
    *cost = *cost / transitions * (t.nodes.size() - 1);

  // Store workspace back into the pool
  workspaces.push(w);
}
16013 
// Beam-search decoding: keeps up to beam_size partial parses (hypotheses) and
// expands all of them each iteration, keeping the globally best beam_size
// alternatives. Hypotheses share the single tree t, so each hypothesis
// snapshots its heads/deprels (save_tree) and restores them before being
// extended (refresh_tree).
//
// Parameters:
//   t         - tree to parse; on return holds the best hypothesis
//   beam_size - number of hypotheses kept (> 1)
//   cost      - if non-null, receives the best hypothesis' average
//               log-probability scaled by the number of words
void parser_nn::parse_beam_search(tree& t, unsigned beam_size, double* cost) const {
  assert(system);

  // Retrieve or create workspace (pooled via threadsafe_stack)
  workspace* w = workspaces.pop();
  if (!w) w = new workspace(single_root);

  // Allocate and initialize configuration. The two buffers in bs_confs are
  // alternated: iteration reads [iteration & 1] and writes [(iteration+1) & 1].
  for (int i = 0; i < 2; i++) {
    while (w->bs_confs[i].size() < beam_size) w->bs_confs[i].emplace_back(single_root);
    while (w->bs_confs[i].size() > beam_size) w->bs_confs[i].pop_back();
    w->bs_confs_size[i] = 0;
  }
  w->bs_confs[0][0].cost = 0;
  w->bs_confs[0][0].conf.init(&t);
  w->bs_confs[0][0].save_tree();
  w->bs_confs_size[0] = 1;

  // Compute embeddings of all nodes, remembering the extracted values so that
  // later only changed values trigger a new embedding lookup
  if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size());
  if (w->embeddings_values.size() < t.nodes.size()) w->embeddings_values.resize(t.nodes.size());
  for (size_t i = 0; i < t.nodes.size(); i++) {
    if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size());
    if (w->embeddings_values[i].size() < embeddings.size()) w->embeddings_values[i].resize(embeddings.size());
    for (size_t j = 0; j < embeddings.size(); j++) {
      values[j].extract(t.nodes[i], w->embeddings_values[i][j]);
      w->embeddings[i][j] = embeddings[j].lookup_word(w->embeddings_values[i][j], w->word_buffer);
    }
  }

  // Compute which transitions to perform and perform them
  size_t iteration = 0;
  for (bool all_final = false; !all_final; iteration++) {
    all_final = true;
    // bs_alternatives is a heap whose top is the LOWEST-cost kept alternative
    // (see beam_size_alternative::operator<), so the worst one can be evicted
    // when the beam is full.
    w->bs_alternatives.clear();

    for (size_t c = 0; c < w->bs_confs_size[iteration & 1]; c++) {
      auto& bs_conf = w->bs_confs[iteration & 1][c];

      // Final hypotheses are carried over unchanged (transition -1).
      if (bs_conf.conf.final()) {
        if (w->bs_alternatives.size() == beam_size) {
          if (bs_conf.cost <= w->bs_alternatives[0].cost) continue;
          pop_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
          w->bs_alternatives.pop_back();
        }
        w->bs_alternatives.emplace_back(&bs_conf, -1, bs_conf.cost);
        push_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
        continue;
      }
      all_final = false;

      bs_conf.refresh_tree();
      // Update embeddings for all nodes whose extracted values changed
      for (size_t i = 0; i < t.nodes.size(); i++)
        for (size_t j = 0; j < embeddings.size(); j++) {
          values[j].extract(t.nodes[i], w->word);
          if (w->word != w->embeddings_values[i][j]) {
            w->embeddings[i][j] = embeddings[j].lookup_word(w->word, w->word_buffer);
            w->embeddings_values[i][j].assign(w->word);
          }
        }

      // Extract nodes from the configuration
      nodes.extract(bs_conf.conf, w->extracted_nodes);
      w->extracted_embeddings.resize(w->extracted_nodes.size());
      for (size_t i = 0; i < w->extracted_nodes.size(); i++)
        w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr;

      // Classify using neural network
      network.propagate(embeddings, w->extracted_embeddings, w->network_buffer, w->outcomes, &embeddings_cache);

      // Store all alternatives; the cost is a running average of transition
      // log-probabilities, extended incrementally with each new transition.
      for (unsigned i = 0; i < w->outcomes.size(); i++)
        if (system->applicable(bs_conf.conf, i)) {
          double cost = (bs_conf.cost * iteration + log(w->outcomes[i])) / (iteration + 1);
          if (w->bs_alternatives.size() == beam_size) {
            if (cost <= w->bs_alternatives[0].cost) continue;
            pop_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
            w->bs_alternatives.pop_back();
          }
          w->bs_alternatives.emplace_back(&bs_conf, i, cost);
          push_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
        }
    }

    // Materialize the kept alternatives into the other buffer, performing the
    // chosen transition on a copy of the parent hypothesis.
    w->bs_confs_size[(iteration + 1) & 1] = 0;
    for (auto&& alternative : w->bs_alternatives) {
      auto& bs_conf_new = w->bs_confs[(iteration + 1) & 1][w->bs_confs_size[(iteration + 1) & 1]++];
      bs_conf_new = *alternative.bs_conf;
      bs_conf_new.cost = alternative.cost;
      if (alternative.transition >= 0) {
        bs_conf_new.refresh_tree();
        system->perform(bs_conf_new.conf, alternative.transition);
        bs_conf_new.save_tree();
      }
    }
  }

  // Return the best tree (after the final increment, iteration & 1 indexes
  // the buffer that received the last batch of hypotheses)
  size_t best = 0;
  for (size_t i = 1; i < w->bs_confs_size[iteration & 1]; i++)
    if (w->bs_confs[iteration & 1][i].cost > w->bs_confs[iteration & 1][best].cost)
      best = i;
  w->bs_confs[iteration & 1][best].refresh_tree();

  // Scale the average log-probability by the number of words (excluding root)
  if (cost) *cost = w->bs_confs[iteration & 1][best].cost * (t.nodes.size() - 1);

  // Store workspace back into the pool
  workspaces.push(w);
}
16124 
refresh_tree()16125 void parser_nn::workspace::beam_size_configuration::refresh_tree() {
16126   for (auto&& node : conf.t->nodes) node.children.clear();
16127   for (size_t i = 0; i < conf.t->nodes.size(); i++) {
16128     conf.t->nodes[i].head = heads[i];
16129     conf.t->nodes[i].deprel = deprels[i];
16130     if (heads[i] >= 0) conf.t->nodes[heads[i]].children.push_back(i);
16131   }
16132 }
16133 
save_tree()16134 void parser_nn::workspace::beam_size_configuration::save_tree() {
16135   if (conf.t->nodes.size() > heads.size()) heads.resize(conf.t->nodes.size());
16136   if (conf.t->nodes.size() > deprels.size()) deprels.resize(conf.t->nodes.size());
16137   for (size_t i = 0; i < conf.t->nodes.size(); i++) {
16138     heads[i] = conf.t->nodes[i].head;
16139     deprels[i] = conf.t->nodes[i].deprel;
16140   }
16141 }
16142 
// Deserializes the parser_nn model from 'data'.
//
// Layout (in order): optional version byte (only when 'versioned'), optional
// single_root byte (only for version >= 2), labels, transition system name,
// node extractor description, value extractor descriptions, embedding tables,
// and finally the neural network.
//
// Throws binary_decoder_error on malformed input. 'cache' bounds the size of
// the precomputed embeddings cache.
void parser_nn::load(binary_decoder& data, unsigned cache) {
  string description, error;

  version = versioned ? data.next_1B() : 1;
  if (!(version >= 1 && version <= VERSION_LATEST))
    throw binary_decoder_error("Unrecognized version of the parser_nn model");

  // Models before version 2 did not store the single-root flag.
  single_root = version >= 2 ? data.next_1B() : false;

  // Load labels
  labels.resize(data.next_2B());
  for (auto&& label : labels)
    data.next_str(label);

  // Load transition system
  string system_name;
  data.next_str(system_name);
  system.reset(transition_system::create(system_name, labels));
  if (!system) throw binary_decoder_error("Cannot load transition system");

  // Load node extractor
  data.next_str(description);
  if (!nodes.create(description, error))
    throw binary_decoder_error(error.c_str());

  // Load value extractors and embeddings
  values.resize(data.next_2B());
  for (auto&& value : values) {
    data.next_str(description);
    if (!value.create(description, error))
      throw binary_decoder_error(error.c_str());
  }

  // One embedding table per value extractor.
  embeddings.resize(values.size());
  for (auto&& embedding : embeddings)
    embedding.load(data);

  // Load the network and precompute the caches used at parse time.
  network.load(data);
  network.generate_tanh_cache();
  network.generate_embeddings_cache(embeddings, embeddings_cache, cache);
}
16185 
16186 } // namespace parsito
16187 
16188 /////////
16189 // File: parsito/parser/parser_nn_trainer.h
16190 /////////
16191 
16192 // This file is part of Parsito <http://github.com/ufal/parsito/>.
16193 //
16194 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16195 // Mathematics and Physics, Charles University in Prague, Czech Republic.
16196 //
16197 // This Source Code Form is subject to the terms of the Mozilla Public
16198 // License, v. 2.0. If a copy of the MPL was not distributed with this
16199 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
16200 
16201 namespace parsito {
16202 
// Trainer of parser_nn models: builds a neural-network transition-based
// parser from training trees and serializes the resulting model.
class parser_nn_trainer {
 public:
  // Train a model and encode it into `enc`.
  //   transition_system_name, transition_oracle_name: which transition system
  //     and oracle to use (must be recognized by transition_system::create
  //     and by the system's oracle() factory, otherwise training fails).
  //   single_root: when true, every training tree must have exactly one root
  //     node whose deprel is "root".
  //   embeddings_description, nodes_description: textual configuration of the
  //     embeddings and of the node extractor.
  //   parameters: neural network hyperparameters.
  //   number_of_threads: unused in this build -- training is single-threaded.
  //   train: training trees (must be nonempty); heldout: optional trees used
  //     for UAS/LAS evaluation and early stopping (may be empty).
  static void train(const string& transition_system_name, const string& transition_oracle_name, bool single_root,
                    const string& embeddings_description, const string& nodes_description, const network_parameters& parameters,
                    unsigned number_of_threads, const vector<tree>& train, const vector<tree>& heldout, binary_encoder& enc);
};
16209 
16210 } // namespace parsito
16211 
16212 /////////
16213 // File: parsito/parser/parser_nn_trainer.cpp
16214 /////////
16215 
16216 // This file is part of Parsito <http://github.com/ufal/parsito/>.
16217 //
16218 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16219 // Mathematics and Physics, Charles University in Prague, Czech Republic.
16220 //
16221 // This Source Code Form is subject to the terms of the Mozilla Public
16222 // License, v. 2.0. If a copy of the MPL was not distributed with this
16223 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
16224 
16225 namespace parsito {
16226 
train(const string & transition_system_name,const string & transition_oracle_name,bool single_root,const string & embeddings_description,const string & nodes_description,const network_parameters & parameters,unsigned,const vector<tree> & train,const vector<tree> & heldout,binary_encoder & enc)16227 void parser_nn_trainer::train(const string& transition_system_name, const string& transition_oracle_name, bool single_root,
16228                               const string& embeddings_description, const string& nodes_description, const network_parameters& parameters,
16229                               unsigned /*number_of_threads*/, const vector<tree>& train, const vector<tree>& heldout, binary_encoder& enc) {
16230   if (train.empty()) training_failure("No training data was given!");
16231 
16232   // Random generator with fixed seed for reproducibility
16233   mt19937 generator(42);
16234 
16235   // Check that all non-root nodes have heads and nonempty deprel
16236   for (auto&& tree : train)
16237     for (auto&& node : tree.nodes)
16238       if (node.id) {
16239         if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!");
16240         if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!");
16241       }
16242 
16243   // Create parser instance to be trained
16244   parser_nn parser(true); parser.version = parser_nn::VERSION_LATEST;
16245 
16246   // Generate labels for transition system
16247   unordered_set<string> labels_set;
16248   for (auto&& tree : train)
16249     for (auto&& node : tree.nodes)
16250       if (node.id && !labels_set.count(node.deprel)) {
16251         labels_set.insert(node.deprel);
16252         parser.labels.push_back(node.deprel);
16253       }
16254 
16255   // If single_root, check that exactly root nodes have "root" deprel
16256   if (single_root) {
16257     for (auto&& tree : train) {
16258       unsigned roots = 0;
16259       for (auto&& node : tree.nodes)
16260         if (node.id) {
16261           if (node.head == 0 && node.deprel != "root")
16262             training_failure("When single root is required, every root node must have 'root' deprel!");
16263           if (node.head != 0 && node.deprel == "root")
16264             training_failure("When single root is required, any non-root cannot have 'root' deprel!");
16265           roots += node.head == 0;
16266         }
16267       if (roots != 1)
16268         training_failure("When single root is required, every training tree must have single root!");
16269     }
16270 
16271     // Make sure (in case input is really small) there is "root" deprel plus another one
16272     if (!labels_set.count("root"))
16273       training_failure("When single root is required, the deprel 'root' must be present!");
16274     if (labels_set.size() <= 1)
16275       training_failure("When single root is required, deprel different from 'root' must exist!");
16276   }
16277 
16278   // Create transition system and transition oracle
16279   parser.system.reset(transition_system::create(transition_system_name, parser.labels));
16280   if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!");
16281 
16282   unique_ptr<transition_oracle> oracle(parser.system->oracle(transition_oracle_name));
16283   if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!");
16284 
16285   // Create node_extractor
16286   string error;
16287   if (!parser.nodes.create(nodes_description, error)) training_failure(error);
16288 
16289   // Load value_extractors and embeddings
16290   vector<string> value_names;
16291   vector<string_piece> lines, tokens;
16292   split(embeddings_description, '\n', lines);
16293   for (auto&& line : lines) {
16294     // Ignore empty lines and comments
16295     if (!line.len || line.str[0] == '#') continue;
16296 
16297     split(line, ' ', tokens);
16298     if (!(tokens.size() >= 3 && tokens.size() <= 6))
16299       training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!");
16300 
16301     value_names.emplace_back(string(tokens[0].str, tokens[0].len));
16302     parser.values.emplace_back();
16303     if (!parser.values.back().create(tokens[0], error)) training_failure(error);
16304 
16305     int dimension = parse_int(tokens[1], "embedding dimension");
16306     int min_count = parse_int(tokens[2], "minimum frequency count");
16307     unsigned updatable_index = 0;
16308     unsigned embeddings_from_file = 0;
16309     string embeddings_from_file_comment;
16310     vector<pair<string, vector<float>>> weights;
16311     unordered_set<string> weights_set;
16312 
16313     // Compute words and counts present in the training data
16314     string word;
16315     unordered_map<string, int> word_counts;
16316     for (auto&& tree : train)
16317       for (auto&& node : tree.nodes)
16318         if (node.id) {
16319           parser.values.back().extract(node, word);
16320           word_counts[word]++;
16321         }
16322 
16323     // Load embedding if it was given
16324     if (tokens.size() >= 4) {
16325       int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1;
16326       int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits<int>::max();
16327       ifstream in(string(tokens[3].str, tokens[3].len));
16328       if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!");
16329 
16330       // Load first line containing dictionary size and dimensions
16331       string line;
16332       vector<string_piece> parts;
16333       if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!");
16334       split(line, ' ', parts);
16335       if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!");
16336       int file_dimension = parse_int(parts[1], "embedding file dimension");
16337 
16338       if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!");
16339 
16340       // Generate random projection when smaller dimension is required
16341       vector<vector<float>> projection;
16342       if (file_dimension > dimension) {
16343         embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]";
16344 
16345         uniform_real_distribution<double> uniform(0, 1);
16346         projection.resize(dimension);
16347         for (auto&& row : projection) {
16348           row.resize(file_dimension);
16349           for (auto&& weight : row) weight = uniform(generator);
16350 
16351           double sum = 0;
16352           for (auto&& weight : row) sum += weight;
16353           for (auto&& weight : row) weight /= sum;
16354         }
16355       }
16356 
16357       // Load input embedding
16358       vector<double> input_weights(file_dimension);
16359       vector<float> projected_weights(dimension);
16360       while (getline(in, line) && int(weights.size()) < max_embeddings) {
16361         split(line, ' ', parts);
16362         if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line
16363         if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]);
16364         for (int i = 0; i < file_dimension; i++)
16365           input_weights[i] = parse_double(parts[1 + i], "embedding weight");
16366 
16367         string word(parts[0].str, parts[0].len);
16368 
16369         // For update_weights == 2, ignore embeddings for unknown words
16370         if (update_weights == 2 && !word_counts.count(word))
16371           continue;
16372 
16373         for (int i = 0; i < dimension; i++)
16374           if (file_dimension == dimension) {
16375             projected_weights[i] = input_weights[i];
16376           } else {
16377             projected_weights[i] = 0;
16378             for (int j = 0; j < file_dimension; j++)
16379               projected_weights[i] += projection[i][j] * input_weights[j];
16380           }
16381 
16382         if (!weights_set.count(word)) {
16383           weights.emplace_back(word, projected_weights);
16384           weights_set.insert(word);
16385         }
16386       }
16387       embeddings_from_file = weights.size();
16388       updatable_index = update_weights ? 0 : embeddings_from_file;
16389     }
16390 
16391     // Add embedding for non-present word with min_count, sorted by count
16392     {
16393       vector<pair<int, string>> count_words;
16394       for (auto&& word_count : word_counts)
16395         if (word_count.second >= min_count && !weights_set.count(word_count.first))
16396           count_words.emplace_back(word_count.second, word_count.first);
16397 
16398       sort(count_words.rbegin(), count_words.rend());
16399 
16400       vector<float> word_weights(dimension);
16401       uniform_real_distribution<float> uniform(-1, 1);
16402       for (auto&& count_word : count_words) {
16403         for (auto&& word_weight : word_weights)
16404           word_weight = uniform(generator);
16405 
16406         weights.emplace_back(count_word.second, word_weights);
16407       }
16408     }
16409 
16410     // If there are unknown words in the training data, create initial embedding
16411     vector<float> unknown_weights(dimension);
16412     if (min_count > 1) {
16413       uniform_real_distribution<float> uniform(-1, 1);
16414 
16415       for (auto&& weight : unknown_weights)
16416         weight = uniform(generator);
16417     }
16418 
16419     // Add the embedding
16420     parser.embeddings.emplace_back();
16421     parser.embeddings.back().create(dimension, updatable_index, weights, unknown_weights);
16422 
16423     // Count the cover of this embedding
16424     string buffer;
16425     unsigned words_total = 0, words_covered = 0, words_covered_from_file = 0;
16426     for (auto&& tree : train)
16427       for (auto&& node : tree.nodes)
16428         if (node.id) {
16429           parser.values.back().extract(node, word);
16430           words_total++;
16431           int word_id = parser.embeddings.back().lookup_word(word, buffer);
16432           words_covered += word_id != parser.embeddings.back().unknown_word();
16433           words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file;
16434         }
16435 
16436     cerr << "Initialized '" << tokens[0] << "' embedding with " << embeddings_from_file << embeddings_from_file_comment
16437          << "," << weights.size() << " words and " << fixed << setprecision(1) << 100. * words_covered_from_file / words_total
16438          << "%," << 100. * words_covered / words_total << "% coverage." << endl;
16439   }
16440 
16441   // Train the network
16442   unsigned total_dimension = 0, total_nodes = 0;
16443   for (auto&& embedding : parser.embeddings) total_dimension += embedding.dimension;
16444   for (auto&& tree : train) total_nodes += tree.nodes.size() - 1;
16445   auto scaled_parameters = parameters;
16446   scaled_parameters.l1_regularization /= train.size();
16447   scaled_parameters.l2_regularization /= total_nodes;
16448   neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator);
16449 
16450   neural_network heldout_best_network;
16451   unsigned heldout_best_correct_labelled = 0, heldout_best_iteration = 0;
16452 
16453   vector<int> permutation;
16454   for (size_t i = 0; i < train.size(); i++)
16455     permutation.push_back(permutation.size());
16456 
16457   for (int iteration = 1; network_trainer.next_iteration(); iteration++) {
16458     // Train on training data
16459     shuffle(permutation.begin(), permutation.end(), generator);
16460 
16461     atomic<unsigned> atomic_index(0);
16462     atomic<double> atomic_logprob(0);
16463     auto training = [&]() {
16464       tree t;
16465       configuration conf(single_root);
16466       string word, word_buffer;
16467       vector<vector<int>> nodes_embeddings;
16468       vector<int> extracted_nodes;
16469       vector<const vector<int>*> extracted_embeddings;
16470       neural_network_trainer::workspace workspace;
16471       double logprob = 0;
16472 
16473       // Data for structured prediction
16474       tree t_eval;
16475       configuration conf_eval(single_root);
16476       vector<vector<int>> nodes_embeddings_eval;
16477       vector<int>  extracted_nodes_eval;
16478       vector<const vector<int>*>  extracted_embeddings_eval;
16479       vector<unsigned> transitions_eval;
16480       vector<float> hidden_layer_eval, outcomes_eval;
16481 
16482       for (unsigned current_index; (current_index = atomic_index++) < permutation.size();) {
16483         const tree& gold = train[permutation[current_index]];
16484         t = gold;
16485         t.unlink_all_nodes();
16486         conf.init(&t);
16487 
16488         // Compute embeddings
16489         if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size());
16490         for (size_t i = 0; i < t.nodes.size(); i++) {
16491           nodes_embeddings[i].resize(parser.embeddings.size());
16492           for (size_t j = 0; j < parser.embeddings.size(); j++) {
16493             parser.values[j].extract(t.nodes[i], word);
16494             nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer);
16495           }
16496         }
16497 
16498         // Create tree oracle
16499         auto tree_oracle = oracle->create_tree_oracle(gold);
16500 
16501         // Train the network
16502         while (!conf.final()) {
16503           // Extract nodes
16504           parser.nodes.extract(conf, extracted_nodes);
16505           extracted_embeddings.resize(extracted_nodes.size());
16506           for (size_t i = 0; i < extracted_nodes.size(); i++)
16507             extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr;
16508 
16509           // Propagate
16510           network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace);
16511 
16512           // Find most probable applicable transition
16513           int network_best = -1;
16514           for (unsigned i = 0; i < workspace.outcomes.size(); i++)
16515             if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best]))
16516               network_best = i;
16517 
16518           // Apply the oracle
16519           auto prediction = tree_oracle->predict(conf, network_best, iteration);
16520 
16521           // If the best transition is applicable, train on it
16522           if (parser.system->applicable(conf, prediction.best)) {
16523             // Update logprob
16524             if (workspace.outcomes[prediction.best])
16525               logprob += log(workspace.outcomes[prediction.best]);
16526 
16527             // Backpropagate the chosen outcome
16528             network_trainer.backpropagate(parser.embeddings, extracted_embeddings, prediction.best, workspace);
16529           }
16530 
16531           // Emergency break if the to_follow transition is not applicable
16532           if (!parser.system->applicable(conf, prediction.to_follow))
16533             break;
16534 
16535           // Follow the chosen outcome
16536           int child = parser.system->perform(conf, prediction.to_follow);
16537 
16538           // If a node was linked, recompute its embeddings as deprel has changed
16539           if (child >= 0)
16540             for (size_t i = 0; i < parser.embeddings.size(); i++) {
16541               parser.values[i].extract(t.nodes[child], word);
16542               nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
16543             }
16544         }
16545         network_trainer.finalize_sentence();
16546 
16547         // Structured prediction
16548         if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) {
16549           uniform_int_distribution<size_t> train_distribution(0, train.size() - 1);
16550           const tree& gold = train[train_distribution(generator)];
16551           t = gold;
16552           t.unlink_all_nodes();
16553           conf.init(&t);
16554 
16555           // Compute embeddings
16556           if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size());
16557           for (size_t i = 0; i < t.nodes.size(); i++) {
16558             nodes_embeddings[i].resize(parser.embeddings.size());
16559             for (size_t j = 0; j < parser.embeddings.size(); j++) {
16560               parser.values[j].extract(t.nodes[i], word);
16561               nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer);
16562             }
16563           }
16564 
16565           // Create tree oracle
16566           auto tree_oracle = oracle->create_tree_oracle(gold);
16567 
16568           // Train the network
16569           while (!conf.final()) {
16570             // Extract nodes
16571             parser.nodes.extract(conf, extracted_nodes);
16572             extracted_embeddings.resize(extracted_nodes.size());
16573             for (size_t i = 0; i < extracted_nodes.size(); i++)
16574               extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr;
16575 
16576             // Find the best transition
16577             int best = 0;
16578             int best_uas = -1;
16579             tree_oracle->interesting_transitions(conf, transitions_eval);
16580             for (auto&& transition : transitions_eval) {
16581               t_eval = t;
16582               conf_eval = conf;
16583               conf_eval.t = &t_eval;
16584               nodes_embeddings_eval = nodes_embeddings;
16585 
16586               // Perform probed transition
16587               int child = parser.system->perform(conf_eval, transition);
16588               if (child >= 0)
16589                 for (size_t i = 0; i < parser.embeddings.size(); i++) {
16590                   parser.values[i].extract(t_eval.nodes[child], word);
16591                   nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
16592                 }
16593 
16594               // Train the network
16595               while (!conf_eval.final()) {
16596                 // Extract nodes
16597                 parser.nodes.extract(conf_eval, extracted_nodes_eval);
16598                 extracted_embeddings_eval.resize(extracted_nodes_eval.size());
16599                 for (size_t i = 0; i < extracted_nodes_eval.size(); i++)
16600                   extracted_embeddings_eval[i] = extracted_nodes_eval[i] >= 0 ? &nodes_embeddings_eval[extracted_nodes_eval[i]] : nullptr;
16601 
16602                 // Classify using neural network
16603                 parser.network.propagate(parser.embeddings, extracted_embeddings_eval, hidden_layer_eval, outcomes_eval, nullptr, false);
16604 
16605                 // Find most probable applicable transition
16606                 int network_best = -1;
16607                 for (unsigned i = 0; i < outcomes_eval.size(); i++)
16608                   if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best]))
16609                     network_best = i;
16610 
16611                 // Perform the best transition
16612                 int child = parser.system->perform(conf_eval, network_best);
16613 
16614                 // If a node was linked, recompute its embeddings as deprel has changed
16615                 if (child >= 0)
16616                   for (size_t i = 0; i < parser.embeddings.size(); i++) {
16617                     parser.values[i].extract(t_eval.nodes[child], word);
16618                     nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
16619                   }
16620               }
16621 
16622               int uas = 0;
16623               for (unsigned i = 1; i < gold.nodes.size(); i++)
16624                 uas += gold.nodes[i].head == t_eval.nodes[i].head;
16625 
16626               if (uas > best_uas) best = transition, best_uas = uas;
16627             }
16628 
16629             // Propagate
16630             network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace);
16631 
16632             // Backpropagate for the best transition
16633             if (workspace.outcomes[best])
16634               logprob += log(workspace.outcomes[best]);
16635             network_trainer.backpropagate(parser.embeddings, extracted_embeddings, best, workspace);
16636 
16637             //              // Find most probable applicable transition when following network outcome
16638             //              int network_best = -1;
16639             //              for (unsigned i = 0; i < workspace.outcomes.size(); i++)
16640             //                if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best]))
16641             //                  network_best = i;
16642 
16643             // Follow the best outcome
16644             int child = parser.system->perform(conf, /*network_*/best);
16645 
16646             // If a node was linked, recompute its embeddings as deprel has changed
16647             if (child >= 0)
16648               for (size_t i = 0; i < parser.embeddings.size(); i++) {
16649                 parser.values[i].extract(t.nodes[child], word);
16650                 nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
16651               }
16652           }
16653           network_trainer.finalize_sentence();
16654         }
16655       }
16656       for (double old_atomic_logprob = atomic_logprob; atomic_logprob.compare_exchange_weak(old_atomic_logprob, old_atomic_logprob + logprob); ) {}
16657     };
16658 
16659     cerr << "Iteration " << iteration << ": ";
16660     training();
16661     cerr << "training logprob " << scientific << setprecision(4) << atomic_logprob;
16662 
16663     // Evaluate heldout data if present
16664     if (!heldout.empty()) {
16665       tree t;
16666       unsigned total = 0, correct_unlabelled = 0, correct_labelled = 0;
16667       for (auto&& gold : heldout) {
16668         t = gold;
16669         t.unlink_all_nodes();
16670         parser.parse(t);
16671         for (size_t i = 1; i < t.nodes.size(); i++) {
16672           total++;
16673           correct_unlabelled += t.nodes[i].head == gold.nodes[i].head;
16674           correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel;
16675         }
16676       }
16677 
16678       cerr << ", heldout UAS " << fixed << setprecision(2) << (100. * correct_unlabelled / total) << "%, LAS " << (100. * correct_labelled / total) << "%";
16679 
16680       if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) {
16681         heldout_best_network = parser.network;
16682         heldout_best_correct_labelled = correct_labelled;
16683         heldout_best_iteration = iteration;
16684       }
16685     }
16686 
16687     cerr << endl;
16688   }
16689 
16690   if (parameters.early_stopping && heldout_best_iteration > 0) {
16691     cerr << "Using early stopping -- choosing network from iteration " << heldout_best_iteration << endl;
16692     parser.network = heldout_best_network;
16693   }
16694 
16695   // Encode version
16696   enc.add_1B(parser.version);
16697 
16698   // Encode single_root
16699   enc.add_1B(single_root);
16700 
16701   // Encode transition system
16702   enc.add_2B(parser.labels.size());
16703   for (auto&& label : parser.labels)
16704     enc.add_str(label);
16705   enc.add_str(transition_system_name);
16706 
16707   // Encode nodes selector
16708   enc.add_str(nodes_description);
16709 
16710   // Encode value extractors and embeddings
16711   enc.add_2B(value_names.size());
16712   for (auto&& value_name : value_names)
16713     enc.add_str(value_name);
16714   for (auto&& embedding : parser.embeddings)
16715     embedding.save(enc);
16716 
16717   // Encode the network
16718   network_trainer.save_network(enc);
16719 }
16720 
16721 } // namespace parsito
16722 
16723 /////////
16724 // File: parsito/transition/transition.cpp
16725 /////////
16726 
16727 // This file is part of Parsito <http://github.com/ufal/parsito/>.
16728 //
16729 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16730 // Mathematics and Physics, Charles University in Prague, Czech Republic.
16731 //
16732 // This Source Code Form is subject to the terms of the Mozilla Public
16733 // License, v. 2.0. If a copy of the MPL was not distributed with this
16734 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
16735 
16736 namespace parsito {
16737 
16738 // Left arc
applicable(const configuration & conf) const16739 bool transition_left_arc::applicable(const configuration& conf) const {
16740   if (conf.single_root && label_is_root)
16741     return false;
16742   else
16743     return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2];
16744 }
16745 
perform(configuration & conf) const16746 int transition_left_arc::perform(configuration& conf) const {
16747   assert(applicable(conf));
16748 
16749   int parent = conf.stack.back(); conf.stack.pop_back();
16750   int child = conf.stack.back(); conf.stack.pop_back();
16751   conf.stack.push_back(parent);
16752   conf.t->set_head(child, parent, label);
16753   return child;
16754 }
16755 
16756 // Right arc
applicable(const configuration & conf) const16757 bool transition_right_arc::applicable(const configuration& conf) const {
16758   if (conf.single_root && label_is_root)
16759     return conf.stack.size() == 2 && conf.buffer.empty();
16760   else if (conf.single_root) // && !label_is_root
16761     return conf.stack.size() > 2;
16762   else
16763     return conf.stack.size() >= 2;
16764 }
16765 
perform(configuration & conf) const16766 int transition_right_arc::perform(configuration& conf) const {
16767   assert(applicable(conf));
16768 
16769   int child = conf.stack.back(); conf.stack.pop_back();
16770   int parent = conf.stack.back();
16771   conf.t->set_head(child, parent, label);
16772   return child;
16773 }
16774 
16775 // Shift
applicable(const configuration & conf) const16776 bool transition_shift::applicable(const configuration& conf) const {
16777   return !conf.buffer.empty();
16778 }
16779 
perform(configuration & conf) const16780 int transition_shift::perform(configuration& conf) const {
16781   assert(applicable(conf));
16782 
16783   conf.stack.push_back(conf.buffer.back());
16784   conf.buffer.pop_back();
16785   return -1;
16786 }
16787 
16788 // Swap
applicable(const configuration & conf) const16789 bool transition_swap::applicable(const configuration& conf) const {
16790   return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1];
16791 }
16792 
perform(configuration & conf) const16793 int transition_swap::perform(configuration& conf) const {
16794   assert(applicable(conf));
16795 
16796   int top = conf.stack.back(); conf.stack.pop_back();
16797   int to_buffer = conf.stack.back(); conf.stack.pop_back();
16798   conf.stack.push_back(top);
16799   conf.buffer.push_back(to_buffer);
16800   return -1;
16801 }
16802 
16803 // Left arc 2
applicable(const configuration & conf) const16804 bool transition_left_arc_2::applicable(const configuration& conf) const {
16805   if (conf.single_root && label_is_root)
16806     return false;
16807   else
16808     return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3];
16809 }
16810 
perform(configuration & conf) const16811 int transition_left_arc_2::perform(configuration& conf) const {
16812   assert(applicable(conf));
16813 
16814   int parent = conf.stack.back(); conf.stack.pop_back();
16815   int ignore = conf.stack.back(); conf.stack.pop_back();
16816   int child = conf.stack.back(); conf.stack.pop_back();
16817   conf.stack.push_back(ignore);
16818   conf.stack.push_back(parent);
16819   conf.t->set_head(child, parent, label);
16820   return child;
16821 }
16822 
16823 // Right arc 2
applicable(const configuration & conf) const16824 bool transition_right_arc_2::applicable(const configuration& conf) const {
16825   if (conf.single_root && label_is_root)
16826     return false;
16827   else if (conf.single_root) // && !label_is_root
16828     return conf.stack.size() >= 4;
16829   else
16830     return conf.stack.size() >= 3;
16831 }
16832 
perform(configuration & conf) const16833 int transition_right_arc_2::perform(configuration& conf) const {
16834   assert(applicable(conf));
16835 
16836   int child = conf.stack.back(); conf.stack.pop_back();
16837   int to_buffer = conf.stack.back(); conf.stack.pop_back();
16838   int parent = conf.stack.back();
16839   conf.buffer.push_back(to_buffer);
16840   conf.t->set_head(child, parent, label);
16841   return child;
16842 }
16843 
16844 } // namespace parsito
16845 
16846 /////////
16847 // File: parsito/transition/transition_system_link2.h
16848 /////////
16849 
16850 // This file is part of Parsito <http://github.com/ufal/parsito/>.
16851 //
16852 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16853 // Mathematics and Physics, Charles University in Prague, Czech Republic.
16854 //
16855 // This Source Code Form is subject to the terms of the Mozilla Public
16856 // License, v. 2.0. If a copy of the MPL was not distributed with this
16857 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
16858 
16859 namespace parsito {
16860 
// Transition system with shift, (left|right)_arc and (left|right)_arc_2
// transitions; the arc_2 variants allow a limited degree of non-projectivity.
class transition_system_link2 : public transition_system {
 public:
  // Builds the transition inventory for the given dependency relation labels.
  transition_system_link2(const vector<string>& labels);

  // Returns a newly allocated oracle ("static" is the only supported name),
  // or nullptr for an unknown name; the caller owns the result.
  virtual transition_oracle* oracle(const string& name) const override;
};
16867 
16868 } // namespace parsito
16869 
16870 /////////
16871 // File: parsito/transition/transition_system_projective.h
16872 /////////
16873 
16874 // This file is part of Parsito <http://github.com/ufal/parsito/>.
16875 //
16876 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16877 // Mathematics and Physics, Charles University in Prague, Czech Republic.
16878 //
16879 // This Source Code Form is subject to the terms of the Mozilla Public
16880 // License, v. 2.0. If a copy of the MPL was not distributed with this
16881 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
16882 
16883 namespace parsito {
16884 
// Arc-standard projective transition system with shift, left_arc and
// right_arc transitions.
class transition_system_projective : public transition_system {
 public:
  // Builds the transition inventory for the given dependency relation labels.
  transition_system_projective(const vector<string>& labels);

  // Returns a newly allocated oracle ("static" or "dynamic"), or nullptr
  // for an unknown name; the caller owns the result.
  virtual transition_oracle* oracle(const string& name) const override;
};
16891 
16892 } // namespace parsito
16893 
16894 /////////
16895 // File: parsito/transition/transition_system_swap.h
16896 /////////
16897 
16898 // This file is part of Parsito <http://github.com/ufal/parsito/>.
16899 //
16900 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16901 // Mathematics and Physics, Charles University in Prague, Czech Republic.
16902 //
16903 // This Source Code Form is subject to the terms of the Mozilla Public
16904 // License, v. 2.0. If a copy of the MPL was not distributed with this
16905 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
16906 
16907 namespace parsito {
16908 
// Fully non-projective transition system with shift, swap, left_arc and
// right_arc transitions.
class transition_system_swap : public transition_system {
 public:
  // Builds the transition inventory for the given dependency relation labels.
  transition_system_swap(const vector<string>& labels);

  // Returns a newly allocated oracle ("static_eager" or "static_lazy"),
  // or nullptr for an unknown name; the caller owns the result.
  virtual transition_oracle* oracle(const string& name) const override;
};
16915 
16916 } // namespace parsito
16917 
16918 /////////
16919 // File: parsito/transition/transition_system.cpp
16920 /////////
16921 
16922 // This file is part of Parsito <http://github.com/ufal/parsito/>.
16923 //
16924 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16925 // Mathematics and Physics, Charles University in Prague, Czech Republic.
16926 //
16927 // This Source Code Form is subject to the terms of the Mozilla Public
16928 // License, v. 2.0. If a copy of the MPL was not distributed with this
16929 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
16930 
16931 namespace parsito {
16932 
transition_count() const16933 unsigned transition_system::transition_count() const {
16934   return transitions.size();
16935 }
16936 
applicable(const configuration & conf,unsigned transition) const16937 bool transition_system::applicable(const configuration& conf, unsigned transition) const {
16938   assert(transition < transitions.size());
16939 
16940   return transitions[transition]->applicable(conf);
16941 }
16942 
perform(configuration & conf,unsigned transition) const16943 int transition_system::perform(configuration& conf, unsigned transition) const {
16944   assert(transition < transitions.size());
16945 
16946   return transitions[transition]->perform(conf);
16947 }
16948 
create(const string & name,const vector<string> & labels)16949 transition_system* transition_system::create(const string& name, const vector<string>& labels) {
16950   if (name == "projective") return new transition_system_projective(labels);
16951   if (name == "swap") return new transition_system_swap(labels);
16952   if (name == "link2") return new transition_system_link2(labels);
16953   return nullptr;
16954 }
16955 
16956 } // namespace parsito
16957 
16958 /////////
16959 // File: parsito/transition/transition_system_link2.cpp
16960 /////////
16961 
16962 // This file is part of Parsito <http://github.com/ufal/parsito/>.
16963 //
16964 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16965 // Mathematics and Physics, Charles University in Prague, Czech Republic.
16966 //
16967 // This Source Code Form is subject to the terms of the Mozilla Public
16968 // License, v. 2.0. If a copy of the MPL was not distributed with this
16969 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
16970 
16971 namespace parsito {
16972 
transition_system_link2(const vector<string> & labels)16973 transition_system_link2::transition_system_link2(const vector<string>& labels) : transition_system(labels) {
16974   transitions.emplace_back(new transition_shift());
16975   for (auto&& label : labels) {
16976     transitions.emplace_back(new transition_left_arc(label));
16977     transitions.emplace_back(new transition_right_arc(label));
16978     transitions.emplace_back(new transition_left_arc_2(label));
16979     transitions.emplace_back(new transition_right_arc_2(label));
16980   }
16981 }
16982 
16983 // Static oracle
// Static oracle for the link2 system: deterministically follows the gold
// tree, producing an arc as soon as the gold edge is available on the stack.
class transition_system_link2_oracle_static : public transition_oracle {
 public:
  transition_system_link2_oracle_static(const vector<string>& labels) : labels(labels) {
    // Index of the "root" label; equals labels.size() when not present.
    for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
  }

  // Per-sentence oracle bound to one gold tree.
  class tree_oracle_static : public transition_oracle::tree_oracle {
   public:
    tree_oracle_static(const vector<string>& labels, unsigned root_label, const tree& gold) : labels(labels), root_label(root_label), gold(gold) {}
    virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override;
    virtual void interesting_transitions(const configuration& conf, vector<unsigned>& transitions) const override;
   private:
    const vector<string>& labels;
    unsigned root_label;
    // Gold-standard tree the oracle reproduces; referenced, not owned.
    const tree& gold;
  };

  virtual unique_ptr<tree_oracle> create_tree_oracle(const tree& gold) const override;
 private:
  const vector<string>& labels;
  unsigned root_label;
};
17006 
create_tree_oracle(const tree & gold) const17007 unique_ptr<transition_oracle::tree_oracle> transition_system_link2_oracle_static::create_tree_oracle(const tree& gold) const {
17008   return unique_ptr<transition_oracle::tree_oracle>(new tree_oracle_static(labels, root_label, gold));
17009 }
17010 
// Collects the transitions worth considering in the given configuration:
// shift (when the buffer is nonempty) plus every arc transition consistent
// with the gold tree and the single-root constraint.
void transition_system_link2_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector<unsigned>& transitions) const {
  transitions.clear();

  // Shift
  if (!conf.buffer.empty()) transitions.push_back(0);

  // Arcs. The four directions are left_arc, right_arc, left_arc_2 and
  // right_arc_2; parents/children give the stack offsets (from the top)
  // of the parent and child node for each direction.
  unsigned parents[4] = {1, 2, 1, 3};
  unsigned children[4] = {2, 1, 3, 1};
  for (int direction = 0; direction < 4; direction++)
    if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
      int parent = conf.stack[conf.stack.size() - parents[direction]];
      int child = conf.stack[conf.stack.size() - children[direction]];

      // Allow arc_2 only when seeing golden edge.
      if (direction >= 2 && gold.nodes[child].head != parent) continue;

      // Transitions are encoded as 1 + 4*label_index + direction (0 is
      // shift). In single-root mode the "root" label is only allowed for
      // the final right arc, and non-root labels must leave the artificial
      // root untouched (hence the stack-size requirements per direction).
      for (size_t i = 0; i < labels.size(); i++)
        if (gold.nodes[child].deprel == labels[i])
          if (!conf.single_root ||
              (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
              (i != root_label && conf.stack.size() > 2 && direction < 2) ||
              (i != root_label && conf.stack.size() > 3 && direction >= 2))
            transitions.push_back(1 + 4*i + direction);
    }
}
17037 
// Predicts the next transition by following the gold tree: an arc is made
// as soon as a gold edge is reachable on the stack and the child has
// already collected all of its gold children; otherwise the oracle shifts.
transition_oracle::predicted_transition transition_system_link2_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const {
  // Arcs: stack offsets (from the top) of parent and child for the four
  // arc directions (left_arc, right_arc, left_arc_2, right_arc_2).
  unsigned parents[4] = {1, 2, 1, 3};
  unsigned children[4] = {2, 1, 3, 1};
  for (int direction = 0; direction < 4; direction++)
    if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
      int parent = conf.stack[conf.stack.size() - parents[direction]];
      int child = conf.stack[conf.stack.size() - children[direction]];

      // The edge must match gold and the child must be complete, i.e. it
      // has already attached as many children as it has in the gold tree.
      if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
        for (size_t i = 0; i < labels.size(); i++)
          if (gold.nodes[child].deprel == labels[i])
            return predicted_transition(1 + 4*i + direction, 1 + 4*i + direction);

        assert(!"label was not found");
      }
    }

  // Otherwise, just shift
  return predicted_transition(0, 0);
}
17059 
17060 // Oracle factory method
oracle(const string & name) const17061 transition_oracle* transition_system_link2::oracle(const string& name) const {
17062   if (name == "static") return new transition_system_link2_oracle_static(labels);
17063   return nullptr;
17064 }
17065 
17066 } // namespace parsito
17067 
17068 /////////
17069 // File: parsito/transition/transition_system_projective.cpp
17070 /////////
17071 
17072 // This file is part of Parsito <http://github.com/ufal/parsito/>.
17073 //
17074 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17075 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17076 //
17077 // This Source Code Form is subject to the terms of the Mozilla Public
17078 // License, v. 2.0. If a copy of the MPL was not distributed with this
17079 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17080 
17081 namespace parsito {
17082 
transition_system_projective(const vector<string> & labels)17083 transition_system_projective::transition_system_projective(const vector<string>& labels) : transition_system(labels) {
17084   transitions.emplace_back(new transition_shift());
17085   for (auto&& label : labels) {
17086     transitions.emplace_back(new transition_left_arc(label));
17087     transitions.emplace_back(new transition_right_arc(label));
17088   }
17089 }
17090 
17091 // Static oracle
// Static oracle for the projective system: deterministically follows the
// gold tree, preferring left arcs, then safe right arcs, then shift.
class transition_system_projective_oracle_static : public transition_oracle {
 public:
  transition_system_projective_oracle_static(const vector<string>& labels) : labels(labels) {
    // Index of the "root" label; equals labels.size() when not present.
    for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
  }

  // Per-sentence oracle bound to one gold tree.
  class tree_oracle_static : public transition_oracle::tree_oracle {
   public:
    tree_oracle_static(const vector<string>& labels, unsigned root_label, const tree& gold) : labels(labels), root_label(root_label), gold(gold) {}
    virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override;
    virtual void interesting_transitions(const configuration& conf, vector<unsigned>& transitions) const override;
   private:
    const vector<string>& labels;
    unsigned root_label;
    // Gold-standard tree the oracle reproduces; referenced, not owned.
    const tree& gold;
  };

  virtual unique_ptr<tree_oracle> create_tree_oracle(const tree& gold) const override;
 private:
  const vector<string>& labels;
  unsigned root_label;
};
17114 
create_tree_oracle(const tree & gold) const17115 unique_ptr<transition_oracle::tree_oracle> transition_system_projective_oracle_static::create_tree_oracle(const tree& gold) const {
17116   return unique_ptr<transition_oracle::tree_oracle>(new tree_oracle_static(labels, root_label, gold));
17117 }
17118 
interesting_transitions(const configuration & conf,vector<unsigned> & transitions) const17119 void transition_system_projective_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector<unsigned>& transitions) const {
17120   transitions.clear();
17121   if (!conf.buffer.empty()) transitions.push_back(0);
17122   if (conf.stack.size() >= 2)
17123     for (int direction = 0; direction < 2; direction++) {
17124       int child = conf.stack[conf.stack.size() - 2 + direction];
17125       for (size_t i = 0; i < labels.size(); i++)
17126         if (gold.nodes[child].deprel == labels[i])
17127           if (!conf.single_root ||
17128               (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
17129               (i != root_label && conf.stack.size() > 2))
17130             transitions.push_back(1 + 2*i + direction);
17131     }
17132 }
17133 
// Predicts the next transition by following the gold tree: prefer a left
// arc, then a right arc (only once the child cannot receive any further
// children from the buffer), otherwise shift.
transition_oracle::predicted_transition transition_system_projective_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const {
  // Use left if appropriate
  if (conf.stack.size() >= 2) {
    int parent = conf.stack[conf.stack.size() - 1];
    int child = conf.stack[conf.stack.size() - 2];
    if (gold.nodes[child].head == parent) {
      // Transitions are encoded as 1 + 2*label_index + direction.
      for (size_t i = 0; i < labels.size(); i++)
        if (gold.nodes[child].deprel == labels[i])
          return predicted_transition(1 + 2*i, 1 + 2*i);

      assert(!"label was not found");
    }
  }

  // Use right if appropriate
  if (conf.stack.size() >= 2) {
    int child = conf.stack[conf.stack.size() - 1];
    int parent = conf.stack[conf.stack.size() - 2];
    // The right arc is safe only when no gold child of `child` remains in
    // the buffer (conf.buffer.back() is the next node to be shifted).
    if (gold.nodes[child].head == parent &&
        (conf.buffer.empty() || gold.nodes[child].children.empty() || gold.nodes[child].children.back() < conf.buffer.back())) {
      for (size_t i = 0; i < labels.size(); i++)
        if (gold.nodes[child].deprel == labels[i])
          return predicted_transition(1 + 2*i + 1, 1 + 2*i + 1);

      assert(!"label was not found");
    }
  }

  // Otherwise, just shift
  return predicted_transition(0, 0);
}
17165 
17166 // Dynamic oracle
// Dynamic oracle for the projective system: early training iterations use
// the static oracle, later ones search (via dynamic programming in
// predict) for the cheapest transition from the current — possibly
// imperfect — configuration.
class transition_system_projective_oracle_dynamic : public transition_oracle {
 public:
  transition_system_projective_oracle_dynamic(const vector<string>& labels) : labels(labels) {
    // Index of the "root" label; equals labels.size() when not present.
    for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
  }

  // Per-sentence oracle bound to one gold tree.
  class tree_oracle_dynamic : public transition_oracle::tree_oracle {
   public:
    tree_oracle_dynamic(const vector<string>& labels, unsigned root_label, const tree& gold) : labels(labels), gold(gold), oracle_static(labels, root_label, gold) {}
    virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override;
    virtual void interesting_transitions(const configuration& conf, vector<unsigned>& transitions) const override;
   private:
    const vector<string>& labels;
    const tree& gold;
    // Embedded static oracle used for iteration <= 1 and for
    // interesting_transitions delegation.
    transition_system_projective_oracle_static::tree_oracle_static oracle_static;
  };

  virtual unique_ptr<tree_oracle> create_tree_oracle(const tree& gold) const override;
 private:
  const vector<string>& labels;
  unsigned root_label;
};
17189 
create_tree_oracle(const tree & gold) const17190 unique_ptr<transition_oracle::tree_oracle> transition_system_projective_oracle_dynamic::create_tree_oracle(const tree& gold) const {
17191   return unique_ptr<transition_oracle::tree_oracle>(new tree_oracle_dynamic(labels, root_label, gold));
17192 }
17193 
// The dynamic oracle considers exactly the same set of interesting
// transitions as the embedded static oracle.
void transition_system_projective_oracle_dynamic::tree_oracle_dynamic::interesting_transitions(const configuration& conf, vector<unsigned>& transitions) const {
  oracle_static.interesting_transitions(conf, transitions);
}
17197 
// Predicts the cheapest transition from the current configuration using the
// tabular dynamic-programming method of Goldberg et al. (2014). For the
// first training iteration it defers to the static oracle; afterwards it
// computes, for the current stack plus the portion of the buffer that must
// return to the stack, the minimum number of gold edges that can no longer
// be recovered, and returns the first transition on that optimal path.
transition_oracle::predicted_transition transition_system_projective_oracle_dynamic::tree_oracle_dynamic::predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const {
  // Use static oracle in the first iteration
  if (iteration <= 1)
    return oracle_static.predict(conf, network_outcome, iteration);

  // Use dynamic programming to compute transition leading to best parse tree

  // Start by computing the right stack: buffer nodes (in reverse order)
  // that will have to be pushed and later reduced, because their gold head
  // or some gold child lies before the buffer start (or is itself on the
  // right stack). conf.buffer.back() is the next node to be shifted.
  vector<int> right_stack;

  unordered_set<int> right_stack_inserted;
  if (!conf.buffer.empty()) {
    int buffer_start = conf.buffer.back();
    for (size_t i = conf.buffer.size(); i--; ) {
      const auto& node = conf.buffer[i];
      bool to_right_stack = gold.nodes[node].head < buffer_start;
      for (auto&& child : gold.nodes[node].children)
        to_right_stack |= child < buffer_start || right_stack_inserted.count(child);
      if (to_right_stack) {
        right_stack.push_back(node);
        right_stack_inserted.insert(node);
      }
    }
  }

  // Fill the array T from the 2014 Goldberg paper
  // Cell (i, j, h): i items consumed from the stack, j from the right
  // stack, h identifies which of the combined items survives as the head.
  // Only two diagonals are kept alive at a time (rolling arrays).
  class t_representation {
   public:
    t_representation(const vector<int>& stack, const vector<int>& right_stack, const tree& gold, const vector<string>& labels)
        : stack(stack), right_stack(right_stack), gold(gold), labels(labels) {
      for (int i = 0; i < 2; i++) {
        costs[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size()));
        transitions[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size()));
      }
    }

    // Resets the storage for the given diagonal; costs start at an
    // unreachable maximum (more than the number of nodes), transitions at
    // -1 meaning "not yet decided".
    void prepare(unsigned diagonal) {
      costs[diagonal & 1].assign((diagonal + 1) * (diagonal + 1), gold.nodes.size() + 1);
      transitions[diagonal & 1].assign((diagonal + 1) * (diagonal + 1), -1);
    }

    int& cost(unsigned i, unsigned j, unsigned h) { return costs[(i+j) & 1][i * (i+j+1) + h]; }
    int& transition(unsigned i, unsigned j, unsigned h) { return transitions[(i+j) & 1][i * (i+j+1) + h]; }

    // Maps (i, h) to a concrete node: h <= i indexes into the parsing
    // stack (from its i-th-from-top suffix), larger h into the right stack.
    int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; }
    // Cost 1 when the proposed edge disagrees with gold, 0 otherwise.
    int edge_cost(int parent, int child) const { return gold.nodes[child].head != parent; }
    // Encodes the arc transition for the gold label of `child`
    // (1 + 2*label_index, +1 when the arc points rightwards).
    int which_arc_transition(int parent, int child) const {
      for (size_t i = 0; i < labels.size(); i++)
        if (gold.nodes[child].deprel == labels[i])
          return 1 + 2*i + (child > parent);
      assert(!"label was not found");
      return 0; // To keep VS 2015 happy and warning-free
    }

   private:
    const vector<int>& stack;
    const vector<int>& right_stack;
    const tree& gold;
    const vector<string>& labels;
    vector<int> costs[2], transitions[2];
  } t(conf.stack, right_stack, gold, labels);

  // Sweep the table diagonal by diagonal, relaxing each cell either by
  // consuming one more stack item (an arc transition) or one more right
  // stack item (which corresponds to a shift, transition 0). The
  // "+ (t.transition(...) ...)" terms act as tie-breakers — NOTE(review):
  // they prefer keeping an already-chosen first transition; confirm
  // against the Goldberg et al. (2014) formulation before altering.
  t.prepare(0);
  t.cost(0, 0, 0) = 0;
  for (unsigned diagonal = 0; diagonal < conf.stack.size() + right_stack.size(); diagonal++) {
    t.prepare(diagonal + 1);
    for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) {
      unsigned j = diagonal - i;

      // Try extending stack
      if (i+1 < conf.stack.size())
        for (unsigned h = 0; h <= diagonal; h++) {
          int h_node = t.node(i, j, h), new_node = t.node(i+1, j, 0);
          if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) {
            t.cost(i+1, j, h+1) = t.cost(i, j, h) + t.edge_cost(h_node, new_node);
            t.transition(i+1, j, h+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(h_node, new_node);
          }
          if (t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i+1, j, 0) + (t.transition(i, j, h) != 0)) {
            t.cost(i+1, j, 0) = t.cost(i, j, h) + t.edge_cost(new_node, h_node);
            t.transition(i+1, j, 0) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(new_node, h_node);
          }
        }

      // Try extending right_stack
      if (j+1 < right_stack.size() + 1)
        for (unsigned h = 0; h <= diagonal; h++) {
          int h_node = t.node(i, j, h), new_node = t.node(i, j+1, diagonal+1);
          if (t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i, j+1, h) + (t.transition(i, j, h) > 0)) {
            t.cost(i, j+1, h) = t.cost(i, j, h) + t.edge_cost(h_node, new_node);
            t.transition(i, j+1, h) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0;
          }
          if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) {
            t.cost(i, j+1, diagonal+1) = t.cost(i, j, h) + t.edge_cost(new_node, h_node);
            t.transition(i, j+1, diagonal+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0;
          }
        }
    }
  }

  // The first transition on the cheapest path consuming everything, with
  // the artificial root (h = 0) as the surviving head.
  return predicted_transition(t.transition(conf.stack.size() - 1, right_stack.size(), 0), network_outcome);
}
17299 
17300 // Oracle factory method
oracle(const string & name) const17301 transition_oracle* transition_system_projective::oracle(const string& name) const {
17302   if (name == "static") return new transition_system_projective_oracle_static(labels);
17303   if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels);
17304   return nullptr;
17305 }
17306 
17307 } // namespace parsito
17308 
17309 /////////
17310 // File: parsito/transition/transition_system_swap.cpp
17311 /////////
17312 
17313 // This file is part of Parsito <http://github.com/ufal/parsito/>.
17314 //
17315 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17316 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17317 //
17318 // This Source Code Form is subject to the terms of the Mozilla Public
17319 // License, v. 2.0. If a copy of the MPL was not distributed with this
17320 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17321 
17322 namespace parsito {
17323 
transition_system_swap(const vector<string> & labels)17324 transition_system_swap::transition_system_swap(const vector<string>& labels) : transition_system(labels) {
17325   transitions.emplace_back(new transition_shift());
17326   transitions.emplace_back(new transition_swap());
17327   for (auto&& label : labels) {
17328     transitions.emplace_back(new transition_left_arc(label));
17329     transitions.emplace_back(new transition_right_arc(label));
17330   }
17331 }
17332 
17333 // Static oracle
// Static oracle for the swap system. Uses the projective order of the gold
// tree to decide when to swap; the lazy variant additionally computes
// projective components so swaps can be postponed as long as possible.
class transition_system_swap_oracle_static : public transition_oracle {
 public:
  transition_system_swap_oracle_static(const vector<string>& labels, bool lazy) : labels(labels), lazy(lazy) {
    // Index of the "root" label; equals labels.size() when not present.
    for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
  }

  // Per-sentence oracle bound to one gold tree plus its precomputed
  // projective order and (for the lazy variant) projective components.
  class tree_oracle_static : public transition_oracle::tree_oracle {
   public:
    tree_oracle_static(const vector<string>& labels, unsigned root_label, const tree& gold, vector<int>&& projective_order, vector<int>&& projective_components)
        : labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {}
    virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override;
    virtual void interesting_transitions(const configuration& conf, vector<unsigned>& transitions) const override;
   private:
    const vector<string>& labels;
    unsigned root_label;
    const tree& gold;
    // In-order index of every node in the gold tree; empty disables swaps.
    const vector<int> projective_order;
    // Component id per node (lazy variant only); empty means eager swapping.
    const vector<int> projective_components;
  };

  virtual unique_ptr<tree_oracle> create_tree_oracle(const tree& gold) const override;
 private:
  void create_projective_order(const tree& gold, int node, vector<int>& projective_order, int& projective_index) const;
  void create_projective_component(const tree& gold, int node, vector<int>& projective_components, int component_index) const;

  const vector<string>& labels;
  bool lazy;
  unsigned root_label;
};
17363 
create_tree_oracle(const tree & gold) const17364 unique_ptr<transition_oracle::tree_oracle> transition_system_swap_oracle_static::create_tree_oracle(const tree& gold) const {
17365   vector<int> projective_order(gold.nodes.size());
17366   int projective_index;
17367   create_projective_order(gold, 0, projective_order, projective_index);
17368 
17369   vector<int> projective_components;
17370   if (lazy) {
17371     tree_oracle_static projective_oracle(labels, root_label, gold, vector<int>(), vector<int>());
17372     configuration conf(false);
17373     tree t = gold;
17374     transition_system_swap system(labels);
17375 
17376     conf.init(&t);
17377     while (!conf.final()) {
17378       auto prediction = projective_oracle.predict(conf, 0, 0);
17379       if (!system.applicable(conf, prediction.to_follow)) break;
17380       system.perform(conf, prediction.to_follow);
17381     }
17382 
17383     projective_components.assign(gold.nodes.size(), 0);
17384     for (auto&& node : conf.stack)
17385       if (node)
17386         create_projective_component(t, node, projective_components, node);
17387   }
17388 
17389   return unique_ptr<transition_oracle::tree_oracle>(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components)));
17390 }
17391 
create_projective_order(const tree & gold,int node,vector<int> & projective_order,int & projective_index) const17392 void transition_system_swap_oracle_static::create_projective_order(const tree& gold, int node, vector<int>& projective_order, int& projective_index) const {
17393   unsigned child_index = 0;
17394   while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node)
17395     create_projective_order(gold, gold.nodes[node].children[child_index++], projective_order, projective_index);
17396   projective_order[node] = projective_index++;
17397   while (child_index < gold.nodes[node].children.size())
17398     create_projective_order(gold, gold.nodes[node].children[child_index++], projective_order, projective_index);
17399 }
17400 
create_projective_component(const tree & gold,int node,vector<int> & projective_components,int component_index) const17401 void transition_system_swap_oracle_static::create_projective_component(const tree& gold, int node, vector<int>& projective_components, int component_index) const {
17402   projective_components[node] = component_index;
17403   for (auto&& child : gold.nodes[node].children)
17404     create_projective_component(gold, child, projective_components, component_index);
17405 }
17406 
// Collects the transitions worth considering: shift (when the buffer is
// nonempty), swap (when the top two stack nodes are out of projective
// order) and gold-consistent arc transitions.
void transition_system_swap_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector<unsigned>& transitions) const {
  transitions.clear();
  if (!conf.buffer.empty()) transitions.push_back(0);
  if (conf.stack.size() >= 2) {
    // Swap (transition 1): the two topmost nodes violate the projective
    // order. With projective components available (lazy variant), the swap
    // is suppressed while the top node and the next buffered node belong
    // to the same component, postponing it as long as possible.
    if (!projective_order.empty()) {
      int last = conf.stack[conf.stack.size() - 1];
      int prev = conf.stack[conf.stack.size() - 2];
      if (projective_order[last] < projective_order[prev] &&
          (projective_components.empty() ||
           (conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()])))
        transitions.push_back(1);
    }

    // Arcs, encoded as 2 + 2*label_index + direction (0 left, 1 right).
    // In single-root mode the "root" label may only be used by the final
    // right arc, and other labels must not involve the artificial root.
    for (int direction = 0; direction < 2; direction++) {
      int child = conf.stack[conf.stack.size() - 2 + direction];
      for (size_t i = 0; i < labels.size(); i++)
        if (gold.nodes[child].deprel == labels[i])
          if (!conf.single_root ||
              (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
              (i != root_label && conf.stack.size() > 2))
            transitions.push_back(2 + 2*i + direction);
    }
  }
}
17433 
// Predicts the next transition by following the gold tree: prefer a left
// arc, then a right arc (both only once the child has collected all its
// gold children), then a swap when the projective order demands one,
// otherwise shift.
transition_oracle::predicted_transition transition_system_swap_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const {
  // Use left if appropriate
  if (conf.stack.size() >= 2) {
    int parent = conf.stack[conf.stack.size() - 1];
    int child = conf.stack[conf.stack.size() - 2];
    // The child is complete when it has attached as many children as gold.
    if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
      // Transitions are encoded as 2 + 2*label_index + direction.
      for (size_t i = 0; i < labels.size(); i++)
        if (gold.nodes[child].deprel == labels[i])
          return predicted_transition(2 + 2*i, 2 + 2*i);

      assert(!"label was not found");
    }
  }

  // Use right if appropriate
  if (conf.stack.size() >= 2) {
    int child = conf.stack[conf.stack.size() - 1];
    int parent = conf.stack[conf.stack.size() - 2];
    if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
      for (size_t i = 0; i < labels.size(); i++)
        if (gold.nodes[child].deprel == labels[i])
          return predicted_transition(2 + 2*i + 1, 2 + 2*i + 1);

      assert(!"label was not found");
    }
  }

  // Use swap if appropriate: the top two stack nodes are out of projective
  // order, and (lazy variant) the components rule does not postpone it.
  if (conf.stack.size() >= 2 && !projective_order.empty()) {
    int last = conf.stack[conf.stack.size() - 1];
    int prev = conf.stack[conf.stack.size() - 2];
    if (projective_order[last] < projective_order[prev] &&
        (projective_components.empty() ||
         (conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()])))
      return predicted_transition(1, 1);
  }

  // Otherwise, just shift
  return predicted_transition(0, 0);
}
17474 
17475 // Oracle factory method
oracle(const string & name) const17476 transition_oracle* transition_system_swap::oracle(const string& name) const {
17477   if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false);
17478   if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true);
17479   return nullptr;
17480 }
17481 
17482 } // namespace parsito
17483 
17484 /////////
17485 // File: parsito/tree/tree.cpp
17486 /////////
17487 
17488 // This file is part of Parsito <http://github.com/ufal/parsito/>.
17489 //
17490 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17491 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17492 //
17493 // This Source Code Form is subject to the terms of the Mozilla Public
17494 // License, v. 2.0. If a copy of the MPL was not distributed with this
17495 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17496 
17497 namespace parsito {
17498 
// Form used for the artificial root node present in every tree.
const string tree::root_form = "<root>";
17500 
// Construct a tree containing only the artificial root node.
tree::tree() {
  clear();
}
17504 
// A tree is empty when it contains nothing but the artificial root node
// (clear() always leaves exactly one node).
bool tree::empty() {
  return nodes.size() == 1;
}
17508 
clear()17509 void tree::clear() {
17510   nodes.clear();
17511   node& root = add_node(root_form);
17512   root.lemma = root.upostag = root.xpostag = root.feats = root_form;
17513 }
17514 
add_node(const string & form)17515 node& tree::add_node(const string& form) {
17516   nodes.emplace_back(nodes.size(), form);
17517   return nodes.back();
17518 }
17519 
// Attach node `id` under `head` with dependency relation `deprel`;
// head == -1 detaches the node. Children lists are kept sorted by id.
void tree::set_head(int id, int head, const string& deprel) {
  assert(id >= 0 && id < int(nodes.size()));
  assert(head < int(nodes.size()));

  // Remove existing head
  if (nodes[id].head >= 0) {
    auto& children = nodes[nodes[id].head].children;
    // Children are sorted, so scan backwards only while entries are >= id.
    for (size_t i = children.size(); i && children[i-1] >= id; i--)
      if (children[i-1] == id) {
        children.erase(children.begin() + i - 1);
        break;
      }
  }

  // Set new head
  nodes[id].head = head;
  nodes[id].deprel = deprel;
  if (head >= 0) {
    auto& children = nodes[head].children;
    // Insert id at its sorted position; skip if it is already present.
    size_t i = children.size();
    while (i && children[i-1] > id) i--;
    if (!i || children[i-1] < id) children.insert(children.begin() + i, id);
  }
}
17544 
unlink_all_nodes()17545 void tree::unlink_all_nodes() {
17546   for (auto&& node : nodes) {
17547     node.head = -1;
17548     node.deprel.clear();
17549     node.children.clear();
17550   }
17551 }
17552 
17553 } // namespace parsito
17554 
17555 /////////
17556 // File: parsito/tree/tree_format.h
17557 /////////
17558 
17559 // This file is part of Parsito <http://github.com/ufal/parsito/>.
17560 //
17561 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17562 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17563 //
17564 // This Source Code Form is subject to the terms of the Mozilla Public
17565 // License, v. 2.0. If a copy of the MPL was not distributed with this
17566 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17567 
17568 namespace parsito {
17569 
17570 // Input format
// Abstract reader of dependency trees from a textual representation.
class tree_input_format {
 public:
  virtual ~tree_input_format() {}

  // Read one input block (typically up to an empty line) from the stream.
  virtual bool read_block(istream& in, string& block) const = 0;
  // Set the text to parse; with make_copy the text is duplicated so the
  // caller's buffer need not stay alive.
  virtual void set_text(string_piece text, bool make_copy = false) = 0;
  // Parse the next tree; returns false at end of input or on error.
  virtual bool next_tree(tree& t) = 0;
  // Description of the last error encountered by next_tree.
  const string& last_error() const;

  // Static factory methods
  static tree_input_format* new_input_format(const string& name);
  static tree_input_format* new_conllu_input_format();

 protected:
  // Error message filled in by next_tree implementations.
  string error;
};
17587 
17588 // Output format
// Abstract writer of dependency trees to a textual representation.
class tree_output_format {
 public:
  virtual ~tree_output_format() {}

  // Append the textual form of `t` to `output`; `additional_info` may be
  // the input format that produced the tree, to recover extra data.
  virtual void write_tree(const tree& t, string& output, const tree_input_format* additional_info = nullptr) const = 0;

  // Static factory methods
  static tree_output_format* new_output_format(const string& name);
  static tree_output_format* new_conllu_output_format();
};
17599 
17600 } // namespace parsito
17601 
17602 /////////
17603 // File: parsito/tree/tree_format_conllu.h
17604 /////////
17605 
17606 // This file is part of Parsito <http://github.com/ufal/parsito/>.
17607 //
17608 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17609 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17610 //
17611 // This Source Code Form is subject to the terms of the Mozilla Public
17612 // License, v. 2.0. If a copy of the MPL was not distributed with this
17613 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17614 
17615 namespace parsito {
17616 
17617 // Input CoNLL-U format
// Input CoNLL-U format
class tree_input_format_conllu : public tree_input_format {
 public:
  virtual bool read_block(istream& in, string& block) const override;
  virtual void set_text(string_piece text, bool make_copy = false) override;
  virtual bool next_tree(tree& t) override;

 private:
  // The output format reads comments and multiword tokens back.
  friend class tree_output_format_conllu;
  // Comment lines of the most recently parsed tree.
  vector<string_piece> comments;
  // Multiword token lines, paired with the id of their first word.
  vector<pair<int, string_piece>> multiword_tokens;

  string_piece text;  // unparsed remainder of the current text
  string text_copy;   // owning storage when set_text was asked to copy
};
17632 
17633 // Output CoNLL-U format
// Output CoNLL-U format
class tree_output_format_conllu : public tree_output_format {
 public:
  virtual void write_tree(const tree& t, string& output, const tree_input_format* additional_info = nullptr) const override;

 private:
  static const string underscore;
  // Map empty fields to the CoNLL-U "_" placeholder.
  const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
};
17642 
17643 } // namespace parsito
17644 
17645 /////////
17646 // File: parsito/tree/tree_format.cpp
17647 /////////
17648 
17649 // This file is part of Parsito <http://github.com/ufal/parsito/>.
17650 //
17651 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17652 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17653 //
17654 // This Source Code Form is subject to the terms of the Mozilla Public
17655 // License, v. 2.0. If a copy of the MPL was not distributed with this
17656 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17657 
17658 namespace parsito {
17659 
// Return the error message produced by the last next_tree call.
const string& tree_input_format::last_error() const {
  return error;
}
17663 
17664 // Input Static factory methods
// Create a CoNLL-U input format; the caller owns the returned object.
tree_input_format* tree_input_format::new_conllu_input_format() {
  return new tree_input_format_conllu();
}
17668 
new_input_format(const string & name)17669 tree_input_format* tree_input_format::new_input_format(const string& name) {
17670   if (name == "conllu") return new_conllu_input_format();
17671   return nullptr;
17672 }
17673 
17674 // Output static factory methods
// Create a CoNLL-U output format; the caller owns the returned object.
tree_output_format* tree_output_format::new_conllu_output_format() {
  return new tree_output_format_conllu();
}
17678 
new_output_format(const string & name)17679 tree_output_format* tree_output_format::new_output_format(const string& name) {
17680   if (name == "conllu") return new_conllu_output_format();
17681   return nullptr;
17682 }
17683 
17684 } // namespace parsito
17685 
17686 /////////
17687 // File: parsito/tree/tree_format_conllu.cpp
17688 /////////
17689 
17690 // This file is part of Parsito <http://github.com/ufal/parsito/>.
17691 //
17692 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17693 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17694 //
17695 // This Source Code Form is subject to the terms of the Mozilla Public
17696 // License, v. 2.0. If a copy of the MPL was not distributed with this
17697 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17698 
17699 namespace parsito {
17700 
17701 // Input CoNLL-U format
17702 
// Read one paragraph (text up to an empty line) as the next block.
bool tree_input_format_conllu::read_block(istream& in, string& block) const {
  return bool(getpara(in, block));
}
17706 
set_text(string_piece text,bool make_copy)17707 void tree_input_format_conllu::set_text(string_piece text, bool make_copy) {
17708   if (make_copy) {
17709     text_copy.assign(text.str, text.len);
17710     text = string_piece(text_copy.c_str(), text_copy.size());
17711   }
17712   this->text = text;
17713 }
17714 
next_tree(tree & t)17715 bool tree_input_format_conllu::next_tree(tree& t) {
17716   error.clear();
17717   t.clear();
17718   comments.clear();
17719   multiword_tokens.clear();
17720   int last_multiword_token = 0;
17721 
17722   vector<string_piece> tokens, parts;
17723   while (text.len) {
17724     // Read line
17725     string_piece line(text.str, 0);
17726     while (line.len < text.len && line.str[line.len] != '\n') line.len++;
17727     text.str += line.len + (line.len < text.len);
17728     text.len -= line.len + (line.len < text.len);
17729 
17730     // Empty lines denote end of tree, unless at the beginning
17731     if (!line.len) {
17732       if (t.empty()) continue;
17733       break;
17734     }
17735 
17736     if (*line.str == '#') {
17737       // Store comments at the beginning and ignore the rest
17738       if (t.empty()) comments.push_back(line);
17739       continue;
17740     }
17741 
17742     // Parse another tree node
17743     split(line, '\t', tokens);
17744     if (tokens.size() != 10)
17745       return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
17746 
17747     // Store and skip multiword tokens
17748     if (memchr(tokens[0].str, '-', tokens[0].len)) {
17749       split(tokens[0], '-', parts);
17750       if (parts.size() != 2)
17751         return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
17752       int from, to;
17753       if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
17754         return false;
17755       if (from != int(t.nodes.size()))
17756         return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
17757       if (to < from)
17758         return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
17759       if (from <= last_multiword_token)
17760         return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
17761       last_multiword_token = to;
17762       multiword_tokens.emplace_back(from, line);
17763       continue;
17764     }
17765 
17766     // Parse node ID and head
17767     int id;
17768     if (!parse_int(tokens[0], "CoNLL-U id", id, error))
17769       return false;
17770     if (id != int(t.nodes.size()))
17771       return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
17772 
17773     int head;
17774     if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
17775       head = -1;
17776     } else {
17777       if (!parse_int(tokens[6], "CoNLL-U head", head, error))
17778         return false;
17779       if (head < 0)
17780         return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
17781     }
17782 
17783     // Add new node
17784     auto& node = t.add_node(string(tokens[1].str, tokens[1].len));
17785     if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len);
17786     if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len);
17787     if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len);
17788     if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len);
17789     node.head = head;
17790     if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len);
17791     if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len);
17792     if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len);
17793   }
17794 
17795   // Check that we got word for the last multiword token
17796   if (last_multiword_token >= int(t.nodes.size()))
17797     return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false;
17798 
17799   // Set heads correctly
17800   for (auto&& node : t.nodes)
17801     if (node.id && node.head >= 0) {
17802       if (node.head >= int(t.nodes.size()))
17803         return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false;
17804       t.set_head(node.id, node.head, node.deprel);
17805     }
17806 
17807   return !t.empty();
17808 }
17809 
17810 // Output CoNLL-U format
17811 
// CoNLL-U placeholder for empty fields.
const string tree_output_format_conllu::underscore = "_";
17813 
write_tree(const tree & t,string & output,const tree_input_format * additional_info) const17814 void tree_output_format_conllu::write_tree(const tree& t, string& output, const tree_input_format* additional_info) const {
17815   output.clear();
17816 
17817   // Try casting input format to CoNLL-U
17818   auto input_conllu = dynamic_cast<const tree_input_format_conllu*>(additional_info);
17819   size_t input_conllu_multiword_tokens = 0;
17820 
17821   // Comments if present
17822   if (input_conllu)
17823     for (auto&& comment : input_conllu->comments)
17824       output.append(comment.str, comment.len).push_back('\n');
17825 
17826   // Print out the tokens
17827   for (int i = 1 /*skip the root node*/; i < int(t.nodes.size()); i++) {
17828     // Write multiword token if present
17829     if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() &&
17830         i == input_conllu->multiword_tokens[input_conllu_multiword_tokens].first) {
17831       output.append(input_conllu->multiword_tokens[input_conllu_multiword_tokens].second.str,
17832                     input_conllu->multiword_tokens[input_conllu_multiword_tokens].second.len).push_back('\n');
17833       input_conllu_multiword_tokens++;
17834     }
17835 
17836     // Write the token
17837     output.append(to_string(i)).push_back('\t');
17838     output.append(t.nodes[i].form).push_back('\t');
17839     output.append(underscore_on_empty(t.nodes[i].lemma)).push_back('\t');
17840     output.append(underscore_on_empty(t.nodes[i].upostag)).push_back('\t');
17841     output.append(underscore_on_empty(t.nodes[i].xpostag)).push_back('\t');
17842     output.append(underscore_on_empty(t.nodes[i].feats)).push_back('\t');
17843     output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t');
17844     output.append(underscore_on_empty(t.nodes[i].deprel)).push_back('\t');
17845     output.append(underscore_on_empty(t.nodes[i].deps)).push_back('\t');
17846     output.append(underscore_on_empty(t.nodes[i].misc)).push_back('\n');
17847   }
17848   output.push_back('\n');
17849 }
17850 
17851 } // namespace parsito
17852 
17853 /////////
17854 // File: parsito/version/version.h
17855 /////////
17856 
17857 // This file is part of Parsito <http://github.com/ufal/parsito/>.
17858 //
17859 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17860 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17861 //
17862 // This Source Code Form is subject to the terms of the Mozilla Public
17863 // License, v. 2.0. If a copy of the MPL was not distributed with this
17864 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17865 
17866 namespace parsito {
17867 
// Semantic version of the Parsito library.
struct version {
  unsigned major;
  unsigned minor;
  unsigned patch;
  std::string prerelease;  // e.g. "devel"; empty for releases

  // Returns current version.
  static version current();

  // Returns multi-line formated version and copyright string.
  static string version_and_copyright(const string& other_libraries = string());
};
17880 
17881 } // namespace parsito
17882 
17883 /////////
17884 // File: parsito/version/version.cpp
17885 /////////
17886 
17887 // This file is part of Parsito <http://github.com/ufal/parsito/>.
17888 //
17889 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17890 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17891 //
17892 // This Source Code Form is subject to the terms of the Mozilla Public
17893 // License, v. 2.0. If a copy of the MPL was not distributed with this
17894 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17895 
17896 namespace parsito {
17897 
17898 // Returns current version.
// Returns current version.
version version::current() {
  return {1, 1, 1, "devel"};
}
17902 
17903 // Returns multi-line formated version and copyright string.
// Build a multi-line version and copyright banner, mentioning the UniLib
// version and optionally other libraries supplied by the caller.
string version::version_and_copyright(const string& other_libraries) {
  ostringstream info;

  auto parsito = version::current();
  auto unilib = unilib::version::current();

  info << "Parsito version " << parsito.major << '.' << parsito.minor << '.' << parsito.patch;
  if (!parsito.prerelease.empty()) info << '-' << parsito.prerelease;
  info << " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch;
  if (!other_libraries.empty()) info << " and " << other_libraries;
  info << ")\n"
          "Copyright 2015 by Institute of Formal and Applied Linguistics, Faculty of\n"
          "Mathematics and Physics, Charles University in Prague, Czech Republic.";

  return info.str();
}
17919 
17920 } // namespace parsito
17921 
17922 /////////
17923 // File: sentence/input_format.cpp
17924 /////////
17925 
17926 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
17927 //
17928 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17929 // Mathematics and Physics, Charles University in Prague, Czech Republic.
17930 //
17931 // This Source Code Form is subject to the terms of the Mozilla Public
17932 // License, v. 2.0. If a copy of the MPL was not distributed with this
17933 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
17934 
// Identifiers of input format variants accepted by the factory methods.
const string input_format::CONLLU_V1 = "v1";
const string input_format::CONLLU_V2 = "v2";
const string input_format::GENERIC_TOKENIZER_NORMALIZED_SPACES = "normalized_spaces";
const string input_format::GENERIC_TOKENIZER_PRESEGMENTED = "presegmented";
const string input_format::GENERIC_TOKENIZER_RANGES = "ranges";
17940 
17941 // CoNLL-U input format
// CoNLL-U input format
class input_format_conllu : public input_format {
 public:
  // version 1 or 2; v2 additionally allows spaces in FORM/LEMMA and
  // supports empty nodes (ids of the form "N.M").
  input_format_conllu(unsigned version) : version(version) {}

  virtual bool read_block(istream& is, string& block) const override;
  virtual void reset_document(string_piece id = string_piece()) override;
  virtual void set_text(string_piece text, bool make_copy = false) override;
  virtual bool next_sentence(sentence& s, string& error) override;

 private:
  unsigned version;
  string_piece text;  // unparsed remainder of the current text
  string text_copy;   // owning storage when set_text was asked to copy

  // Column names, used in error messages.
  static const string columns[10];
};
17958 
// CoNLL-U column names, in order; used to name columns in error messages.
const string input_format_conllu::columns[10] = {"ID", "FORM", "LEMMA",
  "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
17961 
// Read one paragraph (text up to an empty line) as the next block.
bool input_format_conllu::read_block(istream& is, string& block) const {
  return bool(getpara(is, block));
}
17965 
// Start a new document: simply drop any pending text (the document id
// is not used by this format).
void input_format_conllu::reset_document(string_piece /*id*/) {
  set_text("");
}
17969 
set_text(string_piece text,bool make_copy)17970 void input_format_conllu::set_text(string_piece text, bool make_copy) {
17971   if (make_copy) {
17972     text_copy.assign(text.str, text.len);
17973     text = string_piece(text_copy.c_str(), text_copy.size());
17974   }
17975   this->text = text;
17976 }
17977 
next_sentence(sentence & s,string & error)17978 bool input_format_conllu::next_sentence(sentence& s, string& error) {
17979   error.clear();
17980   s.clear();
17981   int last_multiword_token = 0;
17982 
17983   vector<string_piece> tokens, parts;
17984   while (text.len) {
17985     // Read line
17986     string_piece line(text.str, 0);
17987     while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++;
17988 
17989     text.str += line.len, text.len -= line.len;
17990     if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
17991       text.str += 2, text.len -= 2;
17992     else if (text.len && *text.str == '\n')
17993       text.str++, text.len--;
17994 
17995     // Empty lines denote end of tree, unless at the beginning
17996     if (!line.len) {
17997       if (s.empty()) continue;
17998       break;
17999     }
18000 
18001     if (*line.str == '#') {
18002       // Store comments at the beginning and ignore the rest
18003       if (s.empty()) s.comments.emplace_back(line.str, line.len);
18004       continue;
18005     }
18006 
18007     // Parse the line
18008     split(line, '\t', tokens);
18009     if (tokens.size() != 10)
18010       return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
18011 
18012     // Check that no column is empty and contains no spaces (except FORM and LEMMA in version >= 2)
18013     for (int i = 0; i < 10; i++) {
18014       if (!tokens[i].len)
18015         return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false;
18016       if ((version < 2 || (i != 1 && i != 2)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL)
18017         return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false;
18018     }
18019 
18020     // Handle multiword tokens
18021     if (memchr(tokens[0].str, '-', tokens[0].len)) {
18022       split(tokens[0], '-', parts);
18023       if (parts.size() != 2)
18024         return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
18025       int from, to;
18026       if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
18027         return false;
18028       if (from != int(s.words.size()))
18029         return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
18030       if (to < from)
18031         return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
18032       if (from <= last_multiword_token)
18033         return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
18034       last_multiword_token = to;
18035       for (int i = 2; i < 9; i++)
18036         if (tokens[i].len != 1 || tokens[i].str[0] != '_')
18037           return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false;
18038       s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]);
18039       continue;
18040     }
18041 
18042     // Handle empty nodes
18043     if (version >= 2)
18044       if (memchr(tokens[0].str, '.', tokens[0].len)) {
18045         split(tokens[0], '.', parts);
18046         if (parts.size() != 2)
18047           return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false;
18048         int id, index;
18049         if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error))
18050           return false;
18051         if (id != int(s.words.size()) - 1)
18052           return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
18053         if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) ||
18054              (!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1)))
18055           return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
18056         for (int i = 6; i < 8; i++)
18057           if (tokens[i].len != 1 || tokens[i].str[0] != '_')
18058             return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false;
18059 
18060         s.empty_nodes.emplace_back(id, index);
18061         s.empty_nodes.back().form.assign(tokens[1].str, tokens[1].len);
18062         s.empty_nodes.back().lemma.assign(tokens[2].str, tokens[2].len);
18063         if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len);
18064         if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len);
18065         if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len);
18066         if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len);
18067         if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len);
18068         continue;
18069       }
18070 
18071     // Parse word ID and head
18072     int id;
18073     if (!parse_int(tokens[0], "CoNLL-U id", id, error))
18074       return false;
18075     if (id != int(s.words.size()))
18076       return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
18077 
18078     int head;
18079     if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
18080       head = -1;
18081     } else {
18082       if (!parse_int(tokens[6], "CoNLL-U head", head, error))
18083         return false;
18084       if (head < 0)
18085         return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
18086     }
18087 
18088     // Add new word
18089     auto& word = s.add_word(tokens[1]);
18090     word.lemma.assign(tokens[2].str, tokens[2].len);
18091     if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len);
18092     if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len);
18093     if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len);
18094     word.head = head;
18095     if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len);
18096     if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len);
18097     if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len);
18098   }
18099 
18100   // Check that we got word for the last multiword token
18101   if (last_multiword_token >= int(s.words.size()))
18102     return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false;
18103 
18104   // Set heads correctly
18105   for (auto&& word : s.words)
18106     if (word.id && word.head >= 0) {
18107       if (word.head >= int(s.words.size()))
18108         return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false;
18109       s.set_head(word.id, word.head, word.deprel);
18110     }
18111 
18112   return !s.empty();
18113 }
18114 
18115 // Horizontal input format
// Horizontal input format: one sentence per line, words separated by
// spaces or tabs.
class input_format_horizontal : public input_format {
 public:
  virtual bool read_block(istream& is, string& block) const override;
  virtual void reset_document(string_piece id = string_piece()) override;
  virtual void set_text(string_piece text, bool make_copy = false) override;
  virtual bool next_sentence(sentence& s, string& error) override;

 private:
  string_piece text;        // unparsed remainder of the current text
  string text_copy;         // owning storage when set_text was asked to copy
  bool new_document = true; // mark the next sentence as starting a new document
  string document_id;
  unsigned preceeding_newlines = 2; // newlines before the next sentence; >= 2 starts a paragraph
  unsigned sentence_id = 1; // id assigned to the next sentence
};
18131 
read_block(istream & is,string & block) const18132 bool input_format_horizontal::read_block(istream& is, string& block) const {
18133   if (getline(is, block))
18134     return block.push_back('\n'), true;
18135   return false;
18136 }
18137 
reset_document(string_piece id)18138 void input_format_horizontal::reset_document(string_piece id) {
18139   new_document = true;
18140   document_id.assign(id.str, id.len);
18141   preceeding_newlines = 2;
18142   sentence_id = 1;
18143   set_text("");
18144 }
18145 
set_text(string_piece text,bool make_copy)18146 void input_format_horizontal::set_text(string_piece text, bool make_copy) {
18147   if (make_copy) {
18148     text_copy.assign(text.str, text.len);
18149     text = string_piece(text_copy.c_str(), text_copy.size());
18150   }
18151   this->text = text;
18152 }
18153 
// Parse the next sentence: one line of space/tab-separated words.
// Tracks blank lines to mark paragraph starts and numbers sentences
// sequentially within the document.
bool input_format_horizontal::next_sentence(sentence& s, string& error) {
  error.clear();
  s.clear();

  // Skip spaces and newlines
  while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) {
    preceeding_newlines += *text.str == '\n';
    text.str++, text.len--;
  }

  // Read space (and tab) separated words
  while (text.len && *text.str != '\r' && *text.str != '\n') {
    string_piece word = text;

    // Slurp the word
    while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
      text.str++, text.len--;
    word.len = text.str - word.str;
    s.add_word(word);

    // Replace &nbsp;s by regular spaces; "\302\240" is UTF-8 U+00A0.
    if (s.words.back().form.find("\302\240") != string::npos) {
      string& form = s.words.back().form;
      // Compact the string in place, merging each two-byte NBSP into ' '.
      size_t form_len = 0;
      for (size_t i = 0; i < form.size(); i++) {
        if (form_len && form[form_len-1] == '\302' && form[i] == '\240')
          form[form_len - 1] = ' ';
        else
          form[form_len++] = form[i];
      }
      form.resize(form_len);
    }

    // Skip spaces
    while (text.len && (*text.str == ' ' || *text.str == '\t'))
      text.str++, text.len--;
  }

  if (!s.empty()) {
    // Mark new document if needed
    if (new_document)
      s.set_new_doc(true, document_id);
    new_document = false;

    // Mark new paragraph if needed (two or more preceding newlines)
    if (preceeding_newlines >= 2)
      s.set_new_par(true);
    preceeding_newlines = 0;

    // Sentence id
    s.set_sent_id(to_string(sentence_id++));
  }

  return !s.empty();
}
18209 
18210 // Vertical input format
// Vertical input format: one word per line (only the first tab-separated
// column is read), sentences separated by blank lines.
class input_format_vertical : public input_format {
 public:
  virtual bool read_block(istream& is, string& block) const override;
  virtual void reset_document(string_piece id = string_piece()) override;
  virtual void set_text(string_piece text, bool make_copy = false) override;
  virtual bool next_sentence(sentence& s, string& error) override;

 private:
  string_piece text;                 // unread remainder of the current text
  string text_copy;                  // owning copy when set_text(make_copy=true)
  bool new_document = true;          // next sentence opens a new document
  string document_id;
  unsigned preceeding_newlines = 2;  // newlines before next sentence; >=2 => new paragraph
  unsigned sentence_id = 1;          // 1-based id assigned to produced sentences
};
18226 
read_block(istream & is,string & block) const18227 bool input_format_vertical::read_block(istream& is, string& block) const {
18228   return bool(getpara(is, block));
18229 }
18230 
reset_document(string_piece id)18231 void input_format_vertical::reset_document(string_piece id) {
18232   new_document = true;
18233   document_id.assign(id.str, id.len);
18234   preceeding_newlines = 2;
18235   sentence_id = 1;
18236   set_text("");
18237 }
18238 
set_text(string_piece text,bool make_copy)18239 void input_format_vertical::set_text(string_piece text, bool make_copy) {
18240   if (make_copy) {
18241     text_copy.assign(text.str, text.len);
18242     text = string_piece(text_copy.c_str(), text_copy.size());
18243   }
18244   this->text = text;
18245 }
18246 
// Extracts the next sentence: the first column (up to a tab) of each line
// becomes a word; a blank line or end of text terminates the sentence.
bool input_format_vertical::next_sentence(sentence& s, string& error) {
  error.clear();
  s.clear();

  // Skip tabs and newlines, counting newlines for paragraph detection
  while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) {
    preceeding_newlines += *text.str == '\n';
    text.str++, text.len--;
  }

  // Read first word without tabs on every line
  while (text.len && *text.str != '\r' && *text.str != '\n') {
    string_piece word = text;

    // Slurp the word
    while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
      text.str++, text.len--;
    word.len = text.str - word.str;
    s.add_word(word);

    // Skip spaces till end of line (any remaining columns are ignored)
    while (text.len && *text.str != '\r' && *text.str != '\n')
      text.str++, text.len--;

    // Skip one new line, treating CRLF as a single line break
    if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
      text.str += 2, text.len -= 2;
    else if (text.len && *text.str == '\n')
      text.str++, text.len--;

    // Skip tabs on the beginning of the line
    while (text.len && *text.str == '\t')
      text.str++, text.len--;
  }

  if (!s.empty()) {
    // Mark new document if needed
    if (new_document)
      s.set_new_doc(true, document_id);
    new_document = false;

    // Mark new paragraph if needed (>=2 newlines preceded this sentence)
    if (preceeding_newlines >= 2)
      s.set_new_par(true);
    preceeding_newlines = 0;

    // Sentence id
    s.set_sent_id(to_string(sentence_id++));
  }

  return !s.empty();
}
18299 
18300 // Presegmented tokenizer
// Presegmented tokenizer: feeds the wrapped tokenizer one input line at a
// time and merges everything it produces for that line into a single
// sentence, so each non-empty line yields exactly one sentence.
class input_format_presegmented_tokenizer : public input_format {
 public:
  // Takes ownership of the given tokenizer (stored in a unique_ptr).
  input_format_presegmented_tokenizer(input_format* tokenizer) : tokenizer(tokenizer) {}

  virtual bool read_block(istream& is, string& block) const override;
  virtual void reset_document(string_piece id) override;
  virtual void set_text(string_piece text, bool make_copy = false) override;
  virtual bool next_sentence(sentence& s, string& error) override;

 private:
  unique_ptr<input_format> tokenizer;  // underlying tokenizer, owned
  string_piece text;                   // unread remainder of the current text
  string text_copy;                    // owning copy when set_text(make_copy=true)
  bool new_document = true;            // next sentence opens a new document
  string document_id;
  unsigned preceeding_newlines = 2;    // newlines before next sentence; >=2 => new paragraph
  unsigned sentence_id = 1;            // 1-based id assigned to produced sentences
};
18319 
read_block(istream & is,string & block) const18320 bool input_format_presegmented_tokenizer::read_block(istream& is, string& block) const {
18321   if (getline(is, block))
18322     return block.push_back('\n'), true;
18323   return false;
18324 }
18325 
reset_document(string_piece id)18326 void input_format_presegmented_tokenizer::reset_document(string_piece id) {
18327   new_document = true;
18328   document_id.assign(id.str, id.len);
18329   preceeding_newlines = 2;
18330   sentence_id = 1;
18331   tokenizer->reset_document();
18332   set_text("");
18333 }
18334 
set_text(string_piece text,bool make_copy)18335 void input_format_presegmented_tokenizer::set_text(string_piece text, bool make_copy) {
18336   if (make_copy) {
18337     text_copy.assign(text.str, text.len);
18338     text = string_piece(text_copy.c_str(), text_copy.size());
18339   }
18340   this->text = text;
18341 }
18342 
next_sentence(sentence & s,string & error)18343 bool input_format_presegmented_tokenizer::next_sentence(sentence& s, string& error) {
18344   error.clear();
18345   s.clear();
18346 
18347   sentence partial;
18348   unsigned following_newlines = 0;
18349   while (text.len && s.empty()) {
18350     // Move next line from `text' to `line', including leading and following newlines
18351     string_piece line(text.str, 0);
18352     while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) {
18353       preceeding_newlines += line.str[line.len] == '\n';
18354       line.len++;
18355     }
18356     while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r'))
18357       line.len++;
18358     while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) {
18359       following_newlines += line.str[line.len] == '\n';
18360       line.len++;
18361     }
18362     text.str += line.len, text.len -= line.len;
18363 
18364     // Add all tokens from the line to `s'
18365     tokenizer->set_text(line, false);
18366     while (tokenizer->next_sentence(partial, error)) {
18367       // Append words
18368       size_t words = s.words.size() - 1;
18369       for (size_t i = 1; i < partial.words.size(); i++) {
18370         s.words.push_back(move(partial.words[i]));
18371         s.words.back().id += words;
18372         if (s.words.back().head > 0) s.words.back().head += words;
18373       }
18374 
18375       // Append multiword_tokens
18376       for (auto&& multiword_token : partial.multiword_tokens) {
18377         s.multiword_tokens.push_back(move(multiword_token));
18378         s.multiword_tokens.back().id_first += words;
18379         s.multiword_tokens.back().id_last += words;
18380       }
18381 
18382       // Append empty nodes
18383       for (auto&& empty_node : partial.empty_nodes) {
18384         s.empty_nodes.push_back(move(empty_node));
18385         s.empty_nodes.back().id += words;
18386       }
18387     }
18388     if (!error.empty()) return false;
18389 
18390     if (s.empty()) {
18391       preceeding_newlines += following_newlines;
18392       following_newlines = 0;
18393     }
18394   }
18395 
18396   if (!s.empty()) {
18397   // Mark new document if needed
18398     if (new_document)
18399       s.set_new_doc(true, document_id);
18400     new_document = false;
18401 
18402     // Mark new paragraph if needed
18403     if (preceeding_newlines >= 2)
18404       s.set_new_par(true);
18405     preceeding_newlines = following_newlines;
18406 
18407     // Sentence id
18408     s.set_sent_id(to_string(sentence_id++));
18409 
18410     // Fill "# text" comment
18411     s.comments.emplace_back("# text = ");
18412     for (size_t i = 1, j = 0; i < s.words.size(); i++) {
18413       const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form;
18414       if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
18415         i = s.multiword_tokens[j++].id_last;
18416 
18417       s.comments.back().append(tok.form);
18418       if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
18419     }
18420   }
18421 
18422   return !s.empty();
18423 }
18424 
18425 // Static factory methods
new_conllu_input_format(const string & options)18426 input_format* input_format::new_conllu_input_format(const string& options) {
18427   named_values::map parsed_options;
18428   string parse_error;
18429   if (!named_values::parse(options, parsed_options, parse_error))
18430     return nullptr;
18431 
18432   unsigned version = 2;
18433   if (parsed_options.count(CONLLU_V1))
18434     version = 1;
18435   if (parsed_options.count(CONLLU_V2))
18436     version = 2;
18437 
18438   return new input_format_conllu(version);
18439 }
18440 
new_generic_tokenizer_input_format(const string & options)18441 input_format* input_format::new_generic_tokenizer_input_format(const string& options) {
18442   named_values::map parsed_options;
18443   string parse_error;
18444   if (!named_values::parse(options, parsed_options, parse_error))
18445     return nullptr;
18446 
18447   bool normalized_spaces = parsed_options.count(GENERIC_TOKENIZER_NORMALIZED_SPACES);
18448   bool token_ranges = parsed_options.count(GENERIC_TOKENIZER_RANGES);
18449 
18450   input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges);
18451   return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result;
18452 }
18453 
new_horizontal_input_format(const string &)18454 input_format* input_format::new_horizontal_input_format(const string& /*options*/) {
18455   return new input_format_horizontal();
18456 }
18457 
new_vertical_input_format(const string &)18458 input_format* input_format::new_vertical_input_format(const string& /*options*/) {
18459   return new input_format_vertical();
18460 }
18461 
new_input_format(const string & name)18462 input_format* input_format::new_input_format(const string& name) {
18463   size_t equal = name.find('=');
18464   size_t name_len = equal != string::npos ? equal : name.size();
18465   size_t option_offset = equal != string::npos ? equal + 1 : name.size();
18466 
18467   if (name.compare(0, name_len, "conllu") == 0) return new_conllu_input_format(name.substr(option_offset));
18468   if (name.compare(0, name_len, "generic_tokenizer") == 0) return new_generic_tokenizer_input_format(name.substr(option_offset));
18469   if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_input_format(name.substr(option_offset));
18470   if (name.compare(0, name_len, "vertical") == 0) return new_vertical_input_format(name.substr(option_offset));
18471   return nullptr;
18472 }
18473 
new_presegmented_tokenizer(input_format * tokenizer)18474 input_format* input_format::new_presegmented_tokenizer(input_format* tokenizer) {
18475   return new input_format_presegmented_tokenizer(tokenizer);
18476 }
18477 
18478 /////////
18479 // File: utils/xml_encoded.h
18480 /////////
18481 
18482 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
18483 //
18484 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18485 // Mathematics and Physics, Charles University in Prague, Czech Republic.
18486 //
18487 // This Source Code Form is subject to the terms of the Mozilla Public
18488 // License, v. 2.0. If a copy of the MPL was not distributed with this
18489 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
18490 
18491 namespace utils {
18492 
18493 //
18494 // Declarations
18495 //
18496 
18497 // Print xml content while encoding <>& and optionally " using XML entities.
// Print xml content while encoding <>& and optionally " using XML entities.
class xml_encoded {
 public:
  // Wraps the given text; with encode_quot, '"' is escaped as well
  // (required inside attribute values).
  xml_encoded(string_piece str, bool encode_quot = false) : str(str), encode_quot(encode_quot) {}

  friend ostream& operator<<(ostream& os, xml_encoded data);
 private:
  string_piece str;       // text to encode (non-owning view)
  bool encode_quot;       // also escape '"' as &quot;
};
18507 
// Forward declaration of the stream operator defined below.
inline ostream& operator<<(ostream& os, xml_encoded data);
18509 
18510 //
18511 // Definitions
18512 //
18513 
operator <<(ostream & os,xml_encoded data)18514 ostream& operator<<(ostream& os, xml_encoded data) {
18515   string_piece& str = data.str;
18516   const char* to_print = str.str;
18517 
18518   while (str.len) {
18519     while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"'))
18520       str.str++, str.len--;
18521 
18522     if (str.len) {
18523       if (to_print < str.str) os.write(to_print, str.str - to_print);
18524       os << (*str.str == '<' ? "&lt;" : *str.str == '>' ? "&gt;" : *str.str == '&' ? "&amp;" : "&quot;");
18525       str.str++, str.len--;
18526       to_print = str.str;
18527     }
18528   }
18529 
18530   if (to_print < str.str) os.write(to_print, str.str - to_print);
18531 
18532   return os;
18533 }
18534 
18535 } // namespace utils
18536 
18537 /////////
18538 // File: sentence/output_format.cpp
18539 /////////
18540 
18541 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
18542 //
18543 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18544 // Mathematics and Physics, Charles University in Prague, Czech Republic.
18545 //
18546 // This Source Code Form is subject to the terms of the Mozilla Public
18547 // License, v. 2.0. If a copy of the MPL was not distributed with this
18548 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
18549 
// Recognized output-format option names.
const string output_format::CONLLU_V1 = "v1";
const string output_format::CONLLU_V2 = "v2";
const string output_format::HORIZONTAL_PARAGRAPHS = "paragraphs";
const string output_format::PLAINTEXT_NORMALIZED_SPACES = "normalized_spaces";
// Same option string as HORIZONTAL_PARAGRAPHS; each format parses its own.
const string output_format::VERTICAL_PARAGRAPHS = "paragraphs";
18555 
18556 // CoNLL-U output format
// CoNLL-U output format
class output_format_conllu : public output_format {
 public:
  // version selects v1 or v2 conventions (v1 maps spaces in forms to '_';
  // empty nodes are written only for v2 and later).
  output_format_conllu(unsigned version) : version(version) {}

  virtual void write_sentence(const sentence& s, ostream& os) override;

 private:
  unsigned version;
  static const string underscore;
  // CoNLL-U writes '_' for unset fields.
  const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
  // Writes str, replacing spaces by underscores under v1 rules.
  ostream& write_with_spaces(ostream& os, const string& str);
};

const string output_format_conllu::underscore = "_";
18571 
// Writes one sentence in CoNLL-U: comment lines, then the ten tab-separated
// fields per word, interleaving multiword-token ranges and (v2) empty nodes
// at their positions, terminated by a blank line.
void output_format_conllu::write_sentence(const sentence& s, ostream& os) {
  // Comments
  for (auto&& comment : s.comments)
    os << comment << '\n';

  // Words and multiword tokens
  size_t multiword_token = 0, empty_node = 0;
  for (int i = 0; i < int(s.words.size()); i++) {
    // Write non-root nodes
    if (i > 0) {
      // Multiword token if present (emitted as an "id_first-id_last" range line)
      if (multiword_token < s.multiword_tokens.size() &&
          i == s.multiword_tokens[multiword_token].id_first) {
        os << s.multiword_tokens[multiword_token].id_first << '-'
           << s.multiword_tokens[multiword_token].id_last << '\t';
        write_with_spaces(os, s.multiword_tokens[multiword_token].form) << "\t_\t_\t_\t_\t_\t_\t_\t"
           << underscore_on_empty(s.multiword_tokens[multiword_token].misc) << '\n';
        multiword_token++;
      }

      // Write the word; a negative head means "no head assigned" and prints '_'
      os << i << '\t';
      write_with_spaces(os, s.words[i].form) << '\t';
      write_with_spaces(os, underscore_on_empty(s.words[i].lemma)) << '\t'
         << underscore_on_empty(s.words[i].upostag) << '\t'
         << underscore_on_empty(s.words[i].xpostag) << '\t'
         << underscore_on_empty(s.words[i].feats) << '\t';
      if (s.words[i].head < 0) os << '_'; else os << s.words[i].head; os << '\t'
         << underscore_on_empty(s.words[i].deprel) << '\t'
         << underscore_on_empty(s.words[i].deps) << '\t'
         << underscore_on_empty(s.words[i].misc) << '\n';
    }

    // Empty nodes ("i.index" ids), which exist only in CoNLL-U v2
    if (version >= 2)
      for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) {
        os << i << '.' << s.empty_nodes[empty_node].index << '\t'
           << s.empty_nodes[empty_node].form << '\t'
           << underscore_on_empty(s.empty_nodes[empty_node].lemma) << '\t'
           << underscore_on_empty(s.empty_nodes[empty_node].upostag) << '\t'
           << underscore_on_empty(s.empty_nodes[empty_node].xpostag) << '\t'
           << underscore_on_empty(s.empty_nodes[empty_node].feats) << '\t'
           << "_\t"
           << "_\t"
           << underscore_on_empty(s.empty_nodes[empty_node].deps) << '\t'
           << underscore_on_empty(s.empty_nodes[empty_node].misc) << '\n';
      }
  }
  // Blank line terminates the sentence (endl also flushes the stream).
  os << endl;
}
18622 
write_with_spaces(ostream & os,const string & str)18623 ostream& output_format_conllu::write_with_spaces(ostream& os, const string& str) {
18624   if (version >= 2 || str.find(' ') == string::npos)
18625     os << str;
18626   else
18627     for (auto&& chr : str)
18628       os << (chr == ' ' ? '_' : chr);
18629 
18630   return os;
18631 }
18632 
18633 // EPE output format
18634 class output_format_epe : public output_format {
18635  public:
18636   virtual void write_sentence(const sentence& s, ostream& os) override;
18637   virtual void finish_document(ostream& os) override;
18638 
18639  private:
18640   class json_builder {
18641    public:
object()18642     json_builder& object() { comma(); json.push_back('{'); stack.push_back('}'); return *this; }
array()18643     json_builder& array() { comma(); json.push_back('['); stack.push_back(']'); return *this; }
close()18644     json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; }
key(string_piece name)18645     json_builder& key(string_piece name) { comma(); string(name); json.push_back(':'); return *this; }
value(string_piece value)18646     json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
value(size_t value)18647     json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
value_true()18648     json_builder& value_true() { comma(); json.push_back('t'); json.push_back('r'); json.push_back('u'); json.push_back('e'); comma_needed=true; return *this; }
18649 
current() const18650     string_piece current() const { return string_piece(json.data(), json.size()); }
clear()18651     void clear() { json.clear(); stack.clear(); comma_needed=false; }
18652 
18653    private:
comma()18654     void comma() {
18655       if (comma_needed) {
18656         json.push_back(',');
18657         json.push_back(' ');
18658       }
18659       comma_needed = false;
18660     }
string(string_piece str)18661     void string(string_piece str) {
18662       json.push_back('"');
18663       for (; str.len; str.str++, str.len--)
18664         switch (*str.str) {
18665           case '"': json.push_back('\\'); json.push_back('\"'); break;
18666           case '\\': json.push_back('\\'); json.push_back('\\'); break;
18667           case '\b': json.push_back('\\'); json.push_back('b'); break;
18668           case '\f': json.push_back('\\'); json.push_back('f'); break;
18669           case '\n': json.push_back('\\'); json.push_back('n'); break;
18670           case '\r': json.push_back('\\'); json.push_back('r'); break;
18671           case '\t': json.push_back('\\'); json.push_back('t'); break;
18672           default:
18673             if (((unsigned char)*str.str) < 32) {
18674               json.push_back('u'); json.push_back('0'); json.push_back('0'); json.push_back('0' + (*str.str >> 4)); json.push_back("0123456789ABCDEF"[*str.str & 0xF]);
18675             } else {
18676               json.push_back(*str.str);
18677             }
18678         }
18679       json.push_back('"');
18680     }
number(size_t value)18681     void number(size_t value) {
18682       size_t start_size = json.size();
18683       for (; value || start_size == json.size(); value /= 10)
18684         json.push_back('0' + (value % 10));
18685       reverse(json.begin() + start_size, json.end());
18686     }
18687 
18688     std::vector<char> json;
18689     std::vector<char> stack;
18690     bool comma_needed = false;
18691   } json;
18692 
18693   vector<string_piece> feats;
18694   size_t sentences = 0;
18695 };
18696 
// Serializes one sentence as a single-line JSON object of the shape
// {"id": n, "nodes": [{id, form, start/end, top, properties, edges}, ...]}.
void output_format_epe::write_sentence(const sentence& s, ostream& os) {
  json.object().key("id").value(++sentences).key("nodes").array();

  for (size_t i = 1; i < s.words.size(); i++) {
    json.object().key("id").value(i).key("form").value(s.words[i].form);

    // Original character range of the token, when known.
    size_t start, end;
    if (s.words[i].get_token_range(start, end))
      json.key("start").value(start).key("end").value(end);
    // Words whose head is the artificial root are marked as top nodes.
    if (s.words[i].head == 0)
      json.key("top").value_true();

    json.key("properties").object()
        .key("lemma").value(s.words[i].lemma)
        .key("upos").value(s.words[i].upostag)
        .key("xpos").value(s.words[i].xpostag);
    // Each "Key=Value" item of FEATS becomes an extra property; items
    // without '=' or with an empty value are skipped.
    split(s.words[i].feats, '|', feats);
    for (auto&& feat : feats) {
      string_piece key(feat.str, 0);
      while (key.len < feat.len && key.str[key.len] != '=')
        key.len++;
      if (key.len + 1 < feat.len)
        json.key(key).value(string_piece(key.str + key.len + 1, feat.len - key.len - 1));
    }
    json.close();

    // Outgoing dependency edges: the child's deprel as label, child id as target.
    if (!s.words[i].children.empty()) {
      json.key("edges").array();
      for (auto&& child : s.words[i].children)
        json.object().key("label").value(s.words[child].deprel).key("target").value(child).close();
      json.close();
    }

    json.close();
  }
  json.close().close();

  // Emit the accumulated JSON as one line and reset the builder.
  string_piece current = json.current();
  os.write(current.str, current.len).put('\n');
  json.clear();
}
18738 
finish_document(ostream &)18739 void output_format_epe::finish_document(ostream& /*os*/) {
18740   sentences = 0;
18741 }
18742 
18743 // Matxin output format
// Matxin output format: XML with one <SENTENCE> element per sentence and
// nested <NODE> elements mirroring the dependency tree.
class output_format_matxin : public output_format {
 public:
  virtual void write_sentence(const sentence& s, ostream& os) override;
  virtual void finish_document(ostream& os) override;

 private:
  // Recursively writes the subtree rooted at `node`, indenting by `pad`.
  void write_node(const sentence& s, int node, string& pad, ostream& os);

  // Sentences written so far; 0 means <corpus> has not been opened yet.
  int sentences = 0;
};
18754 
write_sentence(const sentence & s,ostream & os)18755 void output_format_matxin::write_sentence(const sentence& s, ostream& os) {
18756   if (!sentences) {
18757     os << "<corpus>";
18758   }
18759   os << "\n<SENTENCE ord=\"" << ++sentences << "\" alloc=\"" << 0 << "\">\n";
18760 
18761   string pad;
18762   for (auto&& node : s.words[0].children)
18763     write_node(s, node, pad, os);
18764 
18765   os << "</SENTENCE>" << endl;
18766 }
18767 
finish_document(ostream & os)18768 void output_format_matxin::finish_document(ostream& os) {
18769   os << "</corpus>\n";
18770 
18771   sentences = 0;
18772 }
18773 
write_node(const sentence & s,int node,string & pad,ostream & os)18774 void output_format_matxin::write_node(const sentence& s, int node, string& pad, ostream& os) {
18775   // <NODE ord="%d" alloc="%d" form="%s" lem="%s" mi="%s" si="%s">
18776   pad.push_back(' ');
18777 
18778   os << pad << "<NODE ord=\"" << node << "\" alloc=\"" << 0
18779      << "\" form=\"" << xml_encoded(s.words[node].form, true)
18780      << "\" lem=\"" << xml_encoded(s.words[node].lemma, true)
18781      << "\" mi=\"" << xml_encoded(s.words[node].feats, true)
18782      << "\" si=\"" << xml_encoded(s.words[node].deprel, true) << '"';
18783 
18784   if (s.words[node].children.empty()) {
18785     os << "/>\n";
18786   } else {
18787     os << ">\n";
18788     for (auto&& child : s.words[node].children)
18789       write_node(s, child, pad, os);
18790     os << pad << "</NODE>\n";
18791   }
18792 
18793   pad.pop_back();
18794 }
18795 
18796 // Horizontal output format
// Horizontal output format: one sentence per line, forms separated by
// single spaces; optionally a blank line between paragraphs/documents.
class output_format_horizontal : public output_format {
 public:
  output_format_horizontal(bool paragraphs) : paragraphs(paragraphs), empty(true) {}

  virtual void write_sentence(const sentence& s, ostream& os) override;
  virtual void finish_document(ostream& /*os*/) override { empty = true; }

 private:
  bool paragraphs;  // emit a blank line before a new paragraph/document
  bool empty;       // true until the first sentence of the document is written
};
18808 
write_sentence(const sentence & s,ostream & os)18809 void output_format_horizontal::write_sentence(const sentence& s, ostream& os) {
18810   if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
18811     os << '\n';
18812   empty = false;
18813 
18814   string line;
18815   for (size_t i = 1; i < s.words.size(); i++) {
18816     // Append word, but replace spaces by &nbsp;s
18817     for (auto&& chr : s.words[i].form)
18818       if (chr == ' ')
18819         line.append("\302\240");
18820       else
18821         line.push_back(chr);
18822 
18823     if (i+1 < s.words.size())
18824       line.push_back(' ');
18825   }
18826   os << line << endl;
18827 }
18828 
18829 // Plaintext output format
// Plaintext output format: reconstructs running text, either with
// normalized single-space joins or with the tokens' recorded spacing.
class output_format_plaintext : public output_format {
 public:
  output_format_plaintext(bool normalized): normalized(normalized), empty(true) {}

  virtual void write_sentence(const sentence& s, ostream& os) override;
  virtual void finish_document(ostream& /*os*/) override { empty = true; }
 private:
  bool normalized;  // true: single spaces per SpaceAfter; false: original spacing
  bool empty;       // true until the first sentence of the document is written
};
18840 
// Writes the sentence as plain text. In normalized mode, tokens are joined
// by single spaces according to get_space_after() and paragraphs separated
// by a blank line; otherwise the spacing recorded on the tokens is
// reproduced exactly.
void output_format_plaintext::write_sentence(const sentence& s, ostream& os) {
  if (normalized) {
    if (!empty && (s.get_new_doc() || s.get_new_par()))
      os << '\n';
    for (size_t i = 1, j = 0; i < s.words.size(); i++) {
      // Prefer the surface form of a multiword token over its component words.
      const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
      os << tok.form;
      if (i+1 < s.words.size() && tok.get_space_after())
        os << ' ';
      // Skip the words covered by the multiword token just written.
      if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
        i = s.multiword_tokens[j++].id_last;
    }
    os << endl;
  } else {
    string spaces;
    for (size_t i = 1, j = 0; i < s.words.size(); i++) {
      const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
      // Reproduce the whitespace recorded before, inside and after the token.
      tok.get_spaces_before(spaces); os << spaces;
      tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form);
      tok.get_spaces_after(spaces); os << spaces;
      if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
        i = s.multiword_tokens[j++].id_last;
    }
    os << flush;
  }
  empty = false;
}
18868 
18869 // Vertical output format
// Vertical output format: one form per line, blank line after each
// sentence; optionally an extra blank line between paragraphs/documents.
class output_format_vertical : public output_format {
 public:
  output_format_vertical(bool paragraphs) : paragraphs(paragraphs), empty(true) {}

  virtual void write_sentence(const sentence& s, ostream& os) override;
  virtual void finish_document(ostream& /*os*/) override { empty = true; }

 private:
  bool paragraphs;  // emit a blank line before a new paragraph/document
  bool empty;       // true until the first sentence of the document is written
};
18881 
write_sentence(const sentence & s,ostream & os)18882 void output_format_vertical::write_sentence(const sentence& s, ostream& os) {
18883   if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
18884     os << '\n';
18885   empty = false;
18886 
18887   for (size_t i = 1; i < s.words.size(); i++)
18888     os << s.words[i].form << '\n';
18889   os << endl;
18890 }
18891 
18892 // Static factory methods
new_conllu_output_format(const string & options)18893 output_format* output_format::new_conllu_output_format(const string& options) {
18894   named_values::map parsed_options;
18895   string parse_error;
18896   if (!named_values::parse(options, parsed_options, parse_error))
18897     return nullptr;
18898 
18899   unsigned version = 2;
18900   if (parsed_options.count(CONLLU_V1))
18901     version = 1;
18902   if (parsed_options.count(CONLLU_V2))
18903     version = 2;
18904 
18905   return new output_format_conllu(version);
18906 }
18907 
new_epe_output_format(const string &)18908 output_format* output_format::new_epe_output_format(const string& /*options*/) {
18909   return new output_format_epe();
18910 }
18911 
new_matxin_output_format(const string &)18912 output_format* output_format::new_matxin_output_format(const string& /*options*/) {
18913   return new output_format_matxin();
18914 }
18915 
new_horizontal_output_format(const string & options)18916 output_format* output_format::new_horizontal_output_format(const string& options) {
18917   named_values::map parsed_options;
18918   string parse_error;
18919   if (!named_values::parse(options, parsed_options, parse_error))
18920     return nullptr;
18921 
18922   return new output_format_horizontal(parsed_options.count(HORIZONTAL_PARAGRAPHS));
18923 }
18924 
new_plaintext_output_format(const string & options)18925 output_format* output_format::new_plaintext_output_format(const string& options) {
18926   named_values::map parsed_options;
18927   string parse_error;
18928   if (!named_values::parse(options, parsed_options, parse_error))
18929     return nullptr;
18930 
18931   return new output_format_plaintext(parsed_options.count(PLAINTEXT_NORMALIZED_SPACES));
18932 }
18933 
new_vertical_output_format(const string & options)18934 output_format* output_format::new_vertical_output_format(const string& options) {
18935   named_values::map parsed_options;
18936   string parse_error;
18937   if (!named_values::parse(options, parsed_options, parse_error))
18938     return nullptr;
18939 
18940   return new output_format_vertical(parsed_options.count(VERTICAL_PARAGRAPHS));
18941 }
18942 
new_output_format(const string & name)18943 output_format* output_format::new_output_format(const string& name) {
18944   size_t equal = name.find('=');
18945   size_t name_len = equal != string::npos ? equal : name.size();
18946   size_t option_offset = equal != string::npos ? equal + 1 : name.size();
18947 
18948   if (name.compare(0, name_len, "conllu") == 0) return new_conllu_output_format(name.substr(option_offset));
18949   if (name.compare(0, name_len, "epe") == 0) return new_epe_output_format(name.substr(option_offset));
18950   if (name.compare(0, name_len, "matxin") == 0) return new_matxin_output_format(name.substr(option_offset));
18951   if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_output_format(name.substr(option_offset));
18952   if (name.compare(0, name_len, "plaintext") == 0) return new_plaintext_output_format(name.substr(option_offset));
18953   if (name.compare(0, name_len, "vertical") == 0) return new_vertical_output_format(name.substr(option_offset));
18954   return nullptr;
18955 }
18956 
18957 /////////
18958 // File: sentence/sentence.cpp
18959 /////////
18960 
18961 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
18962 //
18963 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18964 // Mathematics and Physics, Charles University in Prague, Czech Republic.
18965 //
18966 // This Source Code Form is subject to the terms of the Mozilla Public
18967 // License, v. 2.0. If a copy of the MPL was not distributed with this
18968 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
18969 
// Form of the technical root word which is present at index 0 of every sentence.
const string sentence::root_form = "<root>";
18971 
// Create an empty sentence containing only the technical root word.
sentence::sentence() {
  clear();
}
18975 
// A sentence is empty when it contains only the technical root word
// (which is always present, see clear()).
bool sentence::empty() {
  return words.size() == 1;
}
18979 
clear()18980 void sentence::clear() {
18981   words.clear();
18982   multiword_tokens.clear();
18983   empty_nodes.clear();
18984   comments.clear();
18985 
18986   word& root = add_word(root_form);
18987   root.lemma = root.upostag = root.xpostag = root.feats = root_form;
18988 }
18989 
// Append a new word with the given form; its id is its index in `words`.
// The returned reference is invalidated by any further addition.
word& sentence::add_word(string_piece form) {
  words.emplace_back(words.size(), form);
  return words.back();
}
18994 
set_head(int id,int head,const string & deprel)18995 void sentence::set_head(int id, int head, const string& deprel) {
18996   assert(id >= 0 && id < int(words.size()));
18997   assert(head < int(words.size()));
18998 
18999   // Remove existing head
19000   if (words[id].head >= 0) {
19001     auto& children = words[words[id].head].children;
19002     for (size_t i = children.size(); i && children[i-1] >= id; i--)
19003       if (children[i-1] == id) {
19004         children.erase(children.begin() + i - 1);
19005         break;
19006       }
19007   }
19008 
19009   // Set new head
19010   words[id].head = head;
19011   words[id].deprel = deprel;
19012   if (head >= 0) {
19013     auto& children = words[head].children;
19014     size_t i = children.size();
19015     while (i && children[i-1] > id) i--;
19016     if (!i || children[i-1] < id) children.insert(children.begin() + i, id);
19017   }
19018 }
19019 
unlink_all_words()19020 void sentence::unlink_all_words() {
19021   for (auto&& word : words) {
19022     word.head = -1;
19023     word.deprel.clear();
19024     word.children.clear();
19025   }
19026 }
19027 
get_new_doc(string * id) const19028 bool sentence::get_new_doc(string* id) const {
19029   if (get_comment("newdoc id", id))
19030     return true;
19031   return get_comment("newdoc", id);
19032 }
19033 
set_new_doc(bool new_doc,string_piece id)19034 void sentence::set_new_doc(bool new_doc, string_piece id) {
19035   remove_comment("newdoc");
19036   remove_comment("newdoc id");
19037 
19038   if (new_doc && id.len)
19039     set_comment("newdoc id", id);
19040   else if (new_doc)
19041     set_comment("newdoc");
19042 }
19043 
get_new_par(string * id) const19044 bool sentence::get_new_par(string* id) const {
19045   if (get_comment("newpar id", id))
19046     return true;
19047   return get_comment("newpar", id);
19048 }
19049 
set_new_par(bool new_par,string_piece id)19050 void sentence::set_new_par(bool new_par, string_piece id) {
19051   remove_comment("newpar");
19052   remove_comment("newpar id");
19053 
19054   if (new_par && id.len)
19055     set_comment("newpar id", id);
19056   else if (new_par)
19057     set_comment("newpar");
19058 }
19059 
// Retrieve the "# sent_id" comment value into id; returns false
// (with id cleared) when the comment is not present.
bool sentence::get_sent_id(string& id) const {
  id.clear();

  return get_comment("sent_id", &id);
}
19065 
set_sent_id(string_piece id)19066 void sentence::set_sent_id(string_piece id) {
19067   remove_comment("sent_id");
19068 
19069   if (id.len)
19070     set_comment("sent_id", id);
19071 }
19072 
// Retrieve the "# text" comment value into text; returns false
// (with text cleared) when the comment is not present.
bool sentence::get_text(string& text) const {
  text.clear();

  return get_comment("text", &text);
}
19078 
set_text(string_piece text)19079 void sentence::set_text(string_piece text) {
19080   remove_comment("text");
19081 
19082   if (text.len)
19083     set_comment("text", text);
19084 }
19085 
get_comment(string_piece name,string * value) const19086 bool sentence::get_comment(string_piece name, string* value) const {
19087   for (auto&& comment : comments)
19088     if (comment[0] == '#') {
19089       // Skip spaces
19090       unsigned j = 1;
19091       while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
19092 
19093       // Try matching the name
19094       if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) {
19095         j += name.len;
19096         while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
19097         if (j < comment.size() && comment[j] == '=') {
19098           //We have a value
19099           j++;
19100           while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
19101           if (value) value->assign(comment, j, comment.size() - j);
19102         } else {
19103           // No value
19104           if (value) value->clear();
19105         }
19106 
19107         return true;
19108       }
19109     }
19110 
19111   return false;
19112 }
19113 
// Remove every comment starting with '#', optional whitespace and the given
// name (prefix match, consistently with get_comment).
void sentence::remove_comment(string_piece name) {
  // Iterate backwards so that erasing element i does not shift the
  // not-yet-visited elements.
  for (unsigned i = comments.size(); i--; )
    if (comments[i][0] == '#') {
      // Skip spaces
      unsigned j = 1;
      while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++;

      // Remove matching comments
      if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0)
        comments.erase(comments.begin() + i);
    }
}
19126 
set_comment(string_piece name,string_piece value)19127 void sentence::set_comment(string_piece name, string_piece value) {
19128   remove_comment(name);
19129 
19130   string comment;
19131   comment.append("# ").append(name.str, name.len);
19132   if (value.len) {
19133     comment.append(" = ");
19134     for (size_t i = 0; i < value.len; i++)
19135       comment.push_back(value.str[i] == '\r' || value.str[i] == '\n' ? ' ' : value.str[i]);
19136   }
19137   comments.push_back(move(comment));
19138 }
19139 
19140 /////////
19141 // File: sentence/token.cpp
19142 /////////
19143 
19144 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
19145 //
19146 // Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of
19147 // Mathematics and Physics, Charles University in Prague, Czech Republic.
19148 //
19149 // This Source Code Form is subject to the terms of the Mozilla Public
19150 // License, v. 2.0. If a copy of the MPL was not distributed with this
19151 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
19152 
// Construct a token, copying the given form and MISC data when non-empty.
token::token(string_piece form, string_piece misc) {
  if (form.len) this->form.assign(form.str, form.len);
  if (misc.len) this->misc.assign(misc.str, misc.len);
}
19157 
19158 // CoNLL-U defined SpaceAfter=No feature
get_space_after() const19159 bool token::get_space_after() const {
19160   string_piece value;
19161 
19162   return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0);
19163 }
19164 
set_space_after(bool space_after)19165 void token::set_space_after(bool space_after) {
19166   if (space_after)
19167     remove_misc_field("SpaceAfter");
19168   else
19169     start_misc_field("SpaceAfter").append("No");
19170 }
19171 
19172 // UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features
get_spaces_before(string & spaces_before) const19173 void token::get_spaces_before(string& spaces_before) const {
19174   string_piece value;
19175 
19176   if (get_misc_field("SpacesBefore", value))
19177     unescape_spaces(value, spaces_before);
19178   else
19179     spaces_before.clear();
19180 }
19181 
set_spaces_before(string_piece spaces_before)19182 void token::set_spaces_before(string_piece spaces_before) {
19183   if (spaces_before.len == 0)
19184     remove_misc_field("SpacesBefore");
19185   else
19186     append_escaped_spaces(spaces_before, start_misc_field("SpacesBefore"));
19187 }
19188 
get_spaces_after(string & spaces_after) const19189 void token::get_spaces_after(string& spaces_after) const {
19190   string_piece value;
19191 
19192   if (get_misc_field("SpacesAfter", value))
19193     unescape_spaces(value, spaces_after);
19194   else
19195     spaces_after.assign(get_space_after() ? " " : "");
19196 }
19197 
set_spaces_after(string_piece spaces_after)19198 void token::set_spaces_after(string_piece spaces_after) {
19199   if (spaces_after.len == 0) {
19200     set_space_after(false);
19201     remove_misc_field("SpacesAfter");
19202   } else if (spaces_after.len == 1 && spaces_after.str[0] == ' ') {
19203     set_space_after(true);
19204     remove_misc_field("SpacesAfter");
19205   } else {
19206     set_space_after(true);
19207     append_escaped_spaces(spaces_after, start_misc_field("SpacesAfter"));
19208   }
19209 }
19210 
get_spaces_in_token(string & spaces_in_token) const19211 void token::get_spaces_in_token(string& spaces_in_token) const {
19212   string_piece value;
19213 
19214   if (get_misc_field("SpacesInToken", value))
19215     unescape_spaces(value, spaces_in_token);
19216   else
19217     spaces_in_token.clear();
19218 }
19219 
set_spaces_in_token(string_piece spaces_in_token)19220 void token::set_spaces_in_token(string_piece spaces_in_token) {
19221   if (spaces_in_token.len == 0)
19222     remove_misc_field("SpacesInToken");
19223   else
19224     append_escaped_spaces(spaces_in_token, start_misc_field("SpacesInToken"));
19225 }
19226 
19227 // UDPipe-specific TokenRange feature
get_token_range(size_t & start,size_t & end) const19228 bool token::get_token_range(size_t& start, size_t& end) const {
19229   string_piece value;
19230 
19231   if (!get_misc_field("TokenRange", value)) return false;
19232 
19233   start = 0;
19234   while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
19235     if (start > (numeric_limits<size_t>::max() - (value.str[0] - '0')) / 10)
19236       return false;
19237     start = 10 * start + (value.str[0] - '0');
19238     value.str++, value.len--;
19239   }
19240 
19241   if (value.len == 0 || value.str[0] != ':') return false;
19242   value.str++, value.len--;
19243 
19244   end = 0;
19245   while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
19246     if (end > (numeric_limits<size_t>::max() - (value.str[0] - '0')) / 10)
19247       return false;
19248     end = 10 * end + (value.str[0] - '0');
19249     value.str++, value.len--;
19250   }
19251 
19252   return true;
19253 }
19254 
set_token_range(size_t start,size_t end)19255 void token::set_token_range(size_t start, size_t end) {
19256   if (start == size_t(string::npos))
19257     remove_misc_field("TokenRange");
19258   else
19259     start_misc_field("TokenRange").append(to_string(start)).append(1, ':').append(to_string(end));
19260 }
19261 
19262 // Private MISC field helpers
get_misc_field(string_piece name,string_piece & value) const19263 bool token::get_misc_field(string_piece name, string_piece& value) const {
19264   for (size_t index = 0; index < misc.size(); ) {
19265     if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
19266       index += name.len + 1;
19267       value.str = misc.c_str() + index;
19268       value.len = misc.find('|', index);
19269       value.len = (value.len == size_t(string::npos) ? misc.size() : value.len) - index;
19270       return true;
19271     }
19272     index = misc.find('|', index);
19273     if (index != size_t(string::npos)) index++;
19274   }
19275   return false;
19276 }
19277 
// Remove every "name=value" field from the '|'-separated MISC column.
void token::remove_misc_field(string_piece name) {
  for (size_t index = 0; index < misc.size(); )
    if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
      // The field ends at the next '|' or at the end of misc.
      size_t end_index = misc.find('|', index + name.len + 1);
      if (end_index == size_t(string::npos)) end_index = misc.size();

      // Be careful to delete at most one neighboring '|'
      if (index)
        misc.erase(index - 1, end_index - (index - 1));
      else
        misc.erase(index, end_index + (end_index < misc.size() ? 1 : 0) - index);
      // Note that index is intentionally not advanced: after the erase it
      // already points at the start of the following field (or at the end).
    } else {
      index = misc.find('|', index);
      if (index != size_t(string::npos)) index++;
    }
}
19294 
start_misc_field(string_piece name)19295 string& token::start_misc_field(string_piece name) {
19296   remove_misc_field(name);
19297   if (!misc.empty()) misc.push_back('|');
19298   misc.append(name.str, name.len).push_back('=');
19299   return misc;
19300 }
19301 
append_escaped_spaces(string_piece spaces,string & escaped_spaces) const19302 void token::append_escaped_spaces(string_piece spaces, string& escaped_spaces) const {
19303   for (unsigned i = 0; i < spaces.len; i++)
19304     switch (spaces.str[i]) {
19305       case ' ':
19306         escaped_spaces.push_back('\\'); escaped_spaces.push_back('s'); break;
19307       case '|':
19308         escaped_spaces.push_back('\\'); escaped_spaces.push_back('p'); break;
19309       case '\t':
19310         escaped_spaces.push_back('\\'); escaped_spaces.push_back('t'); break;
19311       case '\r':
19312         escaped_spaces.push_back('\\'); escaped_spaces.push_back('r'); break;
19313       case '\n':
19314         escaped_spaces.push_back('\\'); escaped_spaces.push_back('n'); break;
19315       case '\\':
19316         escaped_spaces.push_back('\\'); escaped_spaces.push_back('\\'); break;
19317       default:
19318         escaped_spaces.push_back(spaces.str[i]);
19319     }
19320 }
19321 
unescape_spaces(string_piece escaped_spaces,string & spaces) const19322 void token::unescape_spaces(string_piece escaped_spaces, string& spaces) const {
19323   spaces.clear();
19324 
19325   for (unsigned i = 0; i < escaped_spaces.len; i++)
19326     if (escaped_spaces.str[i] != '\\' || i+1 >= escaped_spaces.len)
19327       spaces.push_back(escaped_spaces.str[i]);
19328     else switch (escaped_spaces.str[++i]) {
19329       case 's':
19330         spaces.push_back(' '); break;
19331       case 'p':
19332         spaces.push_back('|'); break;
19333       case 't':
19334         spaces.push_back('\t'); break;
19335       case 'r':
19336         spaces.push_back('\r'); break;
19337       case 'n':
19338         spaces.push_back('\n'); break;
19339       case '\\':
19340         spaces.push_back('\\'); break;
19341       default:
19342         spaces.push_back(escaped_spaces.str[i - 1]);
19343         spaces.push_back(escaped_spaces.str[i]);
19344     }
19345 }
19346 
19347 /////////
19348 // File: tokenizer/detokenizer.h
19349 /////////
19350 
19351 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
19352 //
19353 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
19354 // Mathematics and Physics, Charles University in Prague, Czech Republic.
19355 //
19356 // This Source Code Form is subject to the terms of the Mozilla Public
19357 // License, v. 2.0. If a copy of the MPL was not distributed with this
19358 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
19359 
// Heuristic detokenizer: given a plain text corpus, it decides for a
// tokenized sentence which tokens should carry SpaceAfter=No, by counting
// how often neighboring forms appear adjacent versus space-separated
// in the corpus.
class detokenizer {
 public:
  // Index the given plain text so that detokenize can query it.
  detokenizer(const string& plain_text);

  // Set SpaceAfter for the tokens of s according to the indexed text.
  void detokenize(sentence& s) const;
 private:
  // Normalization modes used when querying the indexed text.
  enum { LOWERCASE, CATEGORIZE, TOTAL };

  // Occurrences of left+right written together minus written apart,
  // in the text normalized according to mode; `separate` additionally
  // requires a space on both sides of the queried pattern.
  int difference(const string& left, const string& right, bool separate, int mode) const;

  static string perform_lowercase(const string& input);
  static string perform_categorize(const string& input);
  bool has_letters(const string& word) const;
  bool only_digits(const string& word) const;

  // Suffix array supporting substring occurrence counting.
  class suffix_array {
   public:
    suffix_array(const string& str);
    suffix_array(suffix_array&& other) = default;

    // Number of occurrences of data as a substring of the indexed string.
    unsigned count(const string& data) const;

   private:
    vector<unsigned> sa;  // suffix start offsets, sorted lexicographically

    // Full lexicographic comparison of two suffixes of str.
    struct suffix_compare {
      suffix_compare(const string& str) : str(str) {}
      bool operator()(unsigned a, unsigned b) const { return str.compare(a, string::npos, str, b, string::npos) < 0; }
     private:
      const string& str;
    } suffix_comparator;

    // Predicate for lower_bound: compares only the first data.size() chars.
    struct suffix_lower_find {
      suffix_lower_find(const string& str) : str(str) {}
      bool operator()(unsigned a, const string& data) const { return str.compare(a, data.size(), data) < 0; }

     private:
      const string& str;
    } suffix_lower_finder;

    // Predicate for upper_bound: compares only the first data.size() chars.
    struct suffix_upper_find {
      suffix_upper_find(const string& str) : str(str) {}
      bool operator()(const string& data, unsigned a) const { return str.compare(a, data.size(), data) > 0; }

     private:
      const string& str;
    } suffix_upper_finder;
  };

  // Normalized copies of the input text and their suffix arrays.
  // Note: the data_* members must be declared before the sa_* members,
  // because the constructor initializes the suffix arrays from them.
  string data_lowercased, data_categorized;
  suffix_array sa_lowercased, sa_categorized;
};
19412 
19413 /////////
19414 // File: tokenizer/detokenizer.cpp
19415 /////////
19416 
19417 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
19418 //
19419 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
19420 // Mathematics and Physics, Charles University in Prague, Czech Republic.
19421 //
19422 // This Source Code Form is subject to the terms of the Mozilla Public
19423 // License, v. 2.0. If a copy of the MPL was not distributed with this
19424 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
19425 
// Precompute the normalized variants of the text and their suffix arrays.
// The data_* members are declared (and therefore initialized) before the
// sa_* members, so referencing them here is safe.
detokenizer::detokenizer(const string& plain_text)
    : data_lowercased(perform_lowercase(plain_text)), data_categorized(perform_categorize(plain_text)),
    sa_lowercased(data_lowercased), sa_categorized(data_categorized) {}
19429 
// Set SpaceAfter=No on tokens which the indexed corpus suggests are more
// frequently written adjacent to their successor than separated from it.
void detokenizer::detokenize(sentence& s) const {
  token* previous_tok = nullptr;
  for (size_t i = 1, j = 0; i < s.words.size(); i++) {
    // Process a multiword token as a whole, otherwise the individual word.
    token* tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (token*)&s.multiword_tokens[j] : (token*)&s.words[i];

    if (previous_tok) {
      // Should we add SpaceAfter=No to the previous form?
      // The heuristics below are tried in order; the first nonzero score
      // decides, and a positive score means "written together".
      int score = difference(previous_tok->form, tok->form, true, LOWERCASE);
      if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0;
      if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0;
      if (!score) score = difference(previous_tok->form, tok->form, false, LOWERCASE);
      if (!score) score = difference(previous_tok->form, tok->form, false, CATEGORIZE);
      if (!score) score = difference(previous_tok->form, tok->form, true, CATEGORIZE);

      if (score > 0)
        previous_tok->set_space_after(false);
    }

    // Remove the SpaceAfter attribute on current token
    tok->set_space_after(true);
    previous_tok = tok;

    // Skip the words covered by the multiword token just processed.
    if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
      i = s.multiword_tokens[j++].id_last;
  }
}
19456 
difference(const string & left,const string & right,bool separate,int mode) const19457 int detokenizer::difference(const string& left, const string& right, bool separate, int mode) const {
19458   auto& func = mode == LOWERCASE ? perform_lowercase : perform_categorize;
19459   auto& sa = mode == LOWERCASE ? sa_lowercased : sa_categorized;
19460 
19461   string left_mapped = func(left);
19462   string right_mapped = func(right);
19463   string pattern;
19464 
19465   pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":"");
19466   int together = sa.count(pattern);
19467 
19468   pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":"");
19469   int apart = sa.count(pattern);
19470 
19471   return together - apart;
19472 }
19473 
// Return a copy of the UTF-8 input with every character lowercased.
string detokenizer::perform_lowercase(const string& input) {
  using namespace unilib;

  string output;
  for (auto&& chr : utf8::decoder(input))
    utf8::append(output, unicode::lowercase(chr));
  return output;
}
19482 
// Map every character of the UTF-8 input to one-letter codes of its
// Unicode general categories; a character may emit several codes when
// its category matches several of the tested masks.
string detokenizer::perform_categorize(const string& input) {
  using namespace unilib;

  string output;
  for (auto&& chr : utf8::decoder(input)) {
    auto category = unicode::category(chr);
    if (category & unicode::C) output.push_back('C');
    if (category & unicode::L) output.push_back('L');
    if (category & unicode::M) output.push_back('M');
    if (category & unicode::N) output.push_back('N');
    if (category & unicode::Pc) output.push_back('c');
    if (category & unicode::Pd) output.push_back('d');
    if (category & unicode::Pe) output.push_back('e');
    if (category & unicode::Pf) output.push_back('f');
    if (category & unicode::Pi) output.push_back('i');
    if (category & unicode::Po) output.push_back('o');
    if (category & unicode::Ps) output.push_back('s');
    if (category & unicode::S) output.push_back('S');
    if (category & unicode::Zl) output.push_back('Z');
    if (category & unicode::Zp) output.push_back('z');
    // Space separators map to an actual space, which the difference()
    // patterns rely on.
    if (category & unicode::Zs) output.push_back(' ');
  }
  return output;
}
19507 
// True when the word contains at least one character of the Unicode
// letter category (L).
bool detokenizer::has_letters(const string& word) const {
  using namespace unilib;

  for (auto&& chr : utf8::decoder(word))
    if (unicode::category(chr) & unicode::L)
      return true;
  return false;
}
19516 
// True when every character of the word belongs solely to the Unicode
// number category (N); vacuously true for an empty word.
bool detokenizer::only_digits(const string& word) const {
  using namespace unilib;

  for (auto&& chr : utf8::decoder(word))
    if (unicode::category(chr) & ~unicode::N)
      return false;
  return true;
}
19525 
suffix_array(const string & str)19526 detokenizer::suffix_array::suffix_array(const string& str) : suffix_comparator(str), suffix_lower_finder(str), suffix_upper_finder(str) {
19527   sa.reserve(str.size());
19528   for (unsigned i = 0; i < str.size(); i++)
19529     sa.push_back(i);
19530 
19531   sort(sa.begin(), sa.end(), suffix_comparator);
19532 }
19533 
count(const string & data) const19534 unsigned detokenizer::suffix_array::count(const string& data) const {
19535   auto lower_it = lower_bound(sa.begin(), sa.end(), data, suffix_lower_finder);
19536   auto upper_it = upper_bound(sa.begin(), sa.end(), data, suffix_upper_finder);
19537   return upper_it - lower_it;
19538 }
19539 
19540 /////////
19541 // File: tokenizer/morphodita_tokenizer_wrapper.cpp
19542 /////////
19543 
19544 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
19545 //
19546 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
19547 // Mathematics and Physics, Charles University in Prague, Czech Republic.
19548 //
19549 // This Source Code Form is subject to the terms of the Mozilla Public
19550 // License, v. 2.0. If a copy of the MPL was not distributed with this
19551 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
19552 
// Store the wrapped MorphoDiTa tokenizer, the optional multiword splitter
// (may be nullptr) and the output options.
morphodita_tokenizer_wrapper::morphodita_tokenizer_wrapper(morphodita::tokenizer* tokenizer, const multiword_splitter* splitter,
                                                           bool normalized_spaces, bool token_ranges)
  : tokenizer(tokenizer), splitter(splitter), normalized_spaces(normalized_spaces), token_ranges(token_ranges) {}
19556 
// Read one block of input (as delimited by getpara) into block;
// returns false when no further data is available.
bool morphodita_tokenizer_wrapper::read_block(istream& is, string& block) const {
  return bool(getpara(is, block));
}
19560 
// Start a new document with the given id: the next produced sentence will
// carry a newdoc comment and sentence numbering restarts from 1.
void morphodita_tokenizer_wrapper::reset_document(string_piece id) {
  new_document = true;
  document_id.assign(id.str, id.len);
  // Pretend two preceding newlines so the first sentence also starts a paragraph.
  preceeding_newlines = 2;
  sentence_id = 1;
  set_text("");
  // Reset the character bookkeeping after set_text, which updates it.
  unicode_offset = 0;
  text_unicode_length = 0;
  saved_spaces.clear();
}
19571 
// Supply new text to tokenize. Leading whitespace is stripped and remembered
// in saved_spaces, character offsets are updated, and the text is handed to
// the underlying MorphoDiTa tokenizer (optionally after copying it locally).
void morphodita_tokenizer_wrapper::set_text(string_piece text, bool make_copy) {
  // Start by skipping spaces and copying them to saved_spaces
  string_piece following;
  for (char32_t chr;
       text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len),
                    (unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t');
       text = following, unicode_offset++)
    saved_spaces.append(text.str, following.str - text.str);

  // Offset unicode_offset by length of previous text, update text_unicode_length for the new text
  unicode_offset += text_unicode_length;
  text_unicode_length = 0;
  for (following = text; following.len; unilib::utf8::decode(following.str, following.len))
    text_unicode_length++;

  // Copy the text to local storage if needed
  if (make_copy) {
    text_copy.assign(text.str, text.len);
    text = string_piece(text_copy.c_str(), text_copy.size());
  }

  // Store the text locally and in the morphodita::tokenizer
  this->text = text;
  tokenizer->set_text(this->text, false);

}
19598 
// Produce the next sentence from the supplied text. Trims the returned
// forms, tracks inter-token whitespace (SpacesBefore/After, SpaceAfter=No),
// optionally stores TokenRange offsets, runs the multiword splitter, and
// fills newdoc/newpar/sent_id/text comments. Returns false when no further
// sentence is available (remaining text is stashed into saved_spaces).
bool morphodita_tokenizer_wrapper::next_sentence(sentence& s, string& error) {
  unsigned following_newlines = 0;

  s.clear();
  error.clear();

  if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) {
    // The forms returned by GRU tokenizer *should not* start/end with spaces,
    // but we trim them anyway (including all "remove empty forms/sentences" machinery).
    for (size_t i = 0; i < forms.size(); i++) {
      while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' ||
                              forms[i].str[0] == '\t' || forms[i].str[0] == ' '))
        forms[i].str++, forms[i].len--;
      while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' ||
                              forms[i].str[forms[i].len-1] == '\t' || forms[i].str[forms[i].len-1] == ' '))
        forms[i].len--;
      if (!forms[i].len)
        forms.erase(forms.begin() + i--);
    }
    // An all-whitespace sentence is skipped by recursing to the next one.
    if (!forms.size()) return next_sentence(s, error);

    for (size_t i = 0; i < forms.size(); i++) {
      // The form might contain spaces, even '\r', '\n' or '\t',
      // which we change to space. We also normalize multiple spaces to one.
      tok.form.clear();
      for (size_t j = 0; j < forms[i].len; j++) {
        char chr = forms[i].str[j];
        if (chr == '\r' || chr == '\n' || chr == '\t') chr = ' ';
        if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ')
          tok.form.push_back(chr);
      }

      // Track pre-sentence spaces and store SpacesBefore
      if (i == 0) {
        if (forms[0].str > text.str)
          saved_spaces.append(text.str, forms[0].str - text.str);
        preceeding_newlines += count(saved_spaces.begin(), saved_spaces.end(), '\n');
      }
      if (!normalized_spaces) {
        tok.set_spaces_before(i == 0 ? saved_spaces : "");
      }
      saved_spaces.clear();

      // Track post-sentence spaces and store SpaceAfter, SpacesInToken and SpacesAfter
      if (i+1 == forms.size()) {
        // Advance `text` past the last form, then consume the trailing whitespace.
        text.len -= forms[i].str + forms[i].len - text.str;
        text.str = forms[i].str + forms[i].len;

        string_piece following;
        for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len),
                                        (unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); text = following)
          saved_spaces.append(text.str, following.str - text.str);

        following_newlines += count(saved_spaces.begin(), saved_spaces.end(), '\n');
      }
      if (normalized_spaces) {
        // Only SpaceAfter=No is stored; a token is followed by a space
        // unless the next form starts immediately after it.
        tok.set_space_after(!(i+1 < forms.size() && forms[i+1].str == forms[i].str + forms[i].len));
      } else {
        tok.set_spaces_in_token(tok.form.size() != forms[i].len ? forms[i] : "");
        tok.set_spaces_after(i+1 == forms.size() ? saved_spaces : string_piece(forms[i].str + forms[i].len, forms[i+1].str - forms[i].str - forms[i].len));
      }
      saved_spaces.clear();

      // Store TokenRange if requested
      if (token_ranges)
        tok.set_token_range(unicode_offset + tokens[i].start, unicode_offset + tokens[i].start + tokens[i].length);

      if (splitter)
        splitter->append_token(tok.form, tok.misc, s);
      else
        s.add_word(tok.form).misc.assign(tok.misc);
    }

    // Mark new document if needed
    if (new_document) {
      s.set_new_doc(true, document_id);
      new_document = false;
    }

    // Mark new paragraph if needed
    if (preceeding_newlines >= 2)
      s.set_new_par(true);
    preceeding_newlines = following_newlines;

    s.set_sent_id(to_string(sentence_id++));

    // Fill "# text" comment
    s.comments.emplace_back("# text = ");
    for (size_t i = 1, j = 0; i < s.words.size(); i++) {
      // NOTE(review): casting `.form` (a string) to `const token&` looks
      // suspicious -- it presumably works only because `form` is the first
      // member of token; confirm against upstream UDPipe sources.
      const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form;
      if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
        i = s.multiword_tokens[j++].id_last;

      s.comments.back().append(tok.form);
      if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
    }

    return true;
  }

  // Save unused text parts.
  if (text.len) {
    saved_spaces.append(text.str, text.len);
    text.str += text.len;
    text.len = 0;
  }

  return false;
}
19708 
19709 /////////
19710 // File: tokenizer/multiword_splitter.cpp
19711 /////////
19712 
19713 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
19714 //
19715 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
19716 // Mathematics and Physics, Charles University in Prague, Czech Republic.
19717 //
19718 // This Source Code Form is subject to the terms of the Mozilla Public
19719 // License, v. 2.0. If a copy of the MPL was not distributed with this
19720 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
19721 
// Append the given token (with the given MISC field) to sentence s,
// splitting it into multiple words when it matches either a full rule or
// (for model version >= 2) a suffix rule. The casing of the original token
// is propagated to the generated words. If no rule matches, the token is
// appended as a single word.
void multiword_splitter::append_token(string_piece token, string_piece misc, sentence& s) const {
  using namespace unilib;

  // Buffer
  s.add_word();
  string& buffer = s.words.back().form;

  // Lowercase the token; rule keys are stored reversed (see load), so the
  // lowercased form is reversed before lookup.
  utf8::map(unicode::lowercase, token.str, token.len, buffer);
  reverse(buffer.begin(), buffer.end());

  // Try finding lowercased version in the full_rules
  size_t prefix_len = 0;
  auto it = full_rules.find(buffer);

  if (it == full_rules.end()) {
    if (version >= 2) {
      // Scratch string; borrowed from the word's misc field and cleared below.
      string& suffix = s.words.back().misc;
      // Try searching suffix_rules if needed. Because buffer is reversed,
      // growing a prefix of buffer corresponds to growing a suffix of the
      // token. suffix_rules also contains every proper prefix of each key
      // with an empty word list (see load), so a failed lookup means no
      // longer key can match and we can stop early; only entries with
      // nonempty words are real rules. The longest matching rule wins.
      while (suffix.size() + 1 < buffer.size()) {
        suffix.push_back(buffer[suffix.size()]);

        auto suffix_it = suffix_rules.find(suffix);
        if (suffix_it == suffix_rules.end())
          break;

        if (!suffix_it->second.words.empty()) {
          it = suffix_it;
          // Number of lowercased bytes of the token NOT covered by the rule.
          prefix_len = buffer.size() - suffix.size();
        }
      }
      suffix.clear();
    }

    if (!prefix_len) {
      // No match
      s.words.back().form.assign(token.str, token.len);
      if (misc.len) s.words.back().misc.assign(misc.str, misc.len);
      return;
    }
  }

  // Determine casing: UC_ALL when the first character is uppercase and no
  // non-uppercase letter occurs; UC_FIRST when the first character is
  // uppercase but some other letter is not.
  enum { UC_FIRST, UC_ALL, OTHER } casing = OTHER;

  if (unicode::category(utf8::first(token.str, token.len)) & unicode::Lut) {
    casing = UC_ALL;
    for (auto&& chr : utf8::decoder(token.str, token.len))
      if (unicode::category(chr) & (unicode::L & ~unicode::Lut)) { casing = UC_FIRST; break; }
  }

  // Fill the multiword token, covering word ids from the current word to
  // the last word the rule will generate.
  s.multiword_tokens.emplace_back(s.words.back().id, s.words.back().id + it->second.words.size() - 1, token, misc);

  s.words.back().form.clear();
  if (prefix_len) {
    // Note that prefix_len is measured in byte length of lowercased characters.
    // Walk the original token, lowercasing character by character, until the
    // lowercased output reaches prefix_len bytes; then copy that many
    // original (non-lowercased) bytes as the unmatched prefix of word one.
    string_piece suffix(token);
    while (s.words.back().form.size() < prefix_len && suffix.len)
      utf8::append(s.words.back().form, unicode::lowercase(utf8::decode(suffix.str, suffix.len)));
    s.words.back().form.assign(token.str, token.len - suffix.len);
  }
  // Append the first rule word, uppercasing per detected casing (UC_FIRST
  // uppercases only the very first character of the word).
  for (auto&& chr : utf8::decoder(it->second.words[0]))
    utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr);

  // Remaining rule words, fully uppercased only for UC_ALL.
  for (size_t i = 1; i < it->second.words.size(); i++)
    if (casing != UC_ALL) {
      s.add_word(it->second.words[i]);
    } else {
      s.add_word();
      utf8::map(unicode::uppercase, it->second.words[i], s.words.back().form);
    }
}
19795 
load(istream & is)19796 multiword_splitter* multiword_splitter::load(istream& is) {
19797   char version;
19798   if (!is.get(version)) return nullptr;
19799   if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr;
19800 
19801   binary_decoder data;
19802   if (!compressor::load(is, data)) return nullptr;
19803 
19804   unique_ptr<multiword_splitter> splitter(new multiword_splitter(version));
19805   try {
19806     for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) {
19807       string full_rule;
19808       data.next_str(full_rule);
19809       reverse(full_rule.begin(), full_rule.end());
19810 
19811       // Add the full_rule and its words
19812       auto& info = splitter->full_rules[full_rule];
19813       for (unsigned words = data.next_1B(); words; words--) {
19814         info.words.emplace_back();
19815         data.next_str(info.words.back());
19816       }
19817       if (info.words.empty()) return nullptr;
19818     }
19819 
19820     if (version >= 2)
19821       for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) {
19822         string suffix_rule;
19823         data.next_str(suffix_rule);
19824         reverse(suffix_rule.begin(), suffix_rule.end());
19825 
19826         // Add the suffix_rule and its words
19827         auto& info = splitter->suffix_rules[suffix_rule];
19828         for (unsigned words = data.next_1B(); words; words--) {
19829           info.words.emplace_back();
19830           data.next_str(info.words.back());
19831         }
19832         if (info.words.empty()) return nullptr;
19833 
19834         // Add prefixes of the suffix with empty data
19835         if (!suffix_rule.empty())
19836           for (suffix_rule.pop_back(); !suffix_rule.empty(); suffix_rule.pop_back())
19837             splitter->suffix_rules[suffix_rule];
19838       }
19839   } catch (binary_decoder_error&) {
19840     return nullptr;
19841   }
19842 
19843   return data.is_end() ? splitter.release() : nullptr;
19844 }
19845 
19846 /////////
19847 // File: tokenizer/multiword_splitter_trainer.h
19848 /////////
19849 
19850 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
19851 //
19852 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
19853 // Mathematics and Physics, Charles University in Prague, Czech Republic.
19854 //
19855 // This Source Code Form is subject to the terms of the Mozilla Public
19856 // License, v. 2.0. If a copy of the MPL was not distributed with this
19857 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
19858 
// Trainer creating a multiword_splitter model from gold-standard sentences.
class multiword_splitter_trainer {
 public:
  // Estimate splitting rules from data and serialize the resulting model
  // to os; on failure, fill error and return false.
  static bool train(const vector<sentence>& data, ostream& os, string& error);
};
19863 
19864 /////////
19865 // File: tokenizer/multiword_splitter_trainer.cpp
19866 /////////
19867 
19868 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
19869 //
19870 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
19871 // Mathematics and Physics, Charles University in Prague, Czech Republic.
19872 //
19873 // This Source Code Form is subject to the terms of the Mozilla Public
19874 // License, v. 2.0. If a copy of the MPL was not distributed with this
19875 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
19876 
train(const vector<sentence> & data,ostream & os,string & error)19877 bool multiword_splitter_trainer::train(const vector<sentence>& data, ostream& os, string& error) {
19878   using namespace unilib;
19879   error.clear();
19880 
19881   // Train
19882   struct rule_info {
19883     vector<string> words;
19884     unsigned count = 0;
19885   };
19886   map<string, rule_info> full_rules, suffix_rules;
19887 
19888   // Full rules
19889   string lc_form;
19890   vector<string> lc_words;
19891   for (auto&& sentence : data)
19892     for (auto&& multiword : sentence.multiword_tokens) {
19893       utf8::map(unicode::lowercase, multiword.form, lc_form);
19894       lc_words.clear();
19895       for (int i = multiword.id_first; i <= multiword.id_last; i++)
19896         utf8::map(unicode::lowercase, sentence.words[i].form, (lc_words.emplace_back(), lc_words.back()));
19897 
19898       auto& info = full_rules[lc_form];
19899       if (info.words.empty())
19900         info.words.assign(lc_words.begin(), lc_words.end());
19901       info.count += lc_words == info.words;
19902       if (!info.count) full_rules.erase(lc_form);
19903     }
19904 
19905   // Remove the full rules which trigger too negatively
19906   for (auto&& sentence : data)
19907     for (size_t i = 1, j = 0; i < sentence.words.size(); i++) {
19908       if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
19909         i = sentence.multiword_tokens[j++].id_last;
19910         continue;
19911       }
19912 
19913       utf8::map(unicode::lowercase, sentence.words[i].form, lc_form);
19914       auto it = full_rules.find(lc_form);
19915       if (it != full_rules.end())
19916         if (!--it->second.count)
19917           full_rules.erase(it);
19918     }
19919 
19920   // Suffix rules
19921   for (auto&& full_rule : full_rules) {
19922     size_t prefix_match = 0;
19923     while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++;
19924     for (; prefix_match; prefix_match--)
19925       if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) {
19926         lc_form.assign(full_rule.first, prefix_match, string::npos);
19927         lc_words.assign(full_rule.second.words.begin(), full_rule.second.words.end());
19928         lc_words[0].erase(0, prefix_match);
19929 
19930         auto& info = suffix_rules[lc_form];
19931         if (info.words.empty())
19932           info.words.assign(lc_words.begin(), lc_words.end());
19933         info.count += lc_words == info.words;
19934         if (!info.count) suffix_rules.erase(lc_form);
19935       }
19936   }
19937 
19938   // Remove the suffix rules which trigger too negatively
19939   for (auto&& sentence : data)
19940     for (size_t i = 1, j = 0; i < sentence.words.size(); i++) {
19941       if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
19942         i = sentence.multiword_tokens[j++].id_last;
19943         continue;
19944       }
19945 
19946       utf8::map(unicode::lowercase, sentence.words[i].form, lc_form);
19947       while (lc_form.size() > 1) {
19948         lc_form.erase(0, 1);
19949         auto it = suffix_rules.find(lc_form);
19950         if (it != suffix_rules.end()) {
19951           if (it->second.count <= 10)
19952             suffix_rules.erase(it);
19953           else
19954             it->second.count -= 10;
19955         }
19956       }
19957     }
19958 
19959   // Encode
19960   binary_encoder enc;
19961   enc.add_4B(full_rules.size());
19962   for (auto&& full_rule : full_rules) {
19963     enc.add_str(full_rule.first);
19964     enc.add_1B(full_rule.second.words.size());
19965     for (auto& word : full_rule.second.words)
19966       enc.add_str(word);
19967   }
19968   enc.add_4B(suffix_rules.size());
19969   for (auto&& suffix_rule : suffix_rules) {
19970     enc.add_str(suffix_rule.first);
19971     enc.add_1B(suffix_rule.second.words.size());
19972     for (auto& word : suffix_rule.second.words)
19973       enc.add_str(word);
19974   }
19975 
19976   // Save
19977   os.put(multiword_splitter::VERSION_LATEST);
19978   if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false;
19979 
19980   return true;
19981 }
19982 
19983 /////////
19984 // File: trainer/trainer.h
19985 /////////
19986 
19987 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
19988 //
19989 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
19990 // Mathematics and Physics, Charles University in Prague, Czech Republic.
19991 //
19992 // This Source Code Form is subject to the terms of the Mozilla Public
19993 // License, v. 2.0. If a copy of the MPL was not distributed with this
19994 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
19995 
// Entry point of model training: dispatches on the method name and offers
// deterministic hyperparameter-sampling helpers to concrete trainers.
class trainer {
 public:
  // Train a model with the given method and serialize it to os.
  // On failure, error is filled and false is returned.
  static bool train(const string& method, const vector<sentence>& train, const vector<sentence>& heldout,
                    const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error);

  static const string DEFAULT;  // the empty string
  static const string NONE;     // "none"

 protected:
  // Reproducible hyperparameter samples, fully determined by (run, index).
  static unsigned hyperparameter_integer(unsigned run, unsigned index, unsigned minimum, unsigned maximum);
  static double hyperparameter_uniform(unsigned run, unsigned index, double minimum, double maximum);
  static double hyperparameter_logarithmic(unsigned run, unsigned index, double minimum, double maximum);

 private:
  // Deterministic pseudorandom value in [0, 1) derived from (run, index).
  static double rnd(unsigned run, unsigned index);
};
20012 
20013 /////////
20014 // File: trainer/trainer_morphodita_parsito.h
20015 /////////
20016 
20017 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
20018 //
20019 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20020 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20021 //
20022 // This Source Code Form is subject to the terms of the Mozilla Public
20023 // License, v. 2.0. If a copy of the MPL was not distributed with this
20024 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20025 
// Trainer implementing the "morphodita_parsito" method: trains a tokenizer,
// a tagger and a parser component according to textual option strings.
class trainer_morphodita_parsito : public trainer {
 public:
  // Train all components and serialize them to os; on failure, fill error
  // and return false.
  static bool train(const vector<sentence>& training, const vector<sentence>& heldout,
                    const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error);

 private:
  // Training of the individual components, each driven by its options string.
  static bool train_tokenizer(const vector<sentence>& training, const vector<sentence>& heldout,
                              const string& options, ostream& os, string& error);
  static bool train_tagger(const vector<sentence>& training, const vector<sentence>& heldout,
                           const string& options, ostream& os, string& error);
  static bool train_parser(const vector<sentence>& training, const vector<sentence>& heldout,
                           const string& options, const string& tagger_model, ostream& os, string& error);

  // Generic model methods
  enum model_type { TOKENIZER_MODEL, TAGGER_MODEL, PARSER_MODEL };
  static bool load_model(const string& data, model_type model, string_piece& range);
  static const string& model_normalize_form(string_piece form, string& output);
  static const string& model_normalize_lemma(string_piece lemma, string& output);
  static void model_fill_word_analysis(const morphodita::tagged_lemma& analysis, bool upostag, int lemma, bool xpostag, bool feats, word& word);

  // Tagger-specific model methods
  static bool train_tagger_model(const vector<sentence>& training, const vector<sentence>& heldout,
                                 unsigned model, unsigned models, const named_values::map& tagger, ostream& os, string& error);
  static bool can_combine_tag(const word& w, string& error);
  static const string& combine_tag(const word& w, bool xpostag, bool feats, string& combined_tag);
  static const string& most_frequent_tag(const vector<sentence>& data, const string& upostag, bool xpostag, bool feats, string& combined_tag);
  static const string& combine_lemma(const word& w, int use_lemma, string& combined_lemma, const unordered_set<string>& flat_lemmas = unordered_set<string>());

  // Generic options handling (model selects a per-model option variant when >= 0)
  static const string& option_str(const named_values::map& options, const string& name, int model = -1);
  static bool option_int(const named_values::map& options, const string& name, int& value, string& error, int model = -1);
  static bool option_bool(const named_values::map& options, const string& name, bool& value, string& error, int model = -1);
  static bool option_double(const named_values::map& options, const string& name, double& value, string& error, int model = -1);

  // Various string data
  static const string empty_string;
  static const string tag_separators;
  static const string tagger_features_tagger;
  static const string tagger_features_lemmatizer;
  static const string parser_nodes;
};
20067 
20068 /////////
20069 // File: trainer/trainer.cpp
20070 /////////
20071 
20072 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
20073 //
20074 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20075 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20076 //
20077 // This Source Code Form is subject to the terms of the Mozilla Public
20078 // License, v. 2.0. If a copy of the MPL was not distributed with this
20079 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20080 
// Definitions of the trainer string constants (DEFAULT is the empty string).
const string trainer::DEFAULT;
const string trainer::NONE = "none";
20083 
train(const string & method,const vector<sentence> & training,const vector<sentence> & heldout,const string & tokenizer,const string & tagger,const string & parser,ostream & os,string & error)20084 bool trainer::train(const string& method, const vector<sentence>& training, const vector<sentence>& heldout,
20085                     const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error) {
20086   error.clear();
20087 
20088   stringstream os_buffer;
20089   os_buffer.put(method.size());
20090   os_buffer.write(method.c_str(), method.size());
20091 
20092   try {
20093     if (method == "morphodita_parsito") {
20094       if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error))
20095         return false;
20096     } else {
20097       error.assign("Unknown UDPipe method '").append(method).append("'!");
20098       return false;
20099     }
20100   } catch (training_error& e) {
20101     error.assign(e.what());
20102     return false;
20103   }
20104 
20105   os << os_buffer.rdbuf();
20106   return true;
20107 }
20108 
hyperparameter_integer(unsigned run,unsigned index,unsigned minimum,unsigned maximum)20109 unsigned trainer::hyperparameter_integer(unsigned run, unsigned index, unsigned minimum, unsigned maximum) {
20110   return minimum + int((maximum - minimum + 1) * rnd(run, index));
20111 }
20112 
hyperparameter_uniform(unsigned run,unsigned index,double minimum,double maximum)20113 double trainer::hyperparameter_uniform(unsigned run, unsigned index, double minimum, double maximum) {
20114   return minimum + (maximum - minimum) * rnd(run, index);
20115 }
20116 
hyperparameter_logarithmic(unsigned run,unsigned index,double minimum,double maximum)20117 double trainer::hyperparameter_logarithmic(unsigned run, unsigned index, double minimum, double maximum) {
20118   return exp(log(minimum) + (log(maximum) - log(minimum)) * rnd(run, index));
20119 }
20120 
rnd(unsigned run,unsigned index)20121 double trainer::rnd(unsigned run, unsigned index) {
20122   uint32_t state = 12345U;
20123   for (unsigned i = 0; i < 10; i++)
20124     state = state * 1103515245U + run * 19999999U + index * 1000000007U + 12345U;
20125   return (state >> 16) / double(1<<16);
20126 }
20127 
20128 /////////
20129 // File: morphodita/tagger/elementary_features_encoder.h
20130 /////////
20131 
20132 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
20133 //
20134 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20135 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20136 //
20137 // This Source Code Form is subject to the terms of the Mozilla Public
20138 // License, v. 2.0. If a copy of the MPL was not distributed with this
20139 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20140 
20141 namespace morphodita {
20142 
20143 template <class Map>
save(ostream & os)20144 inline bool elementary_features<Map>::save(ostream& os) {
20145   binary_encoder enc;
20146 
20147   enc.add_1B(maps.size());
20148   for (auto&& map : maps)
20149     map.save(enc);
20150 
20151   return compressor::save(os, enc);
20152 }
20153 
20154 } // namespace morphodita
20155 
20156 /////////
20157 // File: morphodita/tagger/feature_sequences_encoder.h
20158 /////////
20159 
20160 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
20161 //
20162 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20163 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20164 //
20165 // This Source Code Form is subject to the terms of the Mozilla Public
20166 // License, v. 2.0. If a copy of the MPL was not distributed with this
20167 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20168 
20169 namespace morphodita {
20170 
// Parse textual feature sequence definitions from is, one sequence per line.
// Each line is a comma-separated list of elements of the form
// "elementary_feature_name sequence_index". The index of every element is
// validated against the feature's declared type (DYNAMIC, PER_TAG) and range
// (ONLY_CURRENT); any violation signals training_failure. Finally the
// sequences are sorted by decreasing dependant_range and the score maps are
// resized to match.
template <class ElementaryFeatures, class Map>
void feature_sequences<ElementaryFeatures, Map>::parse(int window_size, istream& is) {
  // Index the declared elementary features by name, rejecting duplicates.
  unordered_map<string, elementary_feature_description> elementary_map;
  for (auto&& description : ElementaryFeatures::descriptions)
    if (!elementary_map.emplace(description.name, description).second)
      training_failure("Repeated elementary feature with name " << description.name << '!');

  string line;
  vector<string> tokens;
  while (getline(is, line)) {
    split(line, ',', tokens);
    if (tokens.empty()) training_failure("Feature sequence cannot be empty!");

    bool contains_only_current = false;
    sequences.emplace_back();
    for (auto&& token : tokens) {
      // Each element is "<name> <sequence_index>".
      vector<string> parts;
      split(token, ' ', parts);
      if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!");
      auto it = elementary_map.find(parts[0]);
      if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!");

      // Validate the sequence index against the feature's type and range.
      auto& desc = it->second;
      int sequence_index = parse_int(parts[1].c_str(), "sequence_index");
      if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!");
      if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!");
      if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!");

      sequences.back().elements.emplace_back(it->second.type, it->second.index, sequence_index);
      // Track how far back this sequence depends on previously decided tags.
      if (desc.type == DYNAMIC) sequences.back().dependant_range = max(sequences.back().dependant_range, window_size + 1);
      if (desc.type == PER_TAG) sequences.back().dependant_range = max(sequences.back().dependant_range, 1 - sequence_index);
      contains_only_current |= desc.range == ONLY_CURRENT;
    }
    if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!");
  }

  // Sequences with larger dependant_range first; one score map per sequence.
  stable_sort(sequences.begin(), sequences.end(), [](const feature_sequence& a, const feature_sequence& b) { return a.dependant_range > b.dependant_range; });
  scores.resize(sequences.size());
}
20210 
20211 template <class ElementaryFeatures, class Map>
save(ostream & os)20212 inline bool feature_sequences<ElementaryFeatures, Map>::save(ostream& os) {
20213   if (!elementary.save(os)) return false;
20214 
20215   binary_encoder enc;
20216   enc.add_1B(sequences.size());
20217   for (auto&& sequence : sequences) {
20218     enc.add_4B(sequence.dependant_range);
20219     enc.add_1B(sequence.elements.size());
20220     for (auto&& element : sequence.elements) {
20221       enc.add_4B(element.type);
20222       enc.add_4B(element.elementary_index);
20223       enc.add_4B(element.sequence_index);
20224     }
20225   }
20226 
20227   enc.add_1B(scores.size());
20228   for (auto&& score : scores)
20229     score.save(enc);
20230 
20231   return compressor::save(os, enc);
20232 }
20233 
20234 } // namespace morphodita
20235 
20236 /////////
20237 // File: morphodita/tagger/training_maps.h
20238 /////////
20239 
20240 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
20241 //
20242 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20243 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20244 //
20245 // This Source Code Form is subject to the terms of the Mozilla Public
20246 // License, v. 2.0. If a copy of the MPL was not distributed with this
20247 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20248 
20249 namespace morphodita {
20250 
20251 // Declarations
// Elementary feature map used during training: unknown features are
// assigned new consecutive identifiers on first access (the empty feature
// is pre-mapped to elementary_feature_empty).
class training_elementary_feature_map {
 public:
  // Return the identifier of the given feature, creating one if unseen.
  inline elementary_feature_value value(const char* feature, int len) const;
  mutable unordered_map<string, elementary_feature_value> map = {{"", elementary_feature_empty}};
 private:
  mutable string key;  // scratch buffer reused across lookups
};
20259 
// Feature sequence score map used during training, storing for every
// feature key its weight (alpha), an accumulated value (gamma) and the
// iteration of the last gamma update.
class training_feature_sequence_map {
 public:
  struct info {
    // We deliberately use feature_sequence*s*_score to check for overflow
    feature_sequences_score alpha = 0;
    feature_sequences_score gamma = 0;
    int last_gamma_update = 0;
  };

  // Return the current weight (alpha) of the feature, or 0 when unseen.
  inline feature_sequence_score score(const char* feature, int len) const;
  mutable unordered_map<string, info> map;
 private:
  mutable string key;  // scratch buffer reused across lookups
};
20274 
// Feature sequences instantiated with the training-time map types.
template <template <class> class ElementaryFeatures> using train_feature_sequences = feature_sequences<ElementaryFeatures<training_elementary_feature_map>, training_feature_sequence_map>;
20276 
20277 // Definitions
value(const char * feature,int len) const20278 elementary_feature_value training_elementary_feature_map::value(const char* feature, int len) const {
20279   key.assign(feature, len);
20280   return map.emplace(key, elementary_feature_empty + map.size()).first->second;
20281 }
20282 
score(const char * feature,int len) const20283 feature_sequence_score training_feature_sequence_map::score(const char* feature, int len) const {
20284   key.assign(feature, len);
20285   auto it = map.find(key);
20286   return it != map.end() ? it->second.alpha : 0;
20287 }
20288 
20289 } // namespace morphodita
20290 
20291 /////////
20292 // File: morphodita/tagger/feature_sequences_optimizer.h
20293 /////////
20294 
20295 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
20296 //
20297 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20298 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20299 //
20300 // This Source Code Form is subject to the terms of the Mozilla Public
20301 // License, v. 2.0. If a copy of the MPL was not distributed with this
20302 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20303 
20304 namespace morphodita {
20305 
// Declarations
// Optimizer converting training-time feature sequences (mutable hash-map
// storage) into their persistent, serializable counterparts, remapping
// feature identifiers by usage count and dropping unused entries.
template <class T>
class feature_sequences_optimizer;

// Specialization matching only feature sequences built on the training maps.
template <template <class, class> class FeatureSequences, template <class> class ElementaryFeatures>
class feature_sequences_optimizer<FeatureSequences<ElementaryFeatures<training_elementary_feature_map>, training_feature_sequence_map>> {
 public:
  typedef FeatureSequences<ElementaryFeatures<training_elementary_feature_map>, training_feature_sequence_map> original_feature_sequences;
  typedef FeatureSequences<ElementaryFeatures<persistent_elementary_feature_map>, persistent_feature_sequence_map> optimized_feature_sequences;

  // Fill optimized_features from the trained features.
  static void optimize(const original_feature_sequences& features, optimized_feature_sequences& optimized_features);
};
20318 
20319 // Definitions
20320 template <template <class, class> class FeatureSequences, template <class> class ElementaryFeatures>
optimize(const original_feature_sequences & features,optimized_feature_sequences & optimized_features)20321 void feature_sequences_optimizer<FeatureSequences<ElementaryFeatures<training_elementary_feature_map>, training_feature_sequence_map>>::optimize(const original_feature_sequences& features, optimized_feature_sequences& optimized_features) {
20322   const ElementaryFeatures<training_elementary_feature_map>& elementary = features.elementary;
20323   ElementaryFeatures<persistent_elementary_feature_map>& optimized_elementary = optimized_features.elementary;
20324 
20325   // Iterate over feature sequences of non-zero weight and count number of
20326   // occurences in corresponding elementary feature maps.
20327   // In order to be able to do so, precompute map_index for elements of features.sequences.
20328   vector<vector<int>> map_indices(features.sequences.size());
20329   for (unsigned i = 0; i < map_indices.size(); i++) {
20330     for (auto&& element : features.sequences[i].elements)
20331       for (auto&& description : decltype(features.elementary)::descriptions)
20332         if (element.type == description.type && element.elementary_index == description.index)
20333           map_indices[i].emplace_back(description.map_index);
20334 
20335     assert(map_indices[i].size() == features.sequences[i].elements.size());
20336   }
20337 
20338   struct count_info { elementary_feature_value ori = 0; int count = 0; };
20339   vector<vector<count_info>> counts(elementary.maps.size());
20340   vector<elementary_feature_value> elementary_ids;
20341   for (unsigned i = 0; i < features.sequences.size(); i++)
20342     for (auto&& element : features.scores[i].map)
20343       if (element.second.gamma) {
20344         elementary_ids.clear();
20345         for (const char* key = element.first.c_str(); key != element.first.c_str() + element.first.size(); assert(key <= element.first.c_str() + element.first.size()))
20346           elementary_ids.emplace_back(vli<elementary_feature_value>::decode(key));
20347 
20348         assert(elementary_ids.size() == features.sequences[i].elements.size());
20349         for (unsigned j = 0; j < elementary_ids.size(); j++) {
20350           if (map_indices[i][j] < 0) continue;
20351           if (elementary_ids[j] >= counts[map_indices[i][j]].size()) counts[map_indices[i][j]].resize(elementary_ids[j] + 1);
20352           counts[map_indices[i][j]][elementary_ids[j]].count++;
20353         }
20354       }
20355 
20356   // Sort counts by sizes decreasing
20357   for (auto&& count : counts) {
20358     if (elementary_feature_empty >= count.size()) count.resize(elementary_feature_empty + 1);
20359     count[elementary_feature_unknown].count = 0;
20360     count[elementary_feature_empty].count = 1;
20361     for (elementary_feature_value i = 0; i < count.size(); i++) count[i].ori = i;
20362     sort(count.begin() + elementary_feature_empty + 1, count.end(), [](const count_info& a, const count_info& b){ return a.count > b.count; });
20363   }
20364 
20365   // Create an elementary ids map
20366   vector<vector<elementary_feature_value>> elementary_ids_map(counts.size());
20367   for (unsigned i = 0; i < counts.size(); i++) {
20368     elementary_ids_map[i].resize(counts[i].size());
20369     for (elementary_feature_value j = 0; j < counts[i].size(); j++)
20370       elementary_ids_map[i][counts[i][j].ori] = counts[i][j].count ? j : elementary_feature_unknown;
20371   }
20372 
20373   // Make optimized elementary maps by applying elementary ids map
20374   optimized_elementary.maps.clear();
20375   for (unsigned i = 0; i < elementary.maps.size(); i++) {
20376     unordered_map<string, elementary_feature_value> mapped_ids;
20377     for (auto&& element : elementary.maps[i].map)
20378       if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown)
20379         mapped_ids.emplace(element.first, elementary_ids_map[i][element.second]);
20380 
20381     optimized_elementary.maps.emplace_back(persistent_unordered_map(mapped_ids, 1, [](binary_encoder& enc, int id) {
20382       enc.add_4B(id);
20383     }));
20384   }
20385 
20386   // Remap keys in feature sequences by applying elementary_ids_map to appropriate subkeys
20387   optimized_features.sequences = features.sequences;
20388   optimized_features.scores.clear();
20389   vector<char> key_buffer;
20390   for (unsigned i = 0; i < features.sequences.size(); i++) {
20391     decltype(features.scores[i].map) updated_map;
20392     for (auto&& element : features.scores[i].map)
20393       if (element.second.gamma) {
20394         elementary_ids.clear();
20395         for (const char* key = element.first.c_str(); key < element.first.c_str() + element.first.size(); )
20396           elementary_ids.emplace_back(vli<elementary_feature_value>::decode(key));
20397 
20398         assert(elementary_ids.size() == features.sequences[i].elements.size());
20399         for (unsigned j = 0; j < elementary_ids.size(); j++) {
20400           if (map_indices[i][j] < 0) continue;
20401           assert(elementary_ids[j] < elementary_ids_map[map_indices[i][j]].size() && elementary_ids_map[map_indices[i][j]][elementary_ids[j]] != elementary_feature_unknown);
20402           elementary_ids[j] = elementary_ids_map[map_indices[i][j]][elementary_ids[j]];
20403         }
20404 
20405         key_buffer.resize(elementary_ids.size() * vli<elementary_feature_value>::max_length());
20406         char* key = key_buffer.data();
20407         for (unsigned j = 0; j < elementary_ids.size(); j++)
20408           vli<elementary_feature_value>::encode(elementary_ids[j], key);
20409 
20410         updated_map.emplace(string(key_buffer.data(), key - key_buffer.data()), element.second);
20411       }
20412 
20413     optimized_features.scores.emplace_back(persistent_unordered_map(updated_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) {
20414       assert(feature_sequence_score(info.gamma) == info.gamma);
20415       enc.add_4B(info.gamma);
20416     }));
20417   }
20418 
20419   // Original code which only dropped feature sequences with gamma == 0
20420   // optimized_elementary.maps.clear();
20421   // for (auto&& map : elementary.maps)
20422   //   optimized_elementary.maps.emplace_back(persistent_unordered_map(map.map, 1, [](binary_encoder& enc, elementary_feature_value value) {
20423   //     enc.add_4B(value);
20424   //   }));
20425   //
20426   // optimized_features.sequences = features.sequences;
20427   // optimized_features.scores.clear();
20428   // for (auto&& score : features.scores) {
20429   //   decltype(score.map) pruned_map;
20430   //   for (auto&& element : score.map)
20431   //     if (element.second.gamma)
20432   //       pruned_map.insert(element);
20433   //
20434   //   optimized_features.scores.emplace_back(persistent_unordered_map(pruned_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) {
20435   //     enc.add_4B(info.gamma);
20436   //   }));
20437   // }
20438 }
20439 
20440 } // namespace morphodita
20441 
20442 /////////
20443 // File: morphodita/tagger/tagger_trainer.h
20444 /////////
20445 
20446 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
20447 //
20448 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20449 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20450 //
20451 // This Source Code Form is subject to the terms of the Mozilla Public
20452 // License, v. 2.0. If a copy of the MPL was not distributed with this
20453 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20454 
20455 namespace morphodita {
20456 
20457 // Declarations
// Generic tagger trainer. It loads the morphological dictionary and the
// training/heldout data, copies the raw dictionary to the output model, and
// delegates the actual training to the TaggerTrainer template argument.
template <class TaggerTrainer>
class tagger_trainer {
 public:
  // One training sentence together with its gold annotation.
  struct sentence {
    vector<string> words;                   // original word forms
    vector<string_piece> forms;             // forms trimmed via morpho::raw_form_len
    vector<vector<tagged_lemma>> analyses;  // morphological analyses of each form
    vector<tagged_lemma> gold;              // gold lemma-tag pair of each form
    vector<int> gold_index;                 // index of the gold pair in analyses, or -1 if absent
  };

  // Train a tagger using the given morphological dictionary, feature templates
  // and training (plus optional heldout) data, writing the model to out_tagger.
  static void train(int decoding_order, int window_size, int iterations, istream& in_morpho_dict, bool use_guesser, istream& in_feature_templates, bool prune_features, istream& in_train, istream& in_heldout, bool early_stopping, ostream& out_tagger);

 private:
  // Load tagger training data from is, analysing the forms with d. When
  // add_gold is set, gold analyses not generated by the morphology are added
  // explicitly. Returns the fraction of forms whose gold analysis was generated.
  static double load_data(istream& is, const morpho& d, bool use_guesser, vector<sentence>& sentences, bool add_gold);
};
20474 
20475 // Definitions
20476 template <class TaggerTrainer>
train(int decoding_order,int window_size,int iterations,istream & in_morpho_dict,bool use_guesser,istream & in_feature_templates,bool prune_features,istream & in_train,istream & in_heldout,bool early_stopping,ostream & out_tagger)20477 void tagger_trainer<TaggerTrainer>::train(int decoding_order, int window_size, int iterations, istream& in_morpho_dict, bool use_guesser, istream& in_feature_templates, bool prune_features, istream& in_train, istream& in_heldout, bool early_stopping, ostream& out_tagger) {
20478 //  cerr << "Loading dictionary: ";
20479   unique_ptr<morpho> d(morpho::load(in_morpho_dict));
20480   if (!d) training_failure("Cannot load dictionary!");
20481 //  cerr << "done" << endl;
20482   if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!");
20483 
20484   vector<sentence> train_data;
20485 //  cerr << "Loading train data: ";
20486 //  cerr << "done, matched " << fixed << setprecision(2) << 100 * load_data(in_train, *d, use_guesser, train_data, true) << '%' << endl;
20487   load_data(in_train, *d, use_guesser, train_data, true);
20488 
20489   vector<sentence> heldout_data;
20490   if (in_heldout) {
20491 //    cerr << "Loading heldout data: ";
20492 //    cerr << "done, matched " << fixed << setprecision(2) << 100 * load_data(in_heldout, *d, use_guesser, heldout_data, false) << '%' << endl;
20493     load_data(in_heldout, *d, use_guesser, heldout_data, false);
20494   }
20495 
20496   // Encode morphological dictionary
20497 //  cerr << "Encoding morphological dictionary." << endl;
20498   out_tagger << in_morpho_dict.rdbuf();
20499   out_tagger.put(use_guesser);
20500 
20501   // Train and encode the tagger
20502   TaggerTrainer::train(decoding_order, window_size, iterations, train_data, heldout_data, early_stopping, prune_features, in_feature_templates, out_tagger);
20503 }
20504 
20505 template <class TaggerTrainer>
load_data(istream & is,const morpho & d,bool use_guesser,vector<sentence> & sentences,bool add_gold)20506 double tagger_trainer<TaggerTrainer>::load_data(istream& is, const morpho& d, bool use_guesser, vector<sentence>& sentences, bool add_gold) {
20507   sentences.clear();
20508 
20509   int forms = 0, forms_matched = 0;
20510 
20511   string line;
20512   vector<string> tokens;
20513   sentences.emplace_back();
20514   while (getline(is, line)) {
20515     if (line.empty()) {
20516       if (!sentences.back().words.empty())
20517         sentences.emplace_back();
20518       continue;
20519     }
20520 
20521     split(line, '\t', tokens);
20522     if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!");
20523 
20524     // Add form to sentence
20525     forms++;
20526     sentence& s = sentences.back();
20527     s.words.emplace_back(tokens[0]);
20528     s.forms.emplace_back(string_piece(s.words.back().c_str(), d.raw_form_len(s.words.back())));
20529     s.gold.emplace_back(tokens[1], tokens[2]);
20530     s.gold_index.emplace_back(-1);
20531 
20532     // Analyse
20533     s.analyses.emplace_back();
20534     d.analyze(tokens[0], use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, s.analyses.back());
20535 
20536     // Locate gold analysis
20537     for (size_t i = 0; i < s.analyses.back().size(); i++)
20538       if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) {
20539         s.gold_index.back() = i;
20540         forms_matched++;
20541         break;
20542       }
20543     if (s.gold_index.back() == -1 && add_gold) {
20544       s.gold_index.back() = s.analyses.back().size();
20545       s.analyses.back().emplace_back(tokens[1], tokens[2]);
20546     }
20547   }
20548   if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back();
20549 
20550   return forms_matched / double(forms);
20551 }
20552 
20553 } // namespace morphodita
20554 
20555 /////////
20556 // File: morphodita/tagger/perceptron_tagger_trainer.h
20557 /////////
20558 
20559 // This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>.
20560 //
20561 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20562 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20563 //
20564 // This Source Code Form is subject to the terms of the Mozilla Public
20565 // License, v. 2.0. If a copy of the MPL was not distributed with this
20566 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20567 
20568 namespace morphodita {
20569 
20570 // Declarations
// Averaged-perceptron tagger trainer, used as the TaggerTrainer argument of
// tagger_trainer. FeatureSequences describes the feature templates used.
template <class FeatureSequences>
class perceptron_tagger_trainer {
 public:
  typedef typename tagger_trainer<perceptron_tagger_trainer<FeatureSequences>>::sentence sentence;

  // Parse the feature templates, train the tagger and encode the resulting
  // optimized feature sequences to out_tagger.
  static void train(int decoding_order, int window_size, int iterations, const vector<sentence>& train, const vector<sentence>& heldout, bool early_stopping, bool prune_features, istream& in_feature_templates, ostream& out_tagger);

 private:
  // Run the averaged-perceptron training loop with Viterbi decoding, updating
  // the weights in features in place.
  static void train_viterbi(int decoding_order, int window_size, int iterations, const vector<sentence>& train, const vector<sentence>& heldout, bool early_stopping, bool prune_features, FeatureSequences& features);
};
20581 
20582 // Definitions
20583 template <class FeatureSequences>
train(int decoding_order,int window_size,int iterations,const vector<sentence> & train,const vector<sentence> & heldout,bool early_stopping,bool prune_features,istream & in_feature_templates,ostream & out_tagger)20584 void perceptron_tagger_trainer<FeatureSequences>::train(int decoding_order, int window_size, int iterations, const vector<sentence>& train, const vector<sentence>& heldout, bool early_stopping, bool prune_features, istream& in_feature_templates, ostream& out_tagger) {
20585   FeatureSequences features;
20586 
20587 //  cerr << "Parsing feature templates..." << endl;
20588   features.parse(window_size, in_feature_templates);
20589 
20590 //  cerr << "Training tagger..." << endl;
20591   train_viterbi(decoding_order, window_size, iterations, train, heldout, early_stopping, prune_features, features);
20592 
20593 //  cerr << "Encoding tagger..." << endl;
20594   typedef feature_sequences_optimizer<FeatureSequences> optimizer;
20595   typename optimizer::optimized_feature_sequences optimized_features;
20596   optimizer::optimize(features, optimized_features);
20597   if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!");
20598 }
20599 
20600 template <class FeatureSequences>
train_viterbi(int decoding_order,int window_size,int iterations,const vector<sentence> & train,const vector<sentence> & heldout,bool early_stopping,bool prune_features,FeatureSequences & features)20601 void perceptron_tagger_trainer<FeatureSequences>::train_viterbi(int decoding_order, int window_size, int iterations, const vector<sentence>& train, const vector<sentence>& heldout, bool early_stopping, bool prune_features, FeatureSequences& features) {
20602   int best_correct = 0, best_iteration = -1;
20603   FeatureSequences best_features;
20604 
20605   viterbi<FeatureSequences> decoder(features, decoding_order, window_size);
20606   typename decltype(decoder)::cache decoder_cache(decoder);
20607 
20608   typename FeatureSequences::cache feature_sequences_cache(features);
20609   typename FeatureSequences::dynamic_features decoded_dynamic_features, gold_dynamic_features;
20610   vector<string> decoded_feature_sequences_keys, gold_feature_sequences_keys;
20611 
20612   vector<int> window(window_size);
20613 
20614   // Initialize feature sequences for the gold decoding only if requested
20615   if (prune_features)
20616     for (unsigned s = 0; s < train.size(); s++) {
20617       auto& sentence = train[s];
20618       features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache);
20619       for (int i = 0; i < int(sentence.forms.size()); i++) {
20620         window.assign(window_size, -1);
20621         for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j];
20622 
20623         features.compute_dynamic_features(i, window[0], &gold_dynamic_features, gold_dynamic_features, feature_sequences_cache);
20624         features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache);
20625 
20626         for (unsigned f = 0; f < features.scores.size(); f++)
20627           if (!gold_feature_sequences_keys[f].empty())
20628             features.scores[f].map[gold_feature_sequences_keys[f]];
20629       }
20630     }
20631 
20632   // Train for given number of iterations
20633   for (int i = 0; i < iterations; i++) {
20634     // Train
20635     int train_correct = 0, train_total = 0;
20636     cerr << "Iteration " << i + 1 << ": ";
20637 
20638     vector<int> tags;
20639     for (unsigned s = 0; s < train.size(); s++) {
20640       auto& sentence = train[s];
20641 
20642       // Run Viterbi
20643       if (tags.size() < sentence.forms.size()) tags.resize(2 * sentence.forms.size());
20644       decoder.tag(sentence.forms, sentence.analyses, decoder_cache, tags);
20645 
20646       // Compute feature sequence keys or decoded result and gold result and update alpha & gamma
20647       features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache);
20648       for (int i = 0; i < int(sentence.forms.size()); i++) {
20649         train_correct += tags[i] == sentence.gold_index[i];
20650         train_total++;
20651 
20652         window.assign(window_size, -1);
20653         for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = tags[i - j];
20654         features.compute_dynamic_features(i, window[0], &decoded_dynamic_features, decoded_dynamic_features, feature_sequences_cache);
20655         features.feature_keys(i, window.data(), 0, decoded_dynamic_features, decoded_feature_sequences_keys, feature_sequences_cache);
20656 
20657         for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j];
20658         features.compute_dynamic_features(i, window[0], &gold_dynamic_features, gold_dynamic_features, feature_sequences_cache);
20659         features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache);
20660 
20661         for (unsigned f = 0; f < features.scores.size(); f++) {
20662           if (decoded_feature_sequences_keys[f] != gold_feature_sequences_keys[f]) {
20663             if (!decoded_feature_sequences_keys[f].empty()) {
20664               auto it = features.scores[f].map.find(decoded_feature_sequences_keys[f]);
20665               if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first;
20666               if (it != features.scores[f].map.end()) {
20667                 auto& decoded_info = it->second;
20668                 decoded_info.gamma += decoded_info.alpha * (s - decoded_info.last_gamma_update);
20669                 decoded_info.last_gamma_update = s;
20670                 decoded_info.alpha--;
20671               }
20672             }
20673 
20674             if (!gold_feature_sequences_keys[f].empty()) {
20675               auto it = features.scores[f].map.find(gold_feature_sequences_keys[f]);
20676               if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first;
20677               if (it != features.scores[f].map.end()) {
20678                 auto& gold_info = it->second;
20679                 gold_info.gamma += gold_info.alpha * (s - gold_info.last_gamma_update);
20680                 gold_info.last_gamma_update = s;
20681                 gold_info.alpha++;
20682               }
20683             }
20684           }
20685         }
20686       }
20687     }
20688 
20689     // Finalize incremental gamma updates
20690     for (auto&& score : features.scores)
20691       for (auto&& element : score.map) {
20692         element.second.gamma += element.second.alpha * (train.size() - element.second.last_gamma_update);
20693         element.second.last_gamma_update = 0;
20694       }
20695     cerr << "done, accuracy " << fixed << setprecision(2) << train_correct * 100 / double(train_total) << '%';
20696 
20697     // If we have any heldout data, compute accuracy and if requested store best tagger configuration
20698     if (!heldout.empty()) {
20699       enum { TAGS, LEMMAS, BOTH, TOTAL };
20700       int heldout_correct[TOTAL] = {}, heldout_total = 0;
20701 
20702       typedef feature_sequences_optimizer<FeatureSequences> optimizer;
20703       typename optimizer::optimized_feature_sequences frozen_features;
20704       optimizer::optimize(features, frozen_features);
20705       viterbi<decltype(frozen_features)> frozen_decoder(frozen_features, decoding_order, window_size);
20706       typename decltype(frozen_decoder)::cache frozen_decoder_cache(frozen_decoder);
20707 
20708       for (auto&& sentence : heldout) {
20709         if (tags.size() < sentence.forms.size()) tags.resize(sentence.forms.size() * 2);
20710         frozen_decoder.tag(sentence.forms, sentence.analyses, frozen_decoder_cache, tags);
20711 
20712         for (unsigned i = 0; i < sentence.forms.size(); i++) {
20713           heldout_correct[TAGS] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag;
20714           heldout_correct[LEMMAS] += sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma;
20715           heldout_correct[BOTH] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag && sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma;
20716           heldout_total++;
20717         }
20718       }
20719 
20720       if (early_stopping && heldout_correct[BOTH] > best_correct) {
20721         best_correct = heldout_correct[BOTH];
20722         best_iteration = i;
20723         best_features = features;
20724       }
20725 
20726       cerr << ", heldout accuracy " << fixed << setprecision(2)
20727           << 100 * heldout_correct[TAGS] / double(heldout_total) << "%t/"
20728           << 100 * heldout_correct[LEMMAS] / double(heldout_total) << "%l/"
20729           << 100 * heldout_correct[BOTH] / double(heldout_total) << "%b";
20730     }
20731     cerr << endl;
20732   }
20733 
20734   if (early_stopping && best_iteration >= 0) {
20735     cerr << "Chosen tagger model from iteration " << best_iteration + 1 << endl;
20736     features = best_features;
20737   }
20738 }
20739 
20740 } // namespace morphodita
20741 
20742 /////////
20743 // File: utils/options.h
20744 /////////
20745 
20746 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
20747 //
20748 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20749 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20750 //
20751 // This Source Code Form is subject to the terms of the Mozilla Public
20752 // License, v. 2.0. If a copy of the MPL was not distributed with this
20753 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20754 
20755 namespace utils {
20756 
20757 class options {
20758  public:
20759   typedef unordered_map<string, string> map;
20760 
20761   struct value {
20762     enum allowed_t { NONE, ANY, SET };
20763     allowed_t allowed;
20764     unordered_set<string> set;
20765 
valueufal::udpipe::utils::options::value20766     value(initializer_list<string> set) : allowed(SET), set(set) {}
20767     static const value none;
20768     static const value any;
20769 
20770    private:
valueufal::udpipe::utils::options::value20771     value(allowed_t allowed) : allowed(allowed) {}
20772   };
20773 
20774   // Parse options according to allowed map. If successful, argv is reordered so
20775   // that non-option arguments are placed in argv[1] to argv[argc-1]. The '--'
20776   // indicates end of option arguments (as usual).  The allowed map contains
20777   // values allowed for every option. If empty, no value is allowed, if it
20778   // contains just an empty string, any value is allowed.
20779   static bool parse(const unordered_map<string, value>& allowed, int& argc, char**& argv, map& options);
20780 };
20781 
20782 } // namespace utils
20783 
20784 /////////
20785 // File: version/version.h
20786 /////////
20787 
20788 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
20789 //
20790 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
20791 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20792 //
20793 // This Source Code Form is subject to the terms of the Mozilla Public
20794 // License, v. 2.0. If a copy of the MPL was not distributed with this
20795 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20796 
// Semantic version of the library.
class version {
 public:
  unsigned major;          // major version number
  unsigned minor;          // minor version number
  unsigned patch;          // patch version number
  std::string prerelease;  // pre-release suffix; presumably empty for releases — TODO confirm

  // Returns current version.
  static version current();

  // Returns multi-line formatted version and copyright string.
  static string version_and_copyright(const string& other_libraries = string());
};
20810 
20811 /////////
20812 // File: trainer/trainer_morphodita_parsito.cpp
20813 /////////
20814 
20815 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
20816 //
20817 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20818 // Mathematics and Physics, Charles University in Prague, Czech Republic.
20819 //
20820 // This Source Code Form is subject to the terms of the Mozilla Public
20821 // License, v. 2.0. If a copy of the MPL was not distributed with this
20822 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
20823 
train(const vector<sentence> & training,const vector<sentence> & heldout,const string & tokenizer,const string & tagger,const string & parser,ostream & os,string & error)20824 bool trainer_morphodita_parsito::train(const vector<sentence>& training, const vector<sentence>& heldout,
20825                                        const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error) {
20826   error.clear();
20827 
20828   // Save model version info
20829   os.put(model_morphodita_parsito::VERSION_LATEST);
20830   // Add sentinel required since version 2
20831   os.put(0x7F).put(0x7F);
20832 
20833   // Check input data
20834   for (auto&& sentence : training)
20835     for (size_t i = 1; i < sentence.words.size(); i++)
20836       if (!can_combine_tag(sentence.words[i], error))
20837         return false;
20838   for (auto&& sentence : heldout)
20839     for (size_t i = 1; i < sentence.words.size(); i++)
20840       if (!can_combine_tag(sentence.words[i], error))
20841         return false;
20842 
20843   if (!train_tokenizer(training, heldout, tokenizer, os, error)) return false;
20844   string tagger_model;
20845   {
20846     ostringstream os_tagger;
20847     if (!train_tagger(training, heldout, tagger, os_tagger, error)) return false;
20848     tagger_model.assign(os_tagger.str());
20849     os.write(tagger_model.data(), tagger_model.size());
20850   }
20851   if (!train_parser(training, heldout, parser, tagger_model, os, error)) return false;
20852 
20853   return true;
20854 }
20855 
train_tokenizer(const vector<sentence> & training,const vector<sentence> & heldout,const string & options,ostream & os,string & error)20856 bool trainer_morphodita_parsito::train_tokenizer(const vector<sentence>& training, const vector<sentence>& heldout,
20857                                                  const string& options, ostream& os, string& error) {
20858   if (options == NONE) {
20859     os.put(0);
20860   } else {
20861     // Tokenizer options
20862     named_values::map tokenizer;
20863     if (!named_values::parse(options, tokenizer, error)) return false;
20864     int run = 0; if (!option_int(tokenizer, "run", run, error)) return false;
20865 
20866     if (tokenizer.count("from_model")) {
20867       // Use specified tokenizer model
20868       string_piece tokenizer_data;
20869       if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data))
20870         return error.assign("Cannot load model from which the tokenizer should be used!"), false;
20871 
20872       cerr << "Using tokenizer from given model." << endl;
20873       os.write(tokenizer_data.str, tokenizer_data.len);
20874     } else {
20875       os.put(1);
20876       const string& model = option_str(tokenizer, "model");
20877 
20878       // Tokenizer itself
20879       if (model == "generic") {
20880         os.put(morphodita::tokenizer_id::GENERIC);
20881         morphodita::generic_tokenizer_factory_encoder::encode(morphodita::generic_tokenizer::LATEST, os);
20882       } else if (model.empty() || model == "gru") {
20883         // Create a detokenizator if required
20884         unique_ptr<detokenizer> detokenizer;
20885         if (tokenizer.count("detokenize")) {
20886           detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"]));
20887           if (!detokenizer) return error.assign("Cannot create detokenizer!"), false;
20888         }
20889 
20890         // Prepare training data for the gru_tokenizer
20891         vector<morphodita::tokenized_sentence> sentences;
20892         bool spaces_in_training = false;
20893         for (size_t training_sentence = 0; training_sentence < training.size(); training_sentence++) {
20894           sentence s = training[training_sentence];
20895           if (detokenizer) detokenizer->detokenize(s);
20896 
20897           auto& sentence = (sentences.emplace_back(), sentences.back());
20898 
20899           for (size_t i = 1, j = 0; i < s.words.size(); i++) {
20900             const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ?
20901                 (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
20902 
20903             sentence.tokens.emplace_back(sentence.sentence.size(), 0);
20904             for (auto&& chr : unilib::utf8::decoder(tok.form)) {
20905               sentence.sentence.push_back(chr);
20906               if (unilib::unicode::category(chr) & unilib::unicode::Zs) spaces_in_training = true;
20907             }
20908             sentence.tokens.back().length = sentence.sentence.size() - sentence.tokens.back().start;
20909 
20910             if (tok.get_space_after()) sentence.sentence.push_back(' ');
20911 
20912             if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
20913               i = s.multiword_tokens[j++].id_last;
20914           }
20915           if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par()))
20916             sentence.sentence.append(2, '\n');
20917         }
20918 
20919         // Heldout data
20920         vector<morphodita::tokenized_sentence> heldout_sentences;
20921 
20922         bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false;
20923         for (size_t heldout_sentence = 0; heldout_sentence < heldout.size(); heldout_sentence++) {
20924           sentence s = heldout[heldout_sentence];
20925           if (detokenizer && detokenize_handout) detokenizer->detokenize(s);
20926 
20927           auto& sentence = (heldout_sentences.emplace_back(), heldout_sentences.back());
20928 
20929           for (size_t i = 1, j = 0; i < s.words.size(); i++) {
20930             const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ?
20931                 (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
20932 
20933             sentence.tokens.emplace_back(sentence.sentence.size(), 0);
20934             for (auto&& chr : unilib::utf8::decoder(tok.form))
20935               sentence.sentence.push_back(chr);
20936             sentence.tokens.back().length = sentence.sentence.size() - sentence.tokens.back().start;
20937 
20938             if (tok.get_space_after()) sentence.sentence.push_back(' ');
20939 
20940             if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
20941               i = s.multiword_tokens[j++].id_last;
20942           }
20943           if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par()))
20944             sentence.sentence.append(2, '\n');
20945         }
20946 
20947         // Options
20948         bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false;
20949         int segment_size = 50; // if (!option_int(tokenizer, "segment_size", segment_size, error)) return false;
20950         bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false;
20951         int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false;
20952         int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false;
20953         int batch_size = run <= 1 ? 50 : 50 + 50 * hyperparameter_integer(run, 1, 0, 1);
20954         if (!option_int(tokenizer, "batch_size", batch_size, error)) return false;
20955         double learning_rate = run <= 1 ? 0.005 : hyperparameter_logarithmic(run, 2, 0.0005, 0.01);
20956         if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false;
20957         double learning_rate_final = 0.0; // if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false;
20958         double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false;
20959         double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false;
20960         bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false;
20961 
20962         if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size
20963                            << ", learning_rate=" << fixed << setprecision(8) << learning_rate << endl;
20964 
20965         cerr << "Training tokenizer with the following options: " << "tokenize_url=" << (tokenize_url ? 1 : 0)
20966              << ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl
20967              << "  epochs=" << epochs << ", batch_size=" << batch_size << ", learning_rate=" << fixed << setprecision(4) << learning_rate
20968              << ", dropout=" << dropout << ", early_stopping=" << (early_stopping ? 1 : 0) << endl;
20969 
20970         // Train and encode gru_tokenizer
20971         os.put(morphodita::tokenizer_ids::GRU);
20972         if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0,
20973                                                       segment_size, allow_spaces, dimension, epochs, batch_size, learning_rate,
20974                                                       learning_rate_final, dropout, initialization_range, early_stopping,
20975                                                       sentences, heldout_sentences, os, error))
20976           return false;
20977       } else {
20978         return error.assign("Unknown tokenizer model '").append(model).append("'!"), false;
20979       }
20980 
20981       // Multiword splitter
20982       if (!multiword_splitter_trainer::train(training, os, error)) return false;
20983     }
20984   }
20985 
20986   return true;
20987 }
20988 
train_tagger(const vector<sentence> & training,const vector<sentence> & heldout,const string & options,ostream & os,string & error)20989 bool trainer_morphodita_parsito::train_tagger(const vector<sentence>& training, const vector<sentence>& heldout,
20990                                               const string& options, ostream& os, string& error) {
20991   if (options == NONE) {
20992     os.put(0);
20993   } else {
20994     // Parse options
20995     named_values::map tagger;
20996     if (!named_values::parse(options, tagger, error)) return false;
20997 
20998     if (tagger.count("from_model")) {
20999       // Use specified tokenizer model(s)
21000       int model_index = 1, taggers_total = 0;
21001       string model_name = "from_model";
21002       vector<string_piece> taggers_data;
21003       do {
21004         taggers_data.emplace_back();
21005         if (!load_model(tagger[model_name], TAGGER_MODEL, taggers_data.back()))
21006           return error.assign("Cannot load model from which the tagger should be used!"), false;
21007         if (taggers_data.back().str[0])
21008           taggers_total += taggers_data.back().str[0];
21009         else
21010           taggers_data.pop_back();
21011         model_name = "from_model_" + to_string(++model_index);
21012       } while (tagger.count(model_name));
21013       if (taggers_total < 0 || taggers_total > 4) return error.assign("Cannot create more than four tagger models!"), false;
21014 
21015       cerr << "Using tagger from given model(s)." << endl;
21016       os.put(taggers_total);
21017       for (auto&& tagger_data : taggers_data)
21018         os.write(tagger_data.str + 1, tagger_data.len - 1);
21019     } else {
21020       // Create MorphoDiTa model(s)
21021       int models = 1; if (!option_int(tagger, "models", models, error)) return false;
21022       if (models <= 0) return error.assign("Number of tagger models cannot be negative or zero!"), false;
21023       if (models > 4) return error.assign("Cannot create more than four tagger models!"), false;
21024 
21025       os.put(models);
21026       for (int model = 0; model < models; model++)
21027         if (!train_tagger_model(training, heldout, model, models, tagger, os, error))
21028           return false;
21029     }
21030   }
21031 
21032   return true;
21033 }
21034 
// Trains the Parsito transition-based dependency parser and serializes it
// to `os`. The output starts with one flag byte (0 = no parser, 1 = parser
// follows, or existing serialized parser data when reused via from_model).
// `tagger_model` holds the already serialized tagger component; unless
// use_gold_tags is set, it is used to re-tag the training/heldout data so
// the parser is trained on the same annotation it will see at run time.
// Returns false and fills `error` on failure.
bool trainer_morphodita_parsito::train_parser(const vector<sentence>& training, const vector<sentence>& heldout,
                                              const string& options, const string& tagger_model, ostream& os, string& error) {
  if (options == NONE) {
    // No parser requested -- store a zero flag byte only.
    os.put(0);
  } else {
    // Create Parsito model
    named_values::map parser;
    if (!named_values::parse(options, parser, error)) return false;
    // Random-search run index; run <= 1 keeps the default hyperparameters,
    // higher runs sample them via the hyperparameter_* helpers below.
    int run = 0; if (!option_int(parser, "run", run, error)) return false;

    if (parser.count("from_model")) {
      // Use specified parser model
      string_piece parser_data;
      if (!load_model(parser["from_model"], PARSER_MODEL, parser_data))
        return error.assign("Cannot load model from which the parser should be used!"), false;

      // Copy the existing parser range verbatim (it already starts with
      // the parser flag byte).
      cerr << "Using parser from given model." << endl;
      os.write(parser_data.str, parser_data.len);
    } else {
      os.put(1);

      // Parsito options
      string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective";
      // Default oracle depends on the chosen transition system.
      string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] :
          transition_system == "projective" ? "dynamic" :
          transition_system == "swap" ? "static_lazy" :
          "static";

      // Embedding dimensions; a zero dimension disables that embedding.
      int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false;
      int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false;
      int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false;
      int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false;
      int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false;
      int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false;
      int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false;
      int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false;
      // Build the embeddings description, one line per embedding:
      // "<name> <dimension> <min-count>[ <precomputed-embeddings-file>]".
      string embeddings;
      if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n");
      if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n");
      if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n");
      if (embedding_form) {
        embeddings.append("form ").append(to_string(embedding_form)).append(" ").append(to_string(embedding_form_mincount));
        if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file"));
        embeddings.push_back('\n');
      }
      if (embedding_lemma) {
        embeddings.append("lemma ").append(to_string(embedding_lemma)).append(" ").append(to_string(embedding_lemma_mincount));
        if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file"));
        embeddings.push_back('\n');
      }
      if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n");

      // Network and training options.
      bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false;
      int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false;
      int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false;
      int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false;
      int structured_interval = run <= 1 ? 8 : hyperparameter_integer(run,1,0,2) == 2 ? 0 : 8 + 2*hyperparameter_integer(run,1,0,2);
      if (!option_int(parser, "structured_interval", structured_interval, error)) return false;
      double learning_rate = run <= 1 ? 0.02 : hyperparameter_logarithmic(run, 2, 0.005, 0.04);
      if (!option_double(parser, "learning_rate", learning_rate, error)) return false;
      double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false;
      double l2 = run <= 1 ? 0.5 : hyperparameter_uniform(run, 3, 0.2, 0.6);
      if (!option_double(parser, "l2", l2, error)) return false;
      // Early stopping defaults to on only when heldout data exist.
      bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false;

      if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval
                         << ", learning_rate=" << fixed << setprecision(8) << learning_rate
                         << ", l2=" << l2 << endl;

      // Prepare data in the correct format
      parsito::network_parameters parameters;
      parameters.iterations = iterations;
      parameters.structured_interval = structured_interval;
      parameters.hidden_layer = hidden_layer;
      parameters.hidden_layer_type = parsito::activation_function::TANH;
      parameters.trainer.algorithm = parsito::network_trainer::SGD;
      parameters.trainer.learning_rate = learning_rate;
      parameters.trainer.learning_rate_final = learning_rate_final;
      parameters.trainer.momentum = 0;
      parameters.trainer.epsilon = 0;
      parameters.batch_size = batch_size;
      parameters.initialization_range = 0.1f;
      parameters.l1_regularization = 0;
      parameters.l2_regularization = l2;
      parameters.maxnorm_regularization = 0;
      parameters.dropout_hidden = 0;
      parameters.dropout_input = 0;
      parameters.early_stopping = early_stopping;

      // Tag the input if required
      unique_ptr<model> tagger;
      bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false;
      if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) {
        // Wrap the serialized tagger into a minimal model description:
        // version byte, the two 0x7F sentinel bytes (checked by load_model),
        // a zero tokenizer byte, the tagger data, and a zero parser byte.
        stringstream tagger_description;
        tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0);
        tagger.reset(model_morphodita_parsito::load(tagger_description));
        if (!tagger) return error.assign("Cannot load the tagger model for parser training data generation!"), false;
      }

      // Training data
      // Convert every sentence (word index 0 is the artificial root) into a
      // Parsito tree; forms are normalized the same way the model does.
      sentence tagged;
      vector<parsito::tree> train_trees;
      for (auto&& sentence : training) {
        tagged = sentence;
        if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;

        train_trees.emplace_back();
        for (size_t i = 1; i < tagged.words.size(); i++) {
          train_trees.back().add_node(string());
          model_normalize_form(tagged.words[i].form, train_trees.back().nodes.back().form);
          train_trees.back().nodes.back().lemma.assign(tagged.words[i].lemma);
          train_trees.back().nodes.back().upostag.assign(tagged.words[i].upostag);
          train_trees.back().nodes.back().xpostag.assign(tagged.words[i].xpostag);
          train_trees.back().nodes.back().feats.assign(tagged.words[i].feats);
        }
        // Heads are set in a second pass, after all nodes exist.
        for (size_t i = 1; i < tagged.words.size(); i++)
          train_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel);
      }

      // Heldout data
      vector<parsito::tree> heldout_trees;
      for (auto&& sentence : heldout) {
        tagged = sentence;
        if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;

        heldout_trees.emplace_back();
        for (size_t i = 1; i < tagged.words.size(); i++) {
          heldout_trees.back().add_node(string());
          model_normalize_form(tagged.words[i].form, heldout_trees.back().nodes.back().form);
          heldout_trees.back().nodes.back().lemma.assign(tagged.words[i].lemma);
          heldout_trees.back().nodes.back().upostag.assign(tagged.words[i].upostag);
          heldout_trees.back().nodes.back().xpostag.assign(tagged.words[i].xpostag);
          heldout_trees.back().nodes.back().feats.assign(tagged.words[i].feats);
        }
        for (size_t i = 1; i < tagged.words.size(); i++)
          heldout_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel);
      }

      cerr << "Parser transition options: system=" << transition_system << ", oracle=" << transition_oracle
           << ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl
           << "Parser uses lemmas/upos/xpos/feats: " << (tagger ? "automatically generated by tagger" : "from gold data") << endl
           << "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag
           << ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl
           << "  form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl
           << "  lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl
           << "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl
           << "  learning_rate=" << fixed << setprecision(4) << learning_rate << ", learning_rate_final=" << learning_rate_final
           << ", l2=" << l2 << ", early_stopping=" << (early_stopping ? 1 : 0) << endl;

      // Train the parser
      // Serialize as the model name followed by the compressed trained network.
      binary_encoder enc;
      enc.add_str("nn_versioned");
      parsito::parser_nn_trainer::train(transition_system, transition_oracle, single_root, embeddings, parser_nodes,
                                        parameters, 1, train_trees, heldout_trees, enc);
      compressor::save(os, enc);
    }
  }

  return true;
}
21195 
// Locates the serialized byte range of one component (tokenizer, tagger or
// parser) inside a complete morphodita_parsito model stored in `data`.
// The model is walked section by section by actually deserializing each
// component, and the requested section's byte span is recorded in `range`.
// NOTE: `range` points directly into `data` (no copy), so the caller must
// keep `data` alive while using `range`.
// Returns false when `data` is not a valid model or the requested
// component is never reached.
bool trainer_morphodita_parsito::load_model(const string& data, model_type model, string_piece& range) {
  istringstream is(data);

  // Check that it is morphodita_parsito model.
  // The model starts with a length-prefixed name string.
  char len;
  if (!is.get(len)) return false;
  string name(len, ' ');
  if (!is.read(&name[0], len)) return false;
  if (name != "morphodita_parsito") return false;

  char version;
  if (!is.get(version)) return false;
  if (!(version >= 1 && version <= model_morphodita_parsito::VERSION_LATEST)) return false;

  // Because UDPipe 1.0 does not check the model version,
  // a specific sentinel was added since version 2 so that
  // loading of such model fail on UDPipe 1.0
  if (version >= 2) {
    char sentinel;
    if (!is.get(sentinel) || sentinel != 0x7F) return false;
    if (!is.get(sentinel) || sentinel != 0x7F) return false;
  }

  // Tokenizer
  // Each section below records its start offset, deserializes the component
  // to find where it ends, and returns the [start, current) span when that
  // section is the requested one.
  {
    if (model == TOKENIZER_MODEL) range.str = data.data() + is.tellg();
    char tokenizer; if (!is.get(tokenizer)) return false;
    unique_ptr<morphodita::tokenizer_factory> tokenizer_factory(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr);
    if (tokenizer && !tokenizer_factory) return false;
    unique_ptr<multiword_splitter> splitter(tokenizer ? multiword_splitter::load(is) : nullptr);
    if (model == TOKENIZER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true;
  }

  // Tagger
  {
    if (model == TAGGER_MODEL) range.str = data.data() + is.tellg();
    // First byte is the number of tagger models; each model is preceded by
    // three flag bytes (lemma, xpostag, feats).
    char taggers; if (!is.get(taggers)) return false;
    for (char i = 0; i < taggers; i++) {
      char lemma; if (!is.get(lemma)) return false;
      char xpostag; if (!is.get(xpostag)) return false;
      char feats; if (!is.get(feats)) return false;
      unique_ptr<morphodita::tagger> tagger(morphodita::tagger::load(is));
      if (!tagger) return false;
    }
    if (model == TAGGER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true;
  }

  // Parser
  {
    if (model == PARSER_MODEL) range.str = data.data() + is.tellg();
    char parser;
    if (!is.get(parser)) return false;
    unique_ptr<parsito::parser> parser_model(parser ? parsito::parser::load(is) : nullptr);
    if (parser && !parser_model) return false;
    if (model == PARSER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true;
  }

  // Requested component type did not match any section.
  return false;
}
21255 
model_normalize_form(string_piece form,string & output)21256 const string& trainer_morphodita_parsito::model_normalize_form(string_piece form, string& output) {
21257   return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_form(form, output);
21258 }
21259 
model_normalize_lemma(string_piece lemma,string & output)21260 const string& trainer_morphodita_parsito::model_normalize_lemma(string_piece lemma, string& output) {
21261   return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_lemma(lemma, output);
21262 }
21263 
model_fill_word_analysis(const morphodita::tagged_lemma & analysis,bool upostag,int lemma,bool xpostag,bool feats,word & word)21264 void trainer_morphodita_parsito::model_fill_word_analysis(const morphodita::tagged_lemma& analysis, bool upostag, int lemma, bool xpostag, bool feats, word& word) {
21265   model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).fill_word_analysis(analysis, upostag, lemma, xpostag, feats, word);
21266 }
21267 
21268 // Tagger model helper functions
21269 
train_tagger_model(const vector<sentence> & training,const vector<sentence> & heldout,unsigned model,unsigned models,const named_values::map & tagger,ostream & os,string & error)21270 bool trainer_morphodita_parsito::train_tagger_model(const vector<sentence>& training, const vector<sentence>& heldout,
21271                                                     unsigned model, unsigned models, const named_values::map& tagger,
21272                                                     ostream& os, string& error) {
21273   unique_ptr<input_format> conllu_input_format(input_format::new_conllu_input_format());
21274 
21275   int run = 0; if (!option_int(tagger, "run", run, error, model)) return false;
21276 
21277   bool have_lemma = false;
21278   for (auto&& sentence : training)
21279     for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++)
21280       if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_")
21281         have_lemma = true;
21282   bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false;
21283   int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false;
21284   int use_lemma = have_lemma && use_lemma_flag ? lemma_encoding : 0;
21285   bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false;
21286   bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false;
21287 
21288   bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false;
21289   bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false;
21290   bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false;
21291   os.put(char(provide_lemma ? use_lemma : 0));
21292   os.put(char(provide_xpostag && use_xpostag));
21293   os.put(char(provide_feats && use_feats));
21294 
21295   cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0)
21296        << ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0)
21297        << ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl;
21298 
21299   // Start by creating the morphological dictionary
21300   stringstream morpho_description;
21301   string normalized_form, combined_tag, combined_lemma;
21302 
21303   // Generic options
21304   const string& dictionary_model = option_str(tagger, "dictionary_model", model);
21305   if (!dictionary_model.empty()) {
21306     // Use specified morphological dictionary
21307     cerr << "Using given morphological dictionary for tagger model " << model+1 << "." << endl;
21308     morpho_description << dictionary_model;
21309   } else {
21310     // Create the morphological dictionary and guesser from data
21311     cerr << "Creating morphological dictionary for tagger model " << model+1 << "." << endl;
21312 
21313     // Dictionary options
21314     int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false;
21315     unordered_set<string> flat_lemmas;
21316     if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) {
21317       vector<string> lemmas;
21318       split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas);
21319       for (auto&& lemma : lemmas) {
21320         if (lemma.find('~') != string::npos)
21321           return error.assign("Dictionary_flat_lemmas cannot contain '~' character!"), false;
21322         flat_lemmas.insert(lemma);
21323       }
21324     } else {
21325       flat_lemmas.insert("greek.expression");
21326     }
21327 
21328     if (!option_str(tagger, "dictionary", model).empty())
21329       return error.assign("The tagger 'dictionary' option is no longer supported, use 'dictionary_file' instead!"), false;
21330     const string& dictionary_file = option_str(tagger, "dictionary_file", model);
21331     int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false;
21332 
21333     cerr << "Tagger model " << model+1 << " dictionary options: " << "max_form_analyses=" << max_form_analyses
21334          << ", custom dictionary_file=" << (dictionary_file.empty() ? "none" : dictionary_file) << endl;
21335 
21336     // Guesser options
21337     int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false;
21338     int guesser_suffix_rules = run <= 1 ? 8 : 5 + hyperparameter_integer(run, 1, 0, 7);
21339     if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false;
21340     int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false;
21341     int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false;
21342     int guesser_enrich_dictionary = run <= 1 ? 6 : 3 + hyperparameter_integer(run, 2, 0, 7);
21343     if (!dictionary_file.empty()) guesser_enrich_dictionary = 0;
21344     if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false;
21345 
21346     if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules
21347                        << ", guesser_enrich_dictionary=" << guesser_enrich_dictionary << endl;
21348 
21349     cerr << "Tagger model " << model+1 << " guesser options: " << "suffix_rules=" << guesser_suffix_rules
21350          << ", prefixes_max=" << guesser_prefixes_max << ", prefix_min_count=" << guesser_prefix_min_count
21351          << ", enrich_dictionary=" << guesser_enrich_dictionary << endl;
21352 
21353     // Start by generating statistical guesser
21354     stringstream guesser_description;
21355     {
21356       stringstream guesser_input;
21357       for (auto&& sentence : training) {
21358         for (size_t i = 1; i < sentence.words.size(); i++)
21359           guesser_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t'
21360               << combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas) << '\t'
21361               << combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n';
21362         guesser_input << '\n';
21363       }
21364       morphodita::morpho_statistical_guesser_trainer::train(guesser_input, guesser_suffix_len, guesser_suffix_rules, guesser_prefixes_max, guesser_prefix_min_count, guesser_description);
21365     }
21366 
21367     // Generate morphological dictionary data from the input
21368     unordered_set<string> dictionary_entries;
21369     {
21370       unordered_map<string, unordered_map<string, int>> entries;
21371       string entry;
21372       for (auto&& sentence : training)
21373         for (size_t i = 1; i < sentence.words.size(); i++) {
21374           model_normalize_form(sentence.words[i].form, normalized_form);
21375           entry.assign(combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas))
21376               .append("\t").append(combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag))
21377               .append("\t").append(normalized_form);
21378           entries[normalized_form][entry]++;
21379         }
21380 
21381       vector<pair<int, string>> analyses;
21382       for (auto&& form_analyses : entries) {
21383         analyses.clear();
21384         for (auto&& analysis : form_analyses.second)
21385           analyses.emplace_back(analysis.second, analysis.first);
21386         if (max_form_analyses && int(analyses.size()) > max_form_analyses) {
21387           sort(analyses.begin(), analyses.end(), greater<pair<int, string>>());
21388           analyses.resize(max_form_analyses);
21389         }
21390         for (auto&& analysis : analyses)
21391           dictionary_entries.insert(analysis.second);
21392       }
21393     }
21394     morphodita::generic_morpho_encoder::tags dictionary_special_tags;
21395     dictionary_special_tags.unknown_tag = "~X";
21396     dictionary_special_tags.number_tag = most_frequent_tag(training, "NUM", use_xpostag, use_feats, combined_tag);
21397     dictionary_special_tags.punctuation_tag = most_frequent_tag(training, "PUNCT", use_xpostag, use_feats, combined_tag);
21398     dictionary_special_tags.symbol_tag = most_frequent_tag(training, "SYM", use_xpostag, use_feats, combined_tag);
21399 
21400     // Append given dictionary_file if given
21401     if (!dictionary_file.empty()) {
21402       ifstream is(dictionary_file);
21403       if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false;
21404 
21405       vector<string_piece> dictionary_parts;
21406       word entry;
21407       string entry_encoded, line;
21408       while (getline(is, line)) {
21409         // Skip empty lines
21410         if (line.empty()) continue;
21411 
21412         split(line, '\t', dictionary_parts);
21413 
21414         if (dictionary_parts.size() != 5)
21415           return error.assign("Dictionary line '").append(line).append("' does not contain 5 tab-separated columns!"), false;
21416 
21417         model_normalize_form(dictionary_parts[0], entry.form);
21418         entry.lemma.assign(dictionary_parts[1].str, dictionary_parts[1].len == 1 && dictionary_parts[1].str[0] == '_' ? 0 : dictionary_parts[1].len);
21419         entry.upostag.assign(dictionary_parts[2].str, dictionary_parts[2].len == 1 && dictionary_parts[2].str[0] == '_' ? 0 : dictionary_parts[2].len);
21420         entry.xpostag.assign(dictionary_parts[3].str, dictionary_parts[3].len == 1 && dictionary_parts[3].str[0] == '_' ? 0 : dictionary_parts[3].len);
21421         entry.feats.assign(dictionary_parts[4].str, dictionary_parts[4].len == 1 && dictionary_parts[4].str[0] == '_' ? 0 : dictionary_parts[4].len);
21422 
21423         entry_encoded.assign(combine_lemma(entry, use_lemma, combined_lemma, flat_lemmas))
21424             .append("\t").append(combine_tag(entry, use_xpostag, use_feats, combined_tag))
21425             .append("\t").append(entry.form);
21426         dictionary_entries.insert(entry_encoded);
21427       }
21428     }
21429 
21430     // Enrich the dictionary if required
21431     if (guesser_enrich_dictionary) {
21432       // Create temporary morphology using only the guesser
21433       stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology;
21434       guesser_only_morphology.put(morphodita::morpho_ids::GENERIC);
21435       morphodita::generic_morpho_encoder::encode(empty_data, dictionary_suffix_len, dictionary_special_tags, guesser_description_copy, guesser_only_morphology);
21436 
21437       unique_ptr<morphodita::morpho> guesser_only_morpho(morphodita::morpho::load(guesser_only_morphology));
21438       if (!guesser_only_morpho) return error.assign("Cannot create temporary guesser-only morphology!"), false;
21439 
21440       string entry;
21441       unordered_set<string> analyzed_forms;
21442       vector<morphodita::tagged_lemma> analyses;
21443       for (auto&& sentence : training)
21444         for (size_t i = 1; i < sentence.words.size(); i++) {
21445           const auto& form = model_normalize_form(sentence.words[i].form, normalized_form);
21446           if (!analyzed_forms.count(form)) {
21447             guesser_only_morpho->analyze(form, morphodita::morpho::GUESSER, analyses);
21448 
21449             int to_add = guesser_enrich_dictionary;
21450             for (auto&& analyse : analyses) {
21451               entry.assign(analyse.lemma).push_back('\t');
21452               entry.append(analyse.tag).push_back('\t');
21453               entry.append(form);
21454               if (dictionary_entries.insert(entry).second)
21455                 if (!--to_add)
21456                   break;
21457             }
21458             analyzed_forms.insert(form);
21459           }
21460         }
21461     }
21462 
21463     // Create the dictionary
21464     vector<string> sorted_dictionary(dictionary_entries.begin(), dictionary_entries.end());
21465     sort(sorted_dictionary.begin(), sorted_dictionary.end());
21466 
21467     stringstream morpho_input;
21468     for (auto&& entry : sorted_dictionary)
21469       morpho_input << entry << '\n';
21470 
21471     morpho_description.put(morphodita::morpho_ids::GENERIC);
21472     morphodita::generic_morpho_encoder::encode(morpho_input, dictionary_suffix_len, dictionary_special_tags, guesser_description, morpho_description);
21473   }
21474 
21475   // Measure dictionary accuracy if required
21476   const string& dictionary_accuracy = option_str(tagger, "dictionary_accuracy", model);
21477   if (!dictionary_accuracy.empty()) {
21478     unique_ptr<morphodita::morpho> morpho(morphodita::morpho::load(morpho_description));
21479     if (!morpho) return error.assign("Cannot create temporary morphology for evaluating accuracy!"), false;
21480     morpho_description.seekg(0, ios::beg);
21481 
21482     // Measure dictionary accuracy on given data
21483     unsigned words = 0, total_analyses = 0, upostag = 0, xpostag = 0, feats = 0, all_tags = 0, lemma = 0;
21484 
21485     word w;
21486     vector<morphodita::tagged_lemma> analyses;
21487     conllu_input_format->set_text(dictionary_accuracy.c_str());
21488     for (sentence sentence; conllu_input_format->next_sentence(sentence, error); )
21489       for (size_t i = 1; i < sentence.words.size(); i++) {
21490         morpho->analyze(model_normalize_form(sentence.words[i].form, normalized_form), morphodita::morpho::GUESSER, analyses);
21491         unsigned upostag_ok = 0, xpostag_ok = 0, feats_ok = 0, all_tags_ok = 0, lemma_ok = 0;
21492         for (auto&& analysis : analyses) {
21493           w.lemma.assign("_");
21494           model_fill_word_analysis(analysis, true, use_lemma, true, true, w);
21495           upostag_ok |= int(sentence.words[i].upostag == w.upostag);
21496           xpostag_ok |= int(sentence.words[i].xpostag == w.xpostag);
21497           feats_ok |= int(sentence.words[i].feats == w.feats);
21498           all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats);
21499           lemma_ok |= int(sentence.words[i].lemma == w.lemma);
21500         }
21501         words++;
21502         total_analyses += analyses.size();
21503         upostag += upostag_ok;
21504         xpostag += xpostag_ok;
21505         feats += feats_ok;
21506         all_tags += all_tags_ok;
21507         lemma += lemma_ok;
21508       }
21509     if (!error.empty()) return false;
21510 
21511     cerr << "Dictionary accuracy for tagging model " << model+1 << " - forms: " << words
21512          << ", analyses per form: " << fixed << setprecision(2) << total_analyses / double(words)
21513          << ", upostag: " << setprecision(1) << 100. * upostag / words << "%, xpostag: " << 100. * xpostag / words
21514          << "%, feats: " << 100. * feats / words << "%, all tags: " << 100. * all_tags / words << "%, lemma: " << 100. * lemma / words << '%' << endl;
21515   }
21516 
21517   // Tagger options
21518   double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false;
21519   morphodita::tagger_id tagger_id;
21520   if (tagger_order == 2) tagger_id = morphodita::tagger_ids::CONLLU2;
21521   else if (tagger_order == 2.5) tagger_id = morphodita::tagger_ids::CONLLU2_3;
21522   else if (tagger_order == 3) tagger_id = morphodita::tagger_ids::CONLLU3;
21523   else return error.assign("The tagger_order can be only 2, 2.5 or 3!"), false;
21524 
21525   int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false;
21526   bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false;
21527   bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false;
21528   const string& tagger_feature_templates =
21529       option_str(tagger, "templates", model) == "tagger" ? tagger_features_tagger :
21530       option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer :
21531       !option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) :
21532       model == 1 ? tagger_features_lemmatizer : tagger_features_tagger;
21533   if (heldout.empty()) tagger_early_stopping = false;
21534 
21535   cerr << "Tagger model " << model+1 << " options: iterations=" << tagger_iterations
21536        << ", early_stopping=" << (tagger_early_stopping ? 1 : 0) << ", templates="
21537        << (tagger_feature_templates == tagger_features_tagger ? "tagger" :
21538            tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl;
21539 
21540   // Train the tagger
21541   cerr << "Training tagger model " << model+1 << "." << endl;
21542   stringstream input, heldout_input, feature_templates_input(tagger_feature_templates);
21543   for (auto&& sentence : training) {
21544     for (size_t i = 1; i < sentence.words.size(); i++)
21545       input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t'
21546           << combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t'
21547           << combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n';
21548     input << '\n';
21549   }
21550 
21551   for (auto&& sentence : heldout) {
21552     for (size_t i = 1; i < sentence.words.size(); i++)
21553       heldout_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t'
21554           << combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t'
21555           << combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n';
21556     heldout_input << '\n';
21557   }
21558 
21559   os.put(tagger_id);
21560   morphodita::tagger_trainer<morphodita::perceptron_tagger_trainer<morphodita::train_feature_sequences<morphodita::conllu_elementary_features>>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os);
21561 
21562   return true;
21563 }
21564 
can_combine_tag(const word & w,string & error)21565 bool trainer_morphodita_parsito::can_combine_tag(const word& w, string& error) {
21566   error.clear();
21567 
21568   unsigned separator = 0;
21569   while (separator < tag_separators.size() &&
21570          (w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos))
21571     separator++;
21572 
21573   if (separator >= tag_separators.size()) {
21574     error.assign("Cannot find tag separating character, UPOSTAG and XPOSTAG contain all of '").append(tag_separators).append("'!");
21575     return false;
21576   }
21577   return true;
21578 }
21579 
combine_tag(const word & w,bool xpostag,bool feats,string & combined_tag)21580 const string& trainer_morphodita_parsito::combine_tag(const word& w, bool xpostag, bool feats, string& combined_tag) {
21581   unsigned separator = 0;
21582   while (separator < tag_separators.size() &&
21583          (w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos))
21584     separator++;
21585   if (separator >= tag_separators.size())
21586     // Should not happen, as can_combine_tag was called before
21587     separator = 0;
21588 
21589   combined_tag.assign(1, tag_separators[separator]);
21590   combined_tag.append(w.upostag);
21591   if (xpostag || feats) {
21592     combined_tag.push_back(tag_separators[separator]);
21593     if (xpostag) combined_tag.append(w.xpostag);
21594     if (feats) combined_tag.push_back(tag_separators[separator]);
21595     if (feats) combined_tag.append(w.feats);
21596   }
21597 
21598   return combined_tag;
21599 }
21600 
most_frequent_tag(const vector<sentence> & data,const string & upostag,bool xpostag,bool feats,string & combined_tag)21601 const string& trainer_morphodita_parsito::most_frequent_tag(const vector<sentence>& data, const string& upostag, bool xpostag, bool feats, string& combined_tag) {
21602   unordered_map<string, unsigned> counts;
21603 
21604   for (auto&& sentence : data)
21605     for (size_t i = 1; i < sentence.words.size(); i++)
21606       if (sentence.words[i].upostag == upostag)
21607         counts[combine_tag(sentence.words[i], xpostag, feats, combined_tag)]++;
21608 
21609   combined_tag.assign("~").append(upostag);
21610   unsigned best = 0;
21611   for (auto&& tags : counts)
21612     if (tags.second > best) {
21613       best = tags.second;
21614       combined_tag.assign(tags.first);
21615     }
21616   return combined_tag;
21617 }
21618 
combine_lemma(const word & w,int use_lemma,string & combined_lemma,const unordered_set<string> & flat_lemmas)21619 const string& trainer_morphodita_parsito::combine_lemma(const word& w, int use_lemma, string& combined_lemma, const unordered_set<string>& flat_lemmas) {
21620   switch (use_lemma) {
21621     case 0:
21622       return model_normalize_form(w.form, combined_lemma);
21623     case 1:
21624       model_normalize_lemma(w.lemma, combined_lemma);
21625       if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma))
21626         return model_normalize_form(w.form, combined_lemma);
21627       return combined_lemma;
21628     default: /*2*/
21629       if (w.lemma == "")
21630         return model_normalize_form(w.form, combined_lemma), combined_lemma.insert(0, "~~");
21631       else if (w.lemma == "_")
21632         return model_normalize_form(w.form, combined_lemma), combined_lemma.insert(0, "~_~");
21633 
21634       model_normalize_lemma(w.lemma, combined_lemma);
21635       if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma)) {
21636         string normalized_form;
21637         model_normalize_form(w.form, normalized_form);
21638         return combined_lemma.insert(0, "~").append("~").append(normalized_form);
21639       }
21640       return combined_lemma;
21641   }
21642 }
21643 
21644 // Generic options handling
21645 
option_str(const named_values::map & options,const string & name,int model)21646 const string& trainer_morphodita_parsito::option_str(const named_values::map& options, const string& name, int model) {
21647   string indexed_name(name);
21648   if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
21649 
21650   return options.count(indexed_name) ? options.at(indexed_name) : options.count(name) ? options.at(name) : empty_string;
21651 }
21652 
option_int(const named_values::map & options,const string & name,int & value,string & error,int model)21653 bool trainer_morphodita_parsito::option_int(const named_values::map& options, const string& name, int& value, string& error, int model) {
21654   string indexed_name(name);
21655   if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
21656 
21657   if (options.count(indexed_name))
21658     return parse_int(options.at(indexed_name), name.c_str(), value, error);
21659   if (options.count(name))
21660     return parse_int(options.at(name), name.c_str(), value, error);
21661   return true;
21662 }
21663 
option_bool(const named_values::map & options,const string & name,bool & value,string & error,int model)21664 bool trainer_morphodita_parsito::option_bool(const named_values::map& options, const string& name, bool& value, string& error, int model) {
21665   string indexed_name(name);
21666   if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
21667 
21668   if (options.count(indexed_name) || options.count(name)) {
21669     int int_value;
21670     if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error))
21671       return false;
21672     value = int_value != 0;
21673   }
21674   return true;
21675 }
21676 
option_double(const named_values::map & options,const string & name,double & value,string & error,int model)21677 bool trainer_morphodita_parsito::option_double(const named_values::map& options, const string& name, double& value, string& error, int model) {
21678   string indexed_name(name);
21679   if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
21680 
21681   if (options.count(indexed_name))
21682     return parse_double(options.at(indexed_name), name.c_str(), value, error);
21683   if (options.count(name))
21684     return parse_double(options.at(name), name.c_str(), value, error);
21685   return true;
21686 }
21687 
21688 // Various string data
21689 
// Shared empty string returned by option_str when an option is not present.
const string trainer_morphodita_parsito::empty_string;
21691 
// Candidate separator characters tried in order by can_combine_tag and
// combine_tag when joining UPOSTAG/XPOSTAG/FEATS into one combined tag.
const string trainer_morphodita_parsito::tag_separators = "~!@#$%^&*()/";
21693 
// Feature templates fed to the MorphoDiTa tagger trainer when
// templates=tagger is selected (and by default for every model except the
// second one, which defaults to the lemmatizer templates). One feature
// sequence per line; numbers are word offsets relative to the current word.
const string trainer_morphodita_parsito::tagger_features_tagger =
  "Tag 0\n"
  "Tag 0,Tag -1\n"
  "Tag 0,TagUPos -1\n"
  "Tag 0,Tag -1,Tag -2\n"
  "Tag 0,TagUPos -1,TagUPos -2\n"
  "Tag 0,Tag -2\n"
  "Tag 0,Form 0\n"
  "Tag 0,Form 0,Form -1\n"
  "Tag 0,Form -1\n"
  "Tag 0,Form -2\n"
  "Tag 0,Form -1,Form -2\n"
  "Tag 0,Form 1\n"
  "Tag 0,Form 1,Form 2\n"
  "Tag 0,PreviousVerbTag 0\n"
  "Tag 0,PreviousVerbForm 0\n"
  "Tag 0,FollowingVerbTag 0\n"
  "Tag 0,FollowingVerbForm 0\n"
  "Tag 0,Lemma -1\n"
  // NOTE(review): "Tag 0,Form 1" already appears above -- presumably a
  // harmless duplicate template; confirm against the MorphoDiTa trainer.
  "Tag 0,Form 1\n"
  "Lemma 0,Tag -1\n"
  "Tag 0,Prefix1 0\n"
  "Tag 0,Prefix2 0\n"
  "Tag 0,Prefix3 0\n"
  "Tag 0,Prefix4 0\n"
  "Tag 0,Prefix5 0\n"
  "Tag 0,Prefix6 0\n"
  "Tag 0,Prefix7 0\n"
  "Tag 0,Prefix8 0\n"
  "Tag 0,Prefix9 0\n"
  "Tag 0,Suffix1 0\n"
  "Tag 0,Suffix2 0\n"
  "Tag 0,Suffix3 0\n"
  "Tag 0,Suffix4 0\n"
  "Tag 0,Suffix5 0\n"
  "Tag 0,Suffix6 0\n"
  "Tag 0,Suffix7 0\n"
  "Tag 0,Suffix8 0\n"
  "Tag 0,Suffix9 0\n"
  "TagUPos 0\n"
  "TagUPos 0,TagUPos -1\n"
  "TagUPos 0,TagUPos -1,TagUPos -2\n"
  "TagCase 0,TagCase -1\n"
  "TagCase 0,TagCase -1,TagCase -2\n"
  "TagGender 0,TagGender -1\n"
  "TagGender 0,TagGender -1,TagGender -2\n"
  "TagUPos 0,Prefix1 0\n"
  "TagUPos 0,Prefix2 0\n"
  "TagUPos 0,Prefix3 0\n"
  "TagUPos 0,Prefix4 0\n"
  "TagUPos 0,Prefix5 0\n"
  "TagUPos 0,Prefix6 0\n"
  "TagUPos 0,Prefix7 0\n"
  "TagUPos 0,Prefix8 0\n"
  "TagUPos 0,Prefix9 0\n"
  "TagUPos 0,Suffix1 0\n"
  "TagUPos 0,Suffix2 0\n"
  "TagUPos 0,Suffix3 0\n"
  "TagUPos 0,Suffix4 0\n"
  "TagUPos 0,Suffix5 0\n"
  "TagUPos 0,Suffix6 0\n"
  "TagUPos 0,Suffix7 0\n"
  "TagUPos 0,Suffix8 0\n"
  "TagUPos 0,Suffix9 0\n"
  "Tag 0,Num 0\n"
  "Tag 0,Cap 0\n"
  "Tag 0,Dash 0\n"
  "TagNegative 0,Prefix1 0\n"
  "TagNegative 0,Prefix2 0\n"
  "TagNegative 0,Prefix3 0\n"
  "TagCase 0,Suffix1 0\n"
  "TagCase 0,Suffix2 0\n"
  "TagCase 0,Suffix3 0\n"
  "TagCase 0,Suffix4 0\n"
  "TagCase 0,Suffix5 0\n";
21769 
// Feature templates fed to the MorphoDiTa tagger trainer when
// templates=lemmatizer is selected (and by default for the second model,
// i.e. model index 1). Compared to the tagger templates, these add
// Lemma-based sequences. One feature sequence per line; numbers are word
// offsets relative to the current word.
const string trainer_morphodita_parsito::tagger_features_lemmatizer =
  "Tag 0\n"
  "Tag 0,Tag -1\n"
  "Tag 0,Tag -1,Tag -2\n"
  "Tag 0,Tag -2\n"
  "Tag 0,Form 0\n"
  "Tag 0,Form 0,Form -1\n"
  "Tag 0,Form -1\n"
  "Tag 0,Form -2\n"
  "Tag 0,PreviousVerbTag 0\n"
  "Tag 0,PreviousVerbForm 0\n"
  "Tag 0,FollowingVerbTag 0\n"
  "Tag 0,FollowingVerbForm 0\n"
  "Tag 0,Lemma -1\n"
  "Tag 0,Form 1\n"
  "Lemma 0\n"
  "Lemma 0,Tag -1\n"
  "Lemma 0,Tag -1,Tag -2\n"
  "Lemma 0,Tag -2\n"
  "Lemma 0,Form -1\n"
  "Lemma 0,Form -1,Form -2\n"
  "Lemma 0,Form -2\n"
  "Lemma 0,PreviousVerbTag 0\n"
  "Lemma 0,PreviousVerbForm 0\n"
  "Lemma 0,FollowingVerbTag 0\n"
  "Lemma 0,FollowingVerbForm 0\n"
  "Lemma 0,Form 1\n"
  "Tag 0,Prefix1 0\n"
  "Tag 0,Prefix2 0\n"
  "Tag 0,Prefix3 0\n"
  "Tag 0,Prefix4 0\n"
  "Tag 0,Prefix5 0\n"
  "Tag 0,Suffix1 0\n"
  "Tag 0,Suffix2 0\n"
  "Tag 0,Suffix3 0\n"
  "Tag 0,Suffix4 0\n"
  "Tag 0,Suffix5 0\n"
  "Tag 0,Num 0\n"
  "Tag 0,Cap 0\n"
  "Tag 0,Dash 0\n";
21810 
// Node selectors for parser (Parsito) features: positions on the stack and
// in the buffer, optionally followed by child selectors (non-negative =
// n-th leftmost child, negative = n-th rightmost child). One node per line.
const string trainer_morphodita_parsito::parser_nodes =
  "stack 0\n"
  "stack 1\n"
  "stack 2\n"
  "buffer 0\n"
  "buffer 1\n"
  "buffer 2\n"
  "stack 0,child 0\n"
  "stack 0,child 1\n"
  "stack 0,child -2\n"
  "stack 0,child -1\n"
  "stack 1,child 0\n"
  "stack 1,child 1\n"
  "stack 1,child -2\n"
  "stack 1,child -1\n"
  "stack 0,child 0,child 0\n"
  "stack 0,child -1,child -1\n"
  "stack 1,child 0,child 0\n"
  "stack 1,child -1,child -1\n";
21830 
21831 /////////
21832 // File: trainer/training_failure.cpp
21833 /////////
21834 
21835 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
21836 //
21837 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
21838 // Mathematics and Physics, Charles University in Prague, Czech Republic.
21839 //
21840 // This Source Code Form is subject to the terms of the Mozilla Public
21841 // License, v. 2.0. If a copy of the MPL was not distributed with this
21842 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
21843 
// Constructs the exception from the text accumulated in the static
// message_collector stream, then resets the collector so it is empty for
// the next training error.
training_error::training_error() : runtime_error(message_collector.str()) {
  message_collector.str(string());
}
21847 
// Static stream into which the error message is collected before a
// training_error is constructed (which consumes and clears it).
ostringstream training_error::message_collector;
21849 
21850 /////////
21851 // File: unilib/unicode.cpp
21852 /////////
21853 
21854 // This file is part of UniLib <http://github.com/ufal/unilib/>.
21855 //
21856 // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
21857 // Mathematics and Physics, Charles University in Prague, Czech Republic.
21858 //
21859 // This Source Code Form is subject to the terms of the Mozilla Public
21860 // License, v. 2.0. If a copy of the MPL was not distributed with this
21861 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
21862 //
21863 // UniLib version: 3.1.1
21864 // Unicode version: 8.0.0
21865 
21866 namespace unilib {
21867 
// Out-of-class definition of the in-class constant (required pre-C++17 when
// the constant is ODR-used).
const char32_t unicode::CHARS;
21869 
// Out-of-class definition of the in-class constant (required pre-C++17 when
// the constant is ODR-used).
const int32_t unicode::DEFAULT_CAT;
21871 
21872 const uint8_t unicode::category_index[unicode::CHARS >> 8] = {
21873   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,17,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,33,41,42,43,44,45,46,47,48,39,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,49,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,50,51,17,17,17,52,17,53,54,55,56,57,58,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,59,60,60,60,60,60,60,60,60,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,17,62,63,17,64,65,66,67,68,69,70,71,72,17,73,74,75,76,77,78,79,80,79,81,82,83,84,85,86,87,88,89,79,90,79,79,79,79,79,17,17,17,91,92,93,79,79,79,79,79,79,79,79,79,79,17,17,17,17,94,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,17,17,95,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,17,17,96,97,79,79,79,98,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,99,79,79,79,79,79,79,79,79,79,79,79,100,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,101,102,103,104,105,106,107,108,39,39,109,79,79,79,79,79,79,79,79,79,79,79,79,79,110,79,79,79,79,79,111,79,112,113,114,115,39,116,117,118,119,120,79,79,79,79,79,79,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,1
7,17,
21874     17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,121,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,122,123,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,124,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,17,17,125,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79
,79,79,79,
21875     79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,7
9,79,79,
21876     79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,7
9,79,79,
21877     79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,7
9,79,79,
21878     79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,126,127,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61
,61,61,61,
21879     61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,128,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,128
21880 };
21881 
21882 const uint8_t unicode::category_block[][256] = {
21883   {_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Zs,_Po,_Po,_Po,_Sc,_Po,_Po,_Po,_Ps,_Pe,_Po,_Sm,_Po,_Pd,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Sm,_Sm,_Sm,_Po,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ps,_Po,_Pe,_Sk,_Pc,_Sk,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ps,_Sm,_Pe,_Sm,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Zs,_Po,_Sc,_Sc,_Sc,_Sc,_So,_Po,_Sk,_So,_Lo,_Pi,_Sm,_Cf,_So,_Sk,_So,_Sm,_No,_No,_Sk,_Ll,_Po,_Po,_Sk,_No,_Lo,_Pf,_No,_No,_No,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll},
21884   {_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Ll,_Lo,_Lu,_Ll,_Ll,_Ll,_Lo,_Lo,_Lo,_Lo,_Lu,_Lt,_Ll,_Lu,_Lt,_Ll,_Lu,_Lt,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Lt,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll},
21885   {_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Sk,_Sk,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Lm,_Sk,_Lm,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk},
21886   {_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lu,_Ll,_Lu,_Ll,_Lm,_Sk,_Lu,_Ll,_Cn,_Cn,_Lm,_Ll,_Ll,_Ll,_Po,_Lu,_Cn,_Cn,_Cn,_Cn,_Sk,_Sk,_Lu,_Po,_Lu,_Lu,_Lu,_Cn,_Lu,_Cn,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Sm,_Lu,_Ll,_Lu,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu},
21887   {_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Me,_Me,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll},
21888   {_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Lm,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Po,_Pd,_Cn,_Cn,_So,_So,_Sc,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Pd,_Mn,_Po,_Mn,_Mn,_Po,_Mn,_Mn,_Po,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21889   {_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Sm,_Sm,_Sm,_Po,_Po,_Sc,_Po,_Po,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Cf,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cf,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_Mn,_Mn,_So,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_So,_So,_Lo},
21890   {_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cf,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_So,_Po,_Po,_Po,_Lm,_Cn,_Cn,_Cn,_Cn,_Cn},
21891   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Mn,_Mn,_Mn,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Cn,_Cn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn},
21892   {_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mc,_Mc,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Sc,_Sc,_No,_No,_No,_No,_No,_No,_So,_Sc,_Cn,_Cn,_Cn,_Cn},
21893   {_Cn,_Mn,_Mn,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Mn,_Cn,_Mc,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Cn,_Cn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Mn,_Mn,_Lo,_Lo,_Lo,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mc,_Cn,_Mc,_Mc,_Mn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Sc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21894   {_Cn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mc,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_So,_Lo,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mc,_Cn,_Cn,_Cn,_Mc,_Mc,_Mc,_Cn,_Mc,_Mc,_Mc,_Mn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_So,_So,_So,_So,_So,_So,_Sc,_So,_Cn,_Cn,_Cn,_Cn,_Cn},
21895   {_Mn,_Mc,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Cn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_So,_Cn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Mn,_Mc,_Mc,_Cn,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21896   {_Cn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Mc,_Mc,_Mc,_Cn,_Mc,_Mc,_Mc,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Mn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Cn,_Mn,_Cn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Mc,_Mc,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21897   {_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Sc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lm,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21898   {_Lo,_So,_So,_So,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_So,_Po,_So,_So,_So,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_Mn,_So,_Mn,_So,_Mn,_Ps,_Pe,_Ps,_Pe,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_Po,_Po,_Po,_Po,_Po,_So,_So,_So,_So,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21899   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Lo,_Mc,_Mc,_Mc,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Lo,_Mc,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Mc,_Mc,_Mc,_Mn,_So,_So,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Lm,_Lo,_Lo,_Lo},
21900   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo},
21901   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo},
21902   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn},
21903   {_Pd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo},
21904   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Zs,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Ps,_Pe,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Nl,_Nl,_Nl,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21905   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Lm,_Po,_Po,_Po,_Sc,_Lo,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21906   {_Po,_Po,_Po,_Po,_Po,_Po,_Pd,_Po,_Po,_Po,_Po,_Mn,_Mn,_Mn,_Cf,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21907   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So},
21908   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Mc,_Mn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mc,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Lm,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Me,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21909   {_Mn,_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Mn,_Mc,_Mn,_Mn,_Mn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po},
21910   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Po,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Lo,_Lo,_Cn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21911   {_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn},
21912   {_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll},
21913   {_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Lu,_Cn,_Lu,_Cn,_Lu,_Cn,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lt,_Sk,_Ll,_Sk,_Sk,_Sk,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lt,_Sk,_Sk,_Sk,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Cn,_Sk,_Sk,_Sk,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Sk,_Sk,_Sk,_Cn,_Cn,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lt,_Sk,_Sk,_Cn},
21914   {_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Cf,_Cf,_Cf,_Cf,_Cf,_Pd,_Pd,_Pd,_Pd,_Pd,_Pd,_Po,_Po,_Pi,_Pf,_Ps,_Pi,_Pi,_Pf,_Ps,_Pi,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Zl,_Zp,_Cf,_Cf,_Cf,_Cf,_Cf,_Zs,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Pi,_Pf,_Po,_Po,_Po,_Po,_Pc,_Pc,_Po,_Po,_Po,_Sm,_Ps,_Pe,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Sm,_Po,_Pc,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Zs,_Cf,_Cf,_Cf,_Cf,_Cf,_Cn,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_No,_Lm,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_Sm,_Sm,_Sm,_Ps,_Pe,_Lm,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Sm,_Sm,_Sm,_Ps,_Pe,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Me,_Me,_Me,_Me,_Mn,_Me,_Me,_Me,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21915   {_So,_So,_Lu,_So,_So,_So,_So,_Lu,_So,_So,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu,_Ll,_So,_Lu,_So,_So,_Sm,_Lu,_Lu,_Lu,_Lu,_Lu,_So,_So,_So,_So,_So,_So,_Lu,_So,_Lu,_So,_Lu,_So,_Lu,_Lu,_Lu,_Lu,_So,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lo,_Lo,_Lo,_Lo,_Ll,_So,_So,_Ll,_Ll,_Lu,_Lu,_Sm,_Sm,_Sm,_Sm,_Sm,_Lu,_Ll,_Ll,_Ll,_Ll,_So,_Sm,_So,_So,_Ll,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Lu,_Ll,_Nl,_Nl,_Nl,_Nl,_No,_So,_So,_Cn,_Cn,_Cn,_Cn,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_Sm,_Sm,_So,_So,_So,_So,_Sm,_So,_So,_Sm,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_So,_So,_Sm,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm},
21916   {_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm},
21917   {_So,_So,_So,_So,_So,_So,_So,_So,_Ps,_Pe,_Ps,_Pe,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_Ps,_Pe,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn},
21918   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No},
21919   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm},
21920   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So},
21921   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm},
21922   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So},
21923   {_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Sm,_Sm},
21924   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21925   {_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_So,_So,_So,_So,_So,_So,_Lu,_Ll,_Lu,_Ll,_Mn,_Mn,_Mn,_Lu,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_No,_Po,_Po},
21926   {_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn},
21927   {_Po,_Po,_Pi,_Pf,_Pi,_Pf,_Po,_Po,_Po,_Pi,_Pf,_Po,_Pi,_Pf,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Pd,_Po,_Po,_Pd,_Po,_Pi,_Pf,_Po,_Po,_Pi,_Pf,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Po,_Po,_Po,_Po,_Po,_Lm,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Pd,_Pd,_Po,_Po,_Po,_Po,_Pd,_Po,_Ps,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21928   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn},
21929   {_Zs,_Po,_Po,_Po,_So,_Lm,_Lo,_Nl,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_So,_So,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Pd,_Ps,_Pe,_Pe,_So,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Pd,_Lm,_Lm,_Lm,_Lm,_Lm,_So,_So,_Nl,_Nl,_Nl,_Lm,_Lo,_Po,_So,_So,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Sk,_Sk,_Lm,_Lm,_Lo,_Pd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Lm,_Lm,_Lm,_Lo},
21930   {_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_So,_So,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo},
21931   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn},
21932   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So},
21933   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21934   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo},
21935   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Po,_Po},
21936   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lo,_Mn,_Me,_Me,_Me,_Po,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Lm,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lm,_Lm,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21937   {_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lm,_Sk,_Sk,_Lu,_Ll,_Lu,_Ll,_Lo,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lm,_Lm,_Ll,_Lo,_Lo,_Lo,_Lo,_Lo},
21938   {_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Mc,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_So,_So,_Sc,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Lo,_Po,_Lo,_Cn,_Cn},
21939   {_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Lm,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn},
21940   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_Lo,_Mc,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Mn,_Mn,_Mn,_Lo,_Lo,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Lo,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lm,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mn,_Mc,_Mc,_Po,_Po,_Lo,_Lm,_Lm,_Mc,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21941   {_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sk,_Lm,_Lm,_Lm,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mc,_Mc,_Mn,_Mc,_Mc,_Po,_Mc,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21942   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn},
21943   {_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs},
21944   {_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co},
21945   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21946   {_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo},
21947   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Pe,_Ps,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sc,_So,_Cn,_Cn},
21948   {_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Ps,_Pe,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Pd,_Pd,_Pc,_Pc,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Po,_Po,_Ps,_Pe,_Po,_Po,_Po,_Po,_Pc,_Pc,_Pc,_Po,_Po,_Po,_Cn,_Po,_Po,_Po,_Po,_Pd,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Po,_Po,_Po,_Sm,_Pd,_Sm,_Sm,_Sm,_Cn,_Po,_Sc,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cf},
21949   {_Cn,_Po,_Po,_Po,_Sc,_Po,_Po,_Po,_Ps,_Pe,_Po,_Sm,_Po,_Pd,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Sm,_Sm,_Sm,_Po,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ps,_Po,_Pe,_Sk,_Pc,_Sk,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ps,_Sm,_Pe,_Sm,_Ps,_Pe,_Po,_Ps,_Pe,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Sc,_Sc,_Sm,_Sk,_So,_Sc,_Sc,_Cn,_So,_Sm,_Sm,_Sm,_Sm,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cf,_Cf,_Cf,_So,_So,_Cn,_Cn},
21950   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn},
21951   {_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_So,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Cn,_Cn},
21952   {_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn},
21953   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nl,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nl,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Nl,_Nl,_Nl,_Nl,_Nl,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21954   {_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21955   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21956   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21957   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Po,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No},
21958   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_No,_No,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No},
21959   {_Lo,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Mn,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21960   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21961   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No},
21962   {_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21963   {_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21964   {_Mc,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Po,_Po,_Cf,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21965   {_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Po,_Po,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Po,_Po,_Mn,_Mn,_Mn,_Po,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Po,_Lo,_Po,_Po,_Po,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21966   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21967   {_Mn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Mc,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21968   {_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Lo,_Lo,_Po,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21969   {_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21970   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21971   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_Po,_Po,_Po,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21972   {_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo},
21973   {_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21974   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21975   {_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Cn,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo},
21976   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21977   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21978   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21979   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21980   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_So,_So,_So,_So,_Lm,_Lm,_Lm,_Lm,_Po,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_No,_No,_No,_No,_No,_No,_No,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21981   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21982   {_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21983   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_So,_Mn,_Mn,_Po,_Cf,_Cf,_Cf,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21984   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21985   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mc,_Mc,_Mn,_Mn,_Mn,_So,_So,_So,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21986   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Mn,_Mn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21987   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21988   {_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Cn,_Lu,_Lu,_Cn,_Cn,_Lu,_Cn,_Cn,_Lu,_Lu,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll},
21989   {_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll},
21990   {_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll},
21991   {_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd},
21992   {_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_So,_So,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21993   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21994   {_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Sm,_Sm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21995   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21996   {_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So},
21997   {_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
21998   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sk,_Sk,_Sk,_Sk,_Sk},
21999   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So},
22000   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
22001   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
22002   {_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
22003   {_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
22004   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
22005   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo},
22006   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo},
22007   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
22008   {_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
22009   {_Cn,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
22010   {_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn},
22011   {_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Cn,_Cn}
22012 };
22013 
22014 const uint8_t unicode::othercase_index[unicode::CHARS >> 8] = {
22015   0,1,2,3,4,5,6,6,6,6,6,6,6,6,6,6,7,6,6,8,6,6,6,6,6,6,6,6,6,9,10,11,6,12,6,6,13,6,6,6,6,6,6,6,14,15,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,16,17,6,6,6,18,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,19,6,6,6,6,20,6,6,6,6,6,6,6,21,6,6,6,6,6,6,6,6,6,6,6,22,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
,6,6,
22016     6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
6,6,6,6,6,
22017     6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
6,6,6,6,6,
22018     6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
6,6,6,6,6,
22019     6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
22020 };
22021 
22022 const char32_t unicode::othercase_block[][256] = {
22023   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24833,25089,25345,25601,25857,26113,26369,26625,26881,27137,27393,27649,27905,28161,28417,28673,28929,29185,29441,29697,29953,30209,30465,30721,30977,31233,0,0,0,0,0,0,16642,16898,17154,17410,17666,17922,18178,18434,18690,18946,19202,19458,19714,19970,20226,20482,20738,20994,21250,21506,21762,22018,22274,22530,22786,23042,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,236546,0,0,0,0,0,0,0,0,0,0,57345,57601,57857,58113,58369,58625,58881,59137,59393,59649,59905,60161,60417,60673,60929,61185,61441,61697,61953,62209,62465,62721,62977,0,63489,63745,64001,64257,64513,64769,65025,0,49154,49410,49666,49922,50178,50434,50690,50946,51202,51458,51714,51970,52226,52482,52738,52994,53250,53506,53762,54018,54274,54530,54786,0,55298,55554,55810,56066,56322,56578,56834,96258},
22024   {65793,65538,66305,66050,66817,66562,67329,67074,67841,67586,68353,68098,68865,68610,69377,69122,69889,69634,70401,70146,70913,70658,71425,71170,71937,71682,72449,72194,72961,72706,73473,73218,73985,73730,74497,74242,75009,74754,75521,75266,76033,75778,76545,76290,77057,76802,77569,77314,26881,18690,78593,78338,79105,78850,79617,79362,0,80385,80130,80897,80642,81409,81154,81921,81666,82433,82178,82945,82690,83457,83202,83969,83714,0,84737,84482,85249,84994,85761,85506,86273,86018,86785,86530,87297,87042,87809,87554,88321,88066,88833,88578,89345,89090,89857,89602,90369,90114,90881,90626,91393,91138,91905,91650,92417,92162,92929,92674,93441,93186,93953,93698,94465,94210,94977,94722,95489,95234,96001,95746,65281,96769,96514,97281,97026,97793,97538,21250,148226,152321,99073,98818,99585,99330,152577,100353,100098,153089,153345,101377,101122,0,122113,153857,154369,102913,102658,155649,156417,128514,157953,157697,104705,104450,146690,0,159489,160257,139266,161025,106753,106498,107265,107010,107777,107522,163841,108545,108290,164609,0,0,109825,109570,165889,110593,110338,166401,166657,111617,111362,112129,111874,168449,112897,112642,0,0,113921,113666,0,128770,0,0,0,0,115973,116227,115716,116741,116995,116484,117509,117763,117252,118273,118018,118785,118530,119297,119042,119809,119554,120321,120066,120833,120578,121345,121090,121857,121602,101890,122625,122370,123137,122882,123649,123394,124161,123906,124673,124418,125185,124930,125697,125442,126209,125954,126721,126466,0,127493,127747,127236,128257,128002,103681,114433,129281,129026,129793,129538,130305,130050,130817,130562},
22025   {131329,131074,131841,131586,132353,132098,132865,132610,133377,133122,133889,133634,134401,134146,134913,134658,135425,135170,135937,135682,136449,136194,136961,136706,137473,137218,137985,137730,138497,138242,139009,138754,105985,0,140033,139778,140545,140290,141057,140802,141569,141314,142081,141826,142593,142338,143105,142850,143617,143362,144129,143874,0,0,0,0,0,0,2909441,146433,146178,104961,2909697,2915842,2916098,147969,147714,98305,166145,166913,149249,148994,149761,149506,150273,150018,150785,150530,151297,151042,2912002,2911490,2912258,98562,99842,0,100610,100866,0,102146,0,102402,10988290,0,0,0,103170,10988546,0,103426,0,10980610,10988034,0,104194,103938,0,2908674,10988802,0,0,105474,0,2911746,105730,0,0,106242,0,0,0,0,0,0,0,2909186,0,0,108034,0,0,108802,0,0,0,10989826,110082,148482,110850,111106,148738,0,0,0,0,0,112386,0,0,0,0,0,0,0,0,0,0,10990082,10989570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22026   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235778,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,225537,225282,226049,225794,0,0,227073,226818,0,0,0,261378,261634,261890,0,258817,0,0,0,0,0,0,240641,0,240897,241153,241409,0,248833,0,249089,249345,0,241921,242177,242433,242689,242945,243201,243457,243713,243969,244225,244481,244737,244993,245249,245505,245761,246017,0,246529,246785,247041,247297,247553,247809,248065,248321,248577,230914,231426,231682,231938,0,233730,233986,234242,234498,234754,235010,235266,235522,235778,236034,236290,236546,236802,237058,237314,237570,237826,238338,238338,238594,238850,239106,239362,239618,239874,240130,240386,232450,232962,233218,251649,233986,235522,0,0,0,239106,237570,249602,252161,251906,252673,252418,253185,252930,253697,253442,254209,253954,254721,254466,255233,254978,255745,255490,256257,256002,256769,256514,257281,257026,257793,257538,236034,237826,260354,229122,243713,234754,0,260097,259842,258561,260865,260610,0,228097,228353,228609},
22027   {282625,282881,283137,283393,283649,283905,284161,284417,284673,284929,285185,285441,285697,285953,286209,286465,274433,274689,274945,275201,275457,275713,275969,276225,276481,276737,276993,277249,277505,277761,278017,278273,278529,278785,279041,279297,279553,279809,280065,280321,280577,280833,281089,281345,281601,281857,282113,282369,266242,266498,266754,267010,267266,267522,267778,268034,268290,268546,268802,269058,269314,269570,269826,270082,270338,270594,270850,271106,271362,271618,271874,272130,272386,272642,272898,273154,273410,273666,273922,274178,262146,262402,262658,262914,263170,263426,263682,263938,264194,264450,264706,264962,265218,265474,265730,265986,286977,286722,287489,287234,288001,287746,288513,288258,289025,288770,289537,289282,290049,289794,290561,290306,291073,290818,291585,291330,292097,291842,292609,292354,293121,292866,293633,293378,294145,293890,294657,294402,295169,294914,0,0,0,0,0,0,0,0,297729,297474,298241,297986,298753,298498,299265,299010,299777,299522,300289,300034,300801,300546,301313,301058,301825,301570,302337,302082,302849,302594,303361,303106,303873,303618,304385,304130,304897,304642,305409,305154,305921,305666,306433,306178,306945,306690,307457,307202,307969,307714,308481,308226,308993,308738,309505,309250,310017,309762,310529,310274,311041,310786,315137,311809,311554,312321,312066,312833,312578,313345,313090,313857,313602,314369,314114,314881,314626,311298,315649,315394,316161,315906,316673,316418,317185,316930,317697,317442,318209,317954,318721,318466,319233,318978,319745,319490,320257,320002,320769,320514,321281,321026,321793,321538,322305,322050,322817,322562,323329,323074,323841,323586,324353,324098,324865,324610,325377,325122,325889,325634,326401,326146,326913,326658,327425,327170},
22028   {327937,327682,328449,328194,328961,328706,329473,329218,329985,329730,330497,330242,331009,330754,331521,331266,332033,331778,332545,332290,333057,332802,333569,333314,334081,333826,334593,334338,335105,334850,335617,335362,336129,335874,336641,336386,337153,336898,337665,337410,338177,337922,338689,338434,339201,338946,339713,339458,0,352513,352769,353025,353281,353537,353793,354049,354305,354561,354817,355073,355329,355585,355841,356097,356353,356609,356865,357121,357377,357633,357889,358145,358401,358657,358913,359169,359425,359681,359937,360193,360449,360705,360961,361217,361473,361729,361985,0,0,0,0,0,0,0,0,0,0,340226,340482,340738,340994,341250,341506,341762,342018,342274,342530,342786,343042,343298,343554,343810,344066,344322,344578,344834,345090,345346,345602,345858,346114,346370,346626,346882,347138,347394,347650,347906,348162,348418,348674,348930,349186,349442,349698,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22029   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22030   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2949121,2949377,2949633,2949889,2950145,2950401,2950657,2950913,2951169,2951425,2951681,2951937,2952193,2952449,2952705,2952961,2953217,2953473,2953729,2953985,2954241,2954497,2954753,2955009,2955265,2955521,2955777,2956033,2956289,2956545,2956801,2957057,2957313,2957569,2957825,2958081,2958337,2958593,0,2959105,0,0,0,0,0,2960641,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22031   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11235329,11235585,11235841,11236097,11236353,11236609,11236865,11237121,11237377,11237633,11237889,11238145,11238401,11238657,11238913,11239169,11239425,11239681,11239937,11240193,11240449,11240705,11240961,11241217,11241473,11241729,11241985,11242241,11242497,11242753,11243009,11243265,11243521,11243777,11244033,11244289,11244545,11244801,11245057,11245313,11245569,11245825,11246081,11246337,11246593,11246849,11247105,11247361,11247617,11247873,11248129,11248385,11248641,11248897,11249153,11249409,11249665,11249921,11250177,11250433,11250689,11250945,11251201,11251457,11251713,11251969,11252225,11252481,11252737,11252993,11253249,11253505,11253761,11254017,11254273,11254529,11254785,11255041,11255297,11255553,1308673,1308929,1309185,1309441,1309697,1309953,0,0,1306626,1306882,1307138,1307394,1307650,1307906,0,0},
22032   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10976514,0,0,0,2908930,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22033   {1966337,1966082,1966849,1966594,1967361,1967106,1967873,1967618,1968385,1968130,1968897,1968642,1969409,1969154,1969921,1969666,1970433,1970178,1970945,1970690,1971457,1971202,1971969,1971714,1972481,1972226,1972993,1972738,1973505,1973250,1974017,1973762,1974529,1974274,1975041,1974786,1975553,1975298,1976065,1975810,1976577,1976322,1977089,1976834,1977601,1977346,1978113,1977858,1978625,1978370,1979137,1978882,1979649,1979394,1980161,1979906,1980673,1980418,1981185,1980930,1981697,1981442,1982209,1981954,1982721,1982466,1983233,1982978,1983745,1983490,1984257,1984002,1984769,1984514,1985281,1985026,1985793,1985538,1986305,1986050,1986817,1986562,1987329,1987074,1987841,1987586,1988353,1988098,1988865,1988610,1989377,1989122,1989889,1989634,1990401,1990146,1990913,1990658,1991425,1991170,1991937,1991682,1992449,1992194,1992961,1992706,1993473,1993218,1993985,1993730,1994497,1994242,1995009,1994754,1995521,1995266,1996033,1995778,1996545,1996290,1997057,1996802,1997569,1997314,1998081,1997826,1998593,1998338,1999105,1998850,1999617,1999362,2000129,1999874,2000641,2000386,2001153,2000898,2001665,2001410,2002177,2001922,2002689,2002434,2003201,2002946,2003713,2003458,2004225,2003970,0,0,0,0,0,1990658,0,0,57089,0,2007297,2007042,2007809,2007554,2008321,2008066,2008833,2008578,2009345,2009090,2009857,2009602,2010369,2010114,2010881,2010626,2011393,2011138,2011905,2011650,2012417,2012162,2012929,2012674,2013441,2013186,2013953,2013698,2014465,2014210,2014977,2014722,2015489,2015234,2016001,2015746,2016513,2016258,2017025,2016770,2017537,2017282,2018049,2017794,2018561,2018306,2019073,2018818,2019585,2019330,2020097,2019842,2020609,2020354,2021121,2020866,2021633,2021378,2022145,2021890,2022657,2022402,2023169,2022914,2023681,2023426,2024193,2023938,2024705,2024450,2025217,2024962,2025729,2025474,2026241,2025986,2026753,2026498,2027265,2027010,2027777,2027522,2028289,2028034,2028801,2028546,2029313,2029058,2029825,2029570,2030337,2030082,2030849,2030594,2031361,
22034     2031106},
22035   {2033666,2033922,2034178,2034434,2034690,2034946,2035202,2035458,2031617,2031873,2032129,2032385,2032641,2032897,2033153,2033409,2037762,2038018,2038274,2038530,2038786,2039042,0,0,2035713,2035969,2036225,2036481,2036737,2036993,0,0,2041858,2042114,2042370,2042626,2042882,2043138,2043394,2043650,2039809,2040065,2040321,2040577,2040833,2041089,2041345,2041601,2045954,2046210,2046466,2046722,2046978,2047234,2047490,2047746,2043905,2044161,2044417,2044673,2044929,2045185,2045441,2045697,2050050,2050306,2050562,2050818,2051074,2051330,0,0,2048001,2048257,2048513,2048769,2049025,2049281,0,0,0,2054402,0,2054914,0,2055426,0,2055938,0,2052353,0,2052865,0,2053377,0,2053889,2058242,2058498,2058754,2059010,2059266,2059522,2059778,2060034,2056193,2056449,2056705,2056961,2057217,2057473,2057729,2057985,2079234,2079490,2082818,2083074,2083330,2083586,2087426,2087682,2095106,2095362,2091522,2091778,2095618,2095874,0,0,2066434,2066690,2066946,2067202,2067458,2067714,2067970,2068226,2064385,2064641,2064897,2065153,2065409,2065665,2065921,2066177,2070530,2070786,2071042,2071298,2071554,2071810,2072066,2072322,2068481,2068737,2068993,2069249,2069505,2069761,2070017,2070273,2074626,2074882,2075138,2075394,2075650,2075906,2076162,2076418,2072577,2072833,2073089,2073345,2073601,2073857,2074113,2074369,2078722,2078978,0,2079746,0,0,0,0,2076673,2076929,2060289,2060545,2077441,0,235778,0,0,0,0,2083842,0,0,0,0,2060801,2061057,2061313,2061569,2081537,0,0,0,2086914,2087170,0,0,0,0,0,0,2084865,2085121,2061825,2062081,0,0,0,0,2091010,2091266,0,0,0,2092034,0,0,2088961,2089217,2062849,2063105,2090241,0,0,0,0,0,0,2096130,0,0,0,0,2062337,2062593,2063361,2063617,2093825,0,0,0},
22036   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248065,0,0,0,27393,58625,0,0,0,0,0,0,2182657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2175490,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2191361,2191617,2191873,2192129,2192385,2192641,2192897,2193153,2193409,2193665,2193921,2194177,2194433,2194689,2194945,2195201,2187266,2187522,2187778,2188034,2188290,2188546,2188802,2189058,2189314,2189570,2189826,2190082,2190338,2190594,2190850,2191106,0,0,0,2196481,2196226,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22037   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2412545,2412801,2413057,2413313,2413569,2413825,2414081,2414337,2414593,2414849,2415105,2415361,2415617,2415873,2416129,2416385,2416641,2416897,2417153,2417409,2417665,2417921,2418177,2418433,2418689,2418945,2405890,2406146,2406402,2406658,2406914,2407170,2407426,2407682,2407938,2408194,2408450,2408706,2408962,2409218,2409474,2409730,2409986,2410242,2410498,2410754,2411010,2411266,2411522,2411778,2412034,2412290,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22038   {2895873,2896129,2896385,2896641,2896897,2897153,2897409,2897665,2897921,2898177,2898433,2898689,2898945,2899201,2899457,2899713,2899969,2900225,2900481,2900737,2900993,2901249,2901505,2901761,2902017,2902273,2902529,2902785,2903041,2903297,2903553,2903809,2904065,2904321,2904577,2904833,2905089,2905345,2905601,2905857,2906113,2906369,2906625,2906881,2907137,2907393,2907649,0,2883586,2883842,2884098,2884354,2884610,2884866,2885122,2885378,2885634,2885890,2886146,2886402,2886658,2886914,2887170,2887426,2887682,2887938,2888194,2888450,2888706,2888962,2889218,2889474,2889730,2889986,2890242,2890498,2890754,2891010,2891266,2891522,2891778,2892034,2892290,2892546,2892802,2893058,2893314,2893570,2893826,2894082,2894338,2894594,2894850,2895106,2895362,0,2908417,2908162,158465,1932545,163073,145922,146946,2910209,2909954,2910721,2910466,2911233,2910978,151809,160001,151553,152065,0,2913025,2912770,0,2913793,2913538,0,0,0,0,0,0,0,147201,147457,2916609,2916354,2917121,2916866,2917633,2917378,2918145,2917890,2918657,2918402,2919169,2918914,2919681,2919426,2920193,2919938,2920705,2920450,2921217,2920962,2921729,2921474,2922241,2921986,2922753,2922498,2923265,2923010,2923777,2923522,2924289,2924034,2924801,2924546,2925313,2925058,2925825,2925570,2926337,2926082,2926849,2926594,2927361,2927106,2927873,2927618,2928385,2928130,2928897,2928642,2929409,2929154,2929921,2929666,2930433,2930178,2930945,2930690,2931457,2931202,2931969,2931714,2932481,2932226,2932993,2932738,2933505,2933250,2934017,2933762,2934529,2934274,2935041,2934786,2935553,2935298,2936065,2935810,2936577,2936322,2937089,2936834,2937601,2937346,2938113,2937858,2938625,2938370,2939137,2938882,2939649,2939394,2940161,2939906,2940673,2940418,2941185,2940930,2941697,2941442,0,0,0,0,0,0,0,2944001,2943746,2944513,2944258,0,0,0,2945793,2945538,0,0,0,0,0,0,0,0,0,0,0,0},
22039   {1089538,1089794,1090050,1090306,1090562,1090818,1091074,1091330,1091586,1091842,1092098,1092354,1092610,1092866,1093122,1093378,1093634,1093890,1094146,1094402,1094658,1094914,1095170,1095426,1095682,1095938,1096194,1096450,1096706,1096962,1097218,1097474,1097730,1097986,1098242,1098498,1098754,1099010,0,1099522,0,0,0,0,0,1101058,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22040   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10895617,10895362,10896129,10895874,10896641,10896386,10897153,10896898,10897665,10897410,10898177,10897922,10898689,10898434,10899201,10898946,10899713,10899458,10900225,10899970,10900737,10900482,10901249,10900994,10901761,10901506,10902273,10902018,10902785,10902530,10903297,10903042,10903809,10903554,10904321,10904066,10904833,10904578,10905345,10905090,10905857,10905602,10906369,10906114,10906881,10906626,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10912001,10911746,10912513,10912258,10913025,10912770,10913537,10913282,10914049,10913794,10914561,10914306,10915073,10914818,10915585,10915330,10916097,10915842,10916609,10916354,10917121,10916866,10917633,10917378,10918145,10917890,10918657,10918402,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22041   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10953473,10953218,10953985,10953730,10954497,10954242,10955009,10954754,10955521,10955266,10956033,10955778,10956545,10956290,0,0,10957569,10957314,10958081,10957826,10958593,10958338,10959105,10958850,10959617,10959362,10960129,10959874,10960641,10960386,10961153,10960898,10961665,10961410,10962177,10961922,10962689,10962434,10963201,10962946,10963713,10963458,10964225,10963970,10964737,10964482,10965249,10964994,10965761,10965506,10966273,10966018,10966785,10966530,10967297,10967042,10967809,10967554,10968321,10968066,10968833,10968578,10969345,10969090,10969857,10969602,10970369,10970114,10970881,10970626,10971393,10971138,10971905,10971650,10972417,10972162,10972929,10972674,0,0,0,0,0,0,0,0,0,10975745,10975490,10976257,10976002,1931521,10977025,10976770,10977537,10977282,10978049,10977794,10978561,10978306,10979073,10978818,0,0,0,10980353,10980098,156929,0,0,10981633,10981378,10982145,10981890,0,0,10983169,10982914,10983681,10983426,10984193,10983938,10984705,10984450,10985217,10984962,10985729,10985474,10986241,10985986,10986753,10986498,10987265,10987010,10987777,10987522,157185,154625,155905,158721,0,0,171521,165633,171265,11227905,10990849,10990594,10991361,10991106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22042   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10990338,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1286146,1286402,1286658,1286914,1287170,1287426,1287682,1287938,1288194,1288450,1288706,1288962,1289218,1289474,1289730,1289986,1290242,1290498,1290754,1291010,1291266,1291522,1291778,1292034,1292290,1292546,1292802,1293058,1293314,1293570,1293826,1294082,1294338,1294594,1294850,1295106,1295362,1295618,1295874,1296130,1296386,1296642,1296898,1297154,1297410,1297666,1297922,1298178,1298434,1298690,1298946,1299202,1299458,1299714,1299970,1300226,1300482,1300738,1300994,1301250,1301506,1301762,1302018,1302274,1302530,1302786,1303042,1303298,1303554,1303810,1304066,1304322,1304578,1304834,1305090,1305346,1305602,1305858,1306114,1306370,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22043   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16728321,16728577,16728833,16729089,16729345,16729601,16729857,16730113,16730369,16730625,16730881,16731137,16731393,16731649,16731905,16732161,16732417,16732673,16732929,16733185,16733441,16733697,16733953,16734209,16734465,16734721,0,0,0,0,0,0,16720130,16720386,16720642,16720898,16721154,16721410,16721666,16721922,16722178,16722434,16722690,16722946,16723202,16723458,16723714,16723970,16724226,16724482,16724738,16724994,16725250,16725506,16725762,16726018,16726274,16726530,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22044   {17049601,17049857,17050113,17050369,17050625,17050881,17051137,17051393,17051649,17051905,17052161,17052417,17052673,17052929,17053185,17053441,17053697,17053953,17054209,17054465,17054721,17054977,17055233,17055489,17055745,17056001,17056257,17056513,17056769,17057025,17057281,17057537,17057793,17058049,17058305,17058561,17058817,17059073,17059329,17059585,17039362,17039618,17039874,17040130,17040386,17040642,17040898,17041154,17041410,17041666,17041922,17042178,17042434,17042690,17042946,17043202,17043458,17043714,17043970,17044226,17044482,17044738,17044994,17045250,17045506,17045762,17046018,17046274,17046530,17046786,17047042,17047298,17047554,17047810,17048066,17048322,17048578,17048834,17049090,17049346,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
22045   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17612801,17613057,17613313,17613569,17613825,17614081,17614337,17614593,17614849,17615105,17615361,17615617,17615873,17616129,17616385,17616641,17616897,17617153,17617409,17617665,17617921,17618177,17618433,17618689,17618945,17619201,17619457,17619713,17619969,17620225,17620481,17620737,17620993,17621249,17621505,17621761,17622017,17622273,17622529,17622785,17623041,17623297,17623553,17623809,17624065,17624321,17624577,17624833,17625089,17625345,17625601,0,0,0,0,0,0,0,0,0,0,0,0,0,17596418,17596674,17596930,17597186,17597442,17597698,17597954,17598210,17598466,17598722,17598978,17599234,17599490,17599746,17600002,17600258,17600514,17600770,17601026,17601282,17601538,17601794,17602050,17602306,17602562,17602818,17603074,17603330,17603586,17603842,17604098,17604354,17604610,17604866,17605122,17605378,17605634,17605890,17606146,17606402,17606658,17606914,17607170,17607426,17607682,17607938,17608194,17608450,17608706,17608962,17609218,0,0,0,0,0,0,0,0,0,0,0,0,0},
22046   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,18399233,18399489,18399745,18400001,18400257,18400513,18400769,18401025,18401281,18401537,18401793,18402049,18402305,18402561,18402817,18403073,18403329,18403585,18403841,18404097,18404353,18404609,18404865,18405121,18405377,18405633,18405889,18406145,18406401,18406657,18406913,18407169,18391042,18391298,18391554,18391810,18392066,18392322,18392578,18392834,18393090,18393346,18393602,18393858,18394114,18394370,18394626,18394882,18395138,18395394,18395650,18395906,18396162,18396418,18396674,18396930,18397186,18397442,18397698,18397954,18398210,18398466,18398722,18398978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
22047 };
22048 
22049 } // namespace unilib
22050 
22051 /////////
22052 // File: unilib/utf8.cpp
22053 /////////
22054 
22055 // This file is part of UniLib <http://github.com/ufal/unilib/>.
22056 //
22057 // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
22058 // Mathematics and Physics, Charles University in Prague, Czech Republic.
22059 //
22060 // This Source Code Form is subject to the terms of the Mozilla Public
22061 // License, v. 2.0. If a copy of the MPL was not distributed with this
22062 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
22063 //
22064 // UniLib version: 3.1.1
22065 // Unicode version: 8.0.0
22066 
22067 namespace unilib {
22068 
valid(const char * str)22069 bool utf8::valid(const char* str) {
22070   for (; *str; str++)
22071     if (((unsigned char)*str) >= 0x80) {
22072       if (((unsigned char)*str) < 0xC0) return false;
22073       else if (((unsigned char)*str) < 0xE0) {
22074         str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22075       } else if (((unsigned char)*str) < 0xF0) {
22076         str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22077         str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22078       } else if (((unsigned char)*str) < 0xF8) {
22079         str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22080         str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22081         str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22082       } else return false;
22083     }
22084   return true;
22085 }
22086 
valid(const char * str,size_t len)22087 bool utf8::valid(const char* str, size_t len) {
22088   for (; len > 0; str++, len--)
22089     if (((unsigned char)*str) >= 0x80) {
22090       if (((unsigned char)*str) < 0xC0) return false;
22091       else if (((unsigned char)*str) < 0xE0) {
22092         str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22093       } else if (((unsigned char)*str) < 0xF0) {
22094         str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22095         str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22096       } else if (((unsigned char)*str) < 0xF8) {
22097         str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22098         str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22099         str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22100       } else return false;
22101     }
22102   return true;
22103 }
22104 
decode(const char * str,std::u32string & decoded)22105 void utf8::decode(const char* str, std::u32string& decoded) {
22106   decoded.clear();
22107 
22108   for (char32_t chr; (chr = decode(str)); )
22109     decoded.push_back(chr);
22110 }
22111 
decode(const char * str,size_t len,std::u32string & decoded)22112 void utf8::decode(const char* str, size_t len, std::u32string& decoded) {
22113   decoded.clear();
22114 
22115   while (len)
22116     decoded.push_back(decode(str, len));
22117 }
22118 
encode(const std::u32string & str,std::string & encoded)22119 void utf8::encode(const std::u32string& str, std::string& encoded) {
22120   encoded.clear();
22121 
22122   for (auto&& chr : str)
22123     append(encoded, chr);
22124 }
22125 
// Out-of-line definition for the static const member (presumably declared
// with its initializer in the utf8 header — confirm there); required
// pre-C++17 whenever the constant is odr-used.
const char utf8::REPLACEMENT_CHAR;
22127 
22128 } // namespace unilib
22129 
22130 /////////
22131 // File: unilib/version.cpp
22132 /////////
22133 
22134 // This file is part of UniLib <http://github.com/ufal/unilib/>.
22135 //
22136 // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
22137 // Mathematics and Physics, Charles University in Prague, Czech Republic.
22138 //
22139 // This Source Code Form is subject to the terms of the Mozilla Public
22140 // License, v. 2.0. If a copy of the MPL was not distributed with this
22141 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
22142 //
22143 // UniLib version: 3.1.1
22144 // Unicode version: 8.0.0
22145 
22146 namespace unilib {
22147 
22148 // Returns current version.
current()22149 version version::current() {
22150   return {3, 1, 1, ""};
22151 }
22152 
22153 } // namespace unilib
22154 
22155 /////////
22156 // File: utils/compressor_load.cpp
22157 /////////
22158 
22159 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
22160 //
22161 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
22162 // Mathematics and Physics, Charles University in Prague, Czech Republic.
22163 //
22164 // This Source Code Form is subject to the terms of the Mozilla Public
22165 // License, v. 2.0. If a copy of the MPL was not distributed with this
22166 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
22167 
22168 namespace utils {
22169 
22170 // Start of LZMA compression library by Igor Pavlov
22171 namespace lzma {
22172 
22173 // Types.h -- Basic types
22174 // 2010-10-09 : Igor Pavlov : Public domain
22175 #ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H
22176 #define UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H
22177 
// Result codes shared by all LZMA routines below: SZ_OK on success,
// otherwise one of the SZ_ERROR_* values.
#define SZ_OK 0

#define SZ_ERROR_DATA 1          // corrupted compressed data
#define SZ_ERROR_MEM 2           // memory allocation failed
#define SZ_ERROR_CRC 3           // CRC check failed
#define SZ_ERROR_UNSUPPORTED 4   // unsupported properties/method
#define SZ_ERROR_PARAM 5         // invalid argument
#define SZ_ERROR_INPUT_EOF 6     // more input bytes were required
#define SZ_ERROR_OUTPUT_EOF 7    // output buffer too small
#define SZ_ERROR_READ 8          // read callback failed
#define SZ_ERROR_WRITE 9         // write callback failed
#define SZ_ERROR_PROGRESS 10     // progress callback requested abort
#define SZ_ERROR_FAIL 11         // unspecified failure
#define SZ_ERROR_THREAD 12       // threading error

#define SZ_ERROR_ARCHIVE 16      // archive-level error
#define SZ_ERROR_NO_ARCHIVE 17   // no archive found

// Common result type: SZ_OK (0) or an SZ_ERROR_* code.
typedef int SRes;

// "Return If Not OK": propagates a non-zero result code to the caller.
#ifndef RINOK
#define RINOK(x) { int __result__ = (x); if (__result__ != 0) return __result__; }
#endif
22200 #endif
22201 
22202 /* The following interfaces use first parameter as pointer to structure */
22203 
// Byte-oriented input callback; 'p' is a pointer to the structure itself
// (C-style virtual method, per the note above).
struct IByteIn
{
  uint8_t (*Read)(void *p); /* reads one byte, returns 0 in case of EOF or error */
};

// Byte-oriented output callback.
struct IByteOut
{
  void (*Write)(void *p, uint8_t b);
};

// Sequential (non-seekable) input stream callback.
struct ISeqInStream
{
  SRes (*Read)(void *p, void *buf, size_t *size);
    /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream.
       (output(*size) < input(*size)) is allowed */
};

// Helpers built on ISeqInStream::Read (defined elsewhere in this file).
/* it can return SZ_ERROR_INPUT_EOF */
SRes SeqInStream_Read(ISeqInStream *stream, void *buf, size_t size);
SRes SeqInStream_Read2(ISeqInStream *stream, void *buf, size_t size, SRes errorType);
SRes SeqInStream_ReadByte(ISeqInStream *stream, uint8_t *buf);
22225 
// Sequential output stream callback.
struct ISeqOutStream
{
  size_t (*Write)(void *p, const void *buf, size_t size);
    /* Returns: result - the number of actually written bytes.
       (result < size) means error */
};

// Seek origin, mirroring SEEK_SET / SEEK_CUR / SEEK_END from <cstdio>.
enum ESzSeek
{
  SZ_SEEK_SET = 0,
  SZ_SEEK_CUR = 1,
  SZ_SEEK_END = 2
};

// Seekable input stream: sequential reads plus random access.
struct ISeekInStream
{
  SRes (*Read)(void *p, void *buf, size_t *size);  /* same as ISeqInStream::Read */
  SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin);
};
22245 
// Input stream with look-ahead: Look exposes a window into an internal
// buffer, Skip consumes from that window; Read/Seek bypass the buffer.
struct ILookInStream
{
  SRes (*Look)(void *p, const void **buf, size_t *size);
    /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream.
       (output(*size) > input(*size)) is not allowed
       (output(*size) < input(*size)) is allowed */
  SRes (*Skip)(void *p, size_t offset);
    /* offset must be <= output(*size) of Look */

  SRes (*Read)(void *p, void *buf, size_t *size);
    /* reads directly (without buffer). It's same as ISeqInStream::Read */
  SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin);
};

// Helpers built on top of ILookInStream (defined elsewhere in this file).
SRes LookInStream_LookRead(ILookInStream *stream, void *buf, size_t *size);
SRes LookInStream_SeekTo(ILookInStream *stream, uint64_t offset);

/* reads via ILookInStream::Read */
SRes LookInStream_Read2(ILookInStream *stream, void *buf, size_t size, SRes errorType);
SRes LookInStream_Read(ILookInStream *stream, void *buf, size_t size);
22266 
#define LookToRead_BUF_SIZE (1 << 14)

// Adapter: implements ILookInStream on top of a plain ISeekInStream by
// buffering up to LookToRead_BUF_SIZE bytes.
struct CLookToRead
{
  ILookInStream s;          // embedded interface; presumably the vtable callbacks cast &s back to CLookToRead — confirm in the .c part
  ISeekInStream *realStream;
  size_t pos;               // current read position inside buf
  size_t size;              // number of valid bytes in buf
  uint8_t buf[LookToRead_BUF_SIZE];
};

void LookToRead_CreateVTable(CLookToRead *p, int lookahead);
void LookToRead_Init(CLookToRead *p);

// Adapter: presents an ILookInStream as a sequential ISeqInStream
// (reading through the look-ahead buffer).
struct CSecToLook
{
  ISeqInStream s;
  ILookInStream *realStream;
};

void SecToLook_CreateVTable(CSecToLook *p);

// Adapter: presents an ILookInStream as a sequential ISeqInStream
// (reading directly, bypassing the look-ahead buffer).
struct CSecToRead
{
  ISeqInStream s;
  ILookInStream *realStream;
};

void SecToRead_CreateVTable(CSecToRead *p);
22296 
// Progress reporting callback; returning anything but SZ_OK aborts the operation.
struct ICompressProgress
{
  SRes (*Progress)(void *p, uint64_t inSize, uint64_t outSize);
    /* Returns: result. (result != SZ_OK) means break.
       Value (uint64_t)(int64_t)-1 for size means unknown value. */
};

// Pluggable memory allocator used by the LZMA allocation routines.
struct ISzAlloc
{
  void *(*Alloc)(void *p, size_t size);
  void (*Free)(void *p, void *address); /* address can be 0 */
};

// Convenience wrappers that pass the allocator itself as the context argument.
#define IAlloc_Alloc(p, size) (p)->Alloc((p), size)
#define IAlloc_Free(p, a) (p)->Free((p), a)
22312 
22313 #endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H
22314 
22315 // LzmaDec.h -- LZMA Decoder
22316 // 2009-02-07 : Igor Pavlov : Public domain
22317 
22318 /* #define _LZMA_PROB32 */
22319 /* _LZMA_PROB32 can increase the speed on some CPUs,
22320    but memory usage for CLzmaDec::probs will be doubled in that case */
22321 
// Probability counters are 11-bit values (kNumBitModelTotalBits below), so
// 16 bits suffice unless _LZMA_PROB32 is defined for speed (see note above).
#ifdef _LZMA_PROB32
#define CLzmaProb uint32_t
#else
#define CLzmaProb uint16_t
#endif

/* ---------- LZMA Properties ---------- */

// Serialized size (in bytes) of the LZMA properties header.
#define LZMA_PROPS_SIZE 5

// Decoded properties: literal context bits (lc), literal position bits (lp),
// position bits (pb), and the dictionary size in bytes.
struct CLzmaProps
{
  unsigned lc, lp, pb;
  uint32_t dicSize;
};
22337 
22338 /* LzmaProps_Decode - decodes properties
22339 Returns:
22340   SZ_OK
22341   SZ_ERROR_UNSUPPORTED - Unsupported properties
22342 */
22343 
SRes LzmaProps_Decode(CLzmaProps *p, const uint8_t *data, unsigned size);

/* ---------- LZMA Decoder state ---------- */

/* LZMA_REQUIRED_INPUT_MAX = number of required input bytes for worst case.
   Num bits = log2((2^11 / 31) ^ 22) + 26 < 134 + 26 = 160; */

#define LZMA_REQUIRED_INPUT_MAX 20

// Complete decoder state. Call LzmaDec_Construct before anything else so
// the owned pointers are in a defined state.
struct CLzmaDec
{
  CLzmaProps prop;        // decoded stream properties (lc, lp, pb, dicSize)
  CLzmaProb *probs;       // adaptive probability model (size given by LzmaProps_GetNumProbs)
  uint8_t *dic;           // dictionary: the sliding output window
  const uint8_t *buf;     // current read position in the input
  uint32_t range, code;   // range coder state
  size_t dicPos;          // current write position inside dic
  size_t dicBufSize;      // capacity of dic
  uint32_t processedPos;  // number of output bytes produced so far
  uint32_t checkDicSize;
  unsigned state;         // LZMA state-machine state
  uint32_t reps[4];       // the four most recent match distances
  unsigned remainLen;
  int needFlush;
  int needInitState;
  uint32_t numProbs;      // number of entries allocated in probs
  unsigned tempBufSize;
  uint8_t tempBuf[LZMA_REQUIRED_INPUT_MAX];  // buffered input bytes (worst-case symbol size)
};

// Resets the owned pointers so LzmaDec_Free* / LzmaDec_Allocate* behave
// correctly on a freshly declared structure.
#define LzmaDec_Construct(p) { (p)->dic = 0; (p)->probs = 0; }

void LzmaDec_Init(CLzmaDec *p);
22377 
22378 /* There are two types of LZMA streams:
22379      0) Stream with end mark. That end mark adds about 6 bytes to compressed size.
22380      1) Stream without end mark. You must know exact uncompressed size to decompress such stream. */
22381 
// How the decoder should treat the moment the output limit is reached.
enum ELzmaFinishMode
{
  LZMA_FINISH_ANY,   /* finish at any point */
  LZMA_FINISH_END    /* block must be finished at the end */
};

/* ELzmaFinishMode has meaning only if the decoding reaches output limit !!!

   You must use LZMA_FINISH_END, when you know that current output buffer
   covers last bytes of block. In other cases you must use LZMA_FINISH_ANY.

   If LZMA decoder sees end marker before reaching output limit, it returns SZ_OK,
   and output value of destLen will be less than output buffer size limit.
   You can check status result also.

   You can use multiple checks to test data integrity after full decompression:
     1) Check Result and "status" variable.
     2) Check that output(destLen) = uncompressedSize, if you know real uncompressedSize.
     3) Check that output(srcLen) = compressedSize, if you know real compressedSize.
        You must use correct finish mode in that case. */

// Fine-grained decoder status, reported alongside the SRes result code.
enum ELzmaStatus
{
  LZMA_STATUS_NOT_SPECIFIED,               /* use main error code instead */
  LZMA_STATUS_FINISHED_WITH_MARK,          /* stream was finished with end mark. */
  LZMA_STATUS_NOT_FINISHED,                /* stream was not finished */
  LZMA_STATUS_NEEDS_MORE_INPUT,            /* you must provide more input bytes */
  LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK  /* there is probability that stream was finished without end mark */
};
22411 
22412 /* ELzmaStatus is used only as output value for function call */
22413 
22414 /* ---------- Interfaces ---------- */
22415 
22416 /* There are 3 levels of interfaces:
22417      1) Dictionary Interface
22418      2) Buffer Interface
22419      3) One Call Interface
22420    You can select any of these interfaces, but don't mix functions from different
22421    groups for same object. */
22422 
22423 /* There are two variants to allocate state for Dictionary Interface:
22424      1) LzmaDec_Allocate / LzmaDec_Free
22425      2) LzmaDec_AllocateProbs / LzmaDec_FreeProbs
22426    You can use variant 2, if you set dictionary buffer manually.
22427    For Buffer Interface you must always use variant 1.
22428 
22429 LzmaDec_Allocate* can return:
22430   SZ_OK
22431   SZ_ERROR_MEM         - Memory allocation error
22432   SZ_ERROR_UNSUPPORTED - Unsupported properties
22433 */
22434 
// Variant 2 (see notes above): allocate only the probability model; the
// dictionary buffer is managed by the caller.
SRes LzmaDec_AllocateProbs(CLzmaDec *p, const uint8_t *props, unsigned propsSize, ISzAlloc *alloc);
void LzmaDec_FreeProbs(CLzmaDec *p, ISzAlloc *alloc);

// Variant 1: allocate both the probability model and the dictionary.
SRes LzmaDec_Allocate(CLzmaDec *state, const uint8_t *prop, unsigned propsSize, ISzAlloc *alloc);
void LzmaDec_Free(CLzmaDec *state, ISzAlloc *alloc);
22440 
22441 /* ---------- Dictionary Interface ---------- */
22442 
22443 /* You can use it, if you want to eliminate the overhead for data copying from
22444    dictionary to some other external buffer.
22445    You must work with CLzmaDec variables directly in this interface.
22446 
22447    STEPS:
22448      LzmaDec_Constr()
22449      LzmaDec_Allocate()
22450      for (each new stream)
22451      {
22452        LzmaDec_Init()
22453        while (it needs more decompression)
22454        {
22455          LzmaDec_DecodeToDic()
22456          use data from CLzmaDec::dic and update CLzmaDec::dicPos
22457        }
22458      }
22459      LzmaDec_Free()
22460 */
22461 
22462 /* LzmaDec_DecodeToDic
22463 
22464    The decoding to internal dictionary buffer (CLzmaDec::dic).
22465    You must manually update CLzmaDec::dicPos, if it reaches CLzmaDec::dicBufSize !!!
22466 
22467 finishMode:
22468   It has meaning only if the decoding reaches output limit (dicLimit).
22469   LZMA_FINISH_ANY - Decode just dicLimit bytes.
22470   LZMA_FINISH_END - Stream must be finished after dicLimit.
22471 
22472 Returns:
22473   SZ_OK
22474     status:
22475       LZMA_STATUS_FINISHED_WITH_MARK
22476       LZMA_STATUS_NOT_FINISHED
22477       LZMA_STATUS_NEEDS_MORE_INPUT
22478       LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK
22479   SZ_ERROR_DATA - Data error
22480 */
22481 
// Dictionary-interface entry point: decompresses into the decoder's own
// dictionary buffer (contract documented in the comment block above).
SRes LzmaDec_DecodeToDic(CLzmaDec *p, size_t dicLimit,
    const uint8_t *src, size_t *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status);
22484 
22485 /* ---------- Buffer Interface ---------- */
22486 
22487 /* It's zlib-like interface.
22488    See LzmaDec_DecodeToDic description for information about STEPS and return results,
22489    but you must use LzmaDec_DecodeToBuf instead of LzmaDec_DecodeToDic and you don't need
22490    to work with CLzmaDec variables manually.
22491 
22492 finishMode:
22493   It has meaning only if the decoding reaches output limit (*destLen).
22494   LZMA_FINISH_ANY - Decode just destLen bytes.
22495   LZMA_FINISH_END - Stream must be finished after (*destLen).
22496 */
22497 
// Buffer-interface entry point: zlib-style decompression into a caller
// buffer (contract documented in the comment block above).
SRes LzmaDec_DecodeToBuf(CLzmaDec *p, uint8_t *dest, size_t *destLen,
    const uint8_t *src, size_t *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status);
22500 
22501 /* ---------- One Call Interface ---------- */
22502 
22503 /* LzmaDecode
22504 
22505 finishMode:
22506   It has meaning only if the decoding reaches output limit (*destLen).
22507   LZMA_FINISH_ANY - Decode just destLen bytes.
22508   LZMA_FINISH_END - Stream must be finished after (*destLen).
22509 
22510 Returns:
22511   SZ_OK
22512     status:
22513       LZMA_STATUS_FINISHED_WITH_MARK
22514       LZMA_STATUS_NOT_FINISHED
22515       LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK
22516   SZ_ERROR_DATA - Data error
22517   SZ_ERROR_MEM  - Memory allocation error
22518   SZ_ERROR_UNSUPPORTED - Unsupported properties
22519   SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src).
22520 */
22521 
// One-call interface: decodes a whole buffer in a single invocation, using
// `alloc` for its temporary state (can return SZ_ERROR_MEM; see notes above).
SRes LzmaDecode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t *srcLen,
    const uint8_t *propData, unsigned propSize, ELzmaFinishMode finishMode,
    ELzmaStatus *status, ISzAlloc *alloc);
22525 
22526 // LzmaDec.c -- LZMA Decoder
22527 // 2009-09-20 : Igor Pavlov : Public domain
22528 
// ---- Range decoder core ----
#define kNumTopBits 24
#define kTopValue ((uint32_t)1 << kNumTopBits)

#define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumBitModelTotalBits)
#define kNumMoveBits 5

#define RC_INIT_SIZE 5

// Re-fill the range coder whenever its range drops below 2^24; consumes one input byte.
#define NORMALIZE if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); }

// Decode one bit with adaptive probability *(p): IF_BIT_0 tests the bit,
// then UPDATE_0 / UPDATE_1 narrow the range and move the probability towards
// the observed outcome by 1/2^kNumMoveBits of the remaining distance.
// These macros expect locals named ttt, bound, range, code and buf in scope.
#define IF_BIT_0(p) ttt = *(p); NORMALIZE; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound)
#define UPDATE_0(p) range = bound; *(p) = (CLzmaProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
#define UPDATE_1(p) range -= bound; code -= bound; *(p) = (CLzmaProb)(ttt - (ttt >> kNumMoveBits));
#define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \
  { UPDATE_0(p); i = (i + i); A0; } else \
  { UPDATE_1(p); i = (i + i) + 1; A1; }
#define GET_BIT(p, i) GET_BIT2(p, i, ; , ;)
22547 
// Bit-tree decoding: bits are read MSB-first while i walks an implicit
// binary tree stored in probs[1 .. 2*limit-1]; the decoded symbol is left in i.
#define TREE_GET_BIT(probs, i) { GET_BIT((probs + i), i); }
#define TREE_DECODE(probs, limit, i) \
  { i = 1; do { TREE_GET_BIT(probs, i); } while (i < limit); i -= limit; }

/* #define _LZMA_SIZE_OPT */

#ifdef _LZMA_SIZE_OPT
#define TREE_6_DECODE(probs, i) TREE_DECODE(probs, (1 << 6), i)
#else
// Unrolled 6-level tree decode (for the 64-entry trees, e.g. position slots).
#define TREE_6_DECODE(probs, i) \
  { i = 1; \
  TREE_GET_BIT(probs, i); \
  TREE_GET_BIT(probs, i); \
  TREE_GET_BIT(probs, i); \
  TREE_GET_BIT(probs, i); \
  TREE_GET_BIT(probs, i); \
  TREE_GET_BIT(probs, i); \
  i -= 0x40; }
#endif
22567 
// "_CHECK" variants of the decoding macros: identical control flow, but they
// never write back to the probability model, and they bail out with
// DUMMY_ERROR when the input buffer (bufLimit) is exhausted — used to probe
// whether a complete symbol is available without committing any state.
#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); }

#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound)
#define UPDATE_0_CHECK range = bound;
#define UPDATE_1_CHECK range -= bound; code -= bound;
#define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \
  { UPDATE_0_CHECK; i = (i + i); A0; } else \
  { UPDATE_1_CHECK; i = (i + i) + 1; A1; }
#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;)
#define TREE_DECODE_CHECK(probs, limit, i) \
  { i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; }
22579 
// Position-state configuration (pb bits select one of up to 16 states).
#define kNumPosBitsMax 4
#define kNumPosStatesMax (1 << kNumPosBitsMax)

// Match lengths are coded in three zones (low/mid/high) selected by two choice bits.
#define kLenNumLowBits 3
#define kLenNumLowSymbols (1 << kLenNumLowBits)
#define kLenNumMidBits 3
#define kLenNumMidSymbols (1 << kLenNumMidBits)
#define kLenNumHighBits 8
#define kLenNumHighSymbols (1 << kLenNumHighBits)

// Offsets inside a length coder's probability block.
#define LenChoice 0
#define LenChoice2 (LenChoice + 1)
#define LenLow (LenChoice2 + 1)
#define LenMid (LenLow + (kNumPosStatesMax << kLenNumLowBits))
#define LenHigh (LenMid + (kNumPosStatesMax << kLenNumMidBits))
#define kNumLenProbs (LenHigh + kLenNumHighSymbols)

// LZMA state machine: 12 states, the first 7 follow a literal.
#define kNumStates 12
#define kNumLitStates 7

// Distance model configuration.
#define kStartPosModelIndex 4
#define kEndPosModelIndex 14
#define kNumFullDistances (1 << (kEndPosModelIndex >> 1))

#define kNumPosSlotBits 6
#define kNumLenToPosStates 4

#define kNumAlignBits 4
#define kAlignTableSize (1 << kNumAlignBits)

// Decoded lengths start at 2; values >= kMatchSpecLenStart are internal markers.
#define kMatchMinLen 2
#define kMatchSpecLenStart (kMatchMinLen + kLenNumLowSymbols + kLenNumMidSymbols + kLenNumHighSymbols)
22612 
// Offsets of the individual probability groups inside CLzmaDec::probs.
#define IsMatch 0
#define IsRep (IsMatch + (kNumStates << kNumPosBitsMax))
#define IsRepG0 (IsRep + kNumStates)
#define IsRepG1 (IsRepG0 + kNumStates)
#define IsRepG2 (IsRepG1 + kNumStates)
#define IsRep0Long (IsRepG2 + kNumStates)
#define PosSlot (IsRep0Long + (kNumStates << kNumPosBitsMax))
#define SpecPos (PosSlot + (kNumLenToPosStates << kNumPosSlotBits))
#define Align (SpecPos + kNumFullDistances - kEndPosModelIndex)
#define LenCoder (Align + kAlignTableSize)
#define RepLenCoder (LenCoder + kNumLenProbs)
#define Literal (RepLenCoder + kNumLenProbs)

// Fixed number of probabilities shared by all streams; the literal tables
// (LZMA_LIT_SIZE entries each) follow and their count depends on lc + lp.
#define LZMA_BASE_SIZE 1846
#define LZMA_LIT_SIZE 768

#define LzmaProps_GetNumProbs(p) ((uint32_t)LZMA_BASE_SIZE + (LZMA_LIT_SIZE << ((p)->lc + (p)->lp)))

// Compile-time consistency check: the computed Literal offset must equal the
// hard-coded LZMA_BASE_SIZE (otherwise the undeclared identifier below fails the build).
#if Literal != LZMA_BASE_SIZE
StopCompilingDueBUG
#endif

// Minimum dictionary size used by the decoder.
#define LZMA_DIC_MIN (1 << 12)
22636 
22637 /* First LZMA-symbol is always decoded.
22638 And it decodes new LZMA-symbols while (buf < bufLimit), but "buf" is without last normalization
22639 Out:
22640   Result:
22641     SZ_OK - OK
22642     SZ_ERROR_DATA - Error
22643   p->remainLen:
22644     < kMatchSpecLenStart : normal remain
22645     = kMatchSpecLenStart : finished
22646     = kMatchSpecLenStart + 1 : Flush marker
22647     = kMatchSpecLenStart + 2 : State Init Marker
22648 */
22649 
LzmaDec_DecodeReal(CLzmaDec * p,size_t limit,const uint8_t * bufLimit)22650 static int LzmaDec_DecodeReal(CLzmaDec *p, size_t limit, const uint8_t *bufLimit)
22651 {
22652   CLzmaProb *probs = p->probs;
22653 
22654   unsigned state = p->state;
22655   uint32_t rep0 = p->reps[0], rep1 = p->reps[1], rep2 = p->reps[2], rep3 = p->reps[3];
22656   unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
22657   unsigned lpMask = ((unsigned)1 << (p->prop.lp)) - 1;
22658   unsigned lc = p->prop.lc;
22659 
22660   uint8_t *dic = p->dic;
22661   size_t dicBufSize = p->dicBufSize;
22662   size_t dicPos = p->dicPos;
22663 
22664   uint32_t processedPos = p->processedPos;
22665   uint32_t checkDicSize = p->checkDicSize;
22666   unsigned len = 0;
22667 
22668   const uint8_t *buf = p->buf;
22669   uint32_t range = p->range;
22670   uint32_t code = p->code;
22671 
22672   do
22673   {
22674     CLzmaProb *prob;
22675     uint32_t bound;
22676     unsigned ttt;
22677     unsigned posState = processedPos & pbMask;
22678 
22679     prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
22680     IF_BIT_0(prob)
22681     {
22682       unsigned symbol;
22683       UPDATE_0(prob);
22684       prob = probs + Literal;
22685       if (checkDicSize != 0 || processedPos != 0)
22686         prob += (LZMA_LIT_SIZE * (((processedPos & lpMask) << lc) +
22687         (dic[(dicPos == 0 ? dicBufSize : dicPos) - 1] >> (8 - lc))));
22688 
22689       if (state < kNumLitStates)
22690       {
22691         state -= (state < 4) ? state : 3;
22692         symbol = 1;
22693         do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100);
22694       }
22695       else
22696       {
22697         unsigned matchByte = p->dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)];
22698         unsigned offs = 0x100;
22699         state -= (state < 10) ? 3 : 6;
22700         symbol = 1;
22701         do
22702         {
22703           unsigned bit;
22704           CLzmaProb *probLit;
22705           matchByte <<= 1;
22706           bit = (matchByte & offs);
22707           probLit = prob + offs + bit + symbol;
22708           GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit)
22709         }
22710         while (symbol < 0x100);
22711       }
22712       dic[dicPos++] = (uint8_t)symbol;
22713       processedPos++;
22714       continue;
22715     }
22716     else
22717     {
22718       UPDATE_1(prob);
22719       prob = probs + IsRep + state;
22720       IF_BIT_0(prob)
22721       {
22722         UPDATE_0(prob);
22723         state += kNumStates;
22724         prob = probs + LenCoder;
22725       }
22726       else
22727       {
22728         UPDATE_1(prob);
22729         if (checkDicSize == 0 && processedPos == 0)
22730           return SZ_ERROR_DATA;
22731         prob = probs + IsRepG0 + state;
22732         IF_BIT_0(prob)
22733         {
22734           UPDATE_0(prob);
22735           prob = probs + IsRep0Long + (state << kNumPosBitsMax) + posState;
22736           IF_BIT_0(prob)
22737           {
22738             UPDATE_0(prob);
22739             dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)];
22740             dicPos++;
22741             processedPos++;
22742             state = state < kNumLitStates ? 9 : 11;
22743             continue;
22744           }
22745           UPDATE_1(prob);
22746         }
22747         else
22748         {
22749           uint32_t distance;
22750           UPDATE_1(prob);
22751           prob = probs + IsRepG1 + state;
22752           IF_BIT_0(prob)
22753           {
22754             UPDATE_0(prob);
22755             distance = rep1;
22756           }
22757           else
22758           {
22759             UPDATE_1(prob);
22760             prob = probs + IsRepG2 + state;
22761             IF_BIT_0(prob)
22762             {
22763               UPDATE_0(prob);
22764               distance = rep2;
22765             }
22766             else
22767             {
22768               UPDATE_1(prob);
22769               distance = rep3;
22770               rep3 = rep2;
22771             }
22772             rep2 = rep1;
22773           }
22774           rep1 = rep0;
22775           rep0 = distance;
22776         }
22777         state = state < kNumLitStates ? 8 : 11;
22778         prob = probs + RepLenCoder;
22779       }
22780       {
22781         unsigned limit, offset;
22782         CLzmaProb *probLen = prob + LenChoice;
22783         IF_BIT_0(probLen)
22784         {
22785           UPDATE_0(probLen);
22786           probLen = prob + LenLow + (posState << kLenNumLowBits);
22787           offset = 0;
22788           limit = (1 << kLenNumLowBits);
22789         }
22790         else
22791         {
22792           UPDATE_1(probLen);
22793           probLen = prob + LenChoice2;
22794           IF_BIT_0(probLen)
22795           {
22796             UPDATE_0(probLen);
22797             probLen = prob + LenMid + (posState << kLenNumMidBits);
22798             offset = kLenNumLowSymbols;
22799             limit = (1 << kLenNumMidBits);
22800           }
22801           else
22802           {
22803             UPDATE_1(probLen);
22804             probLen = prob + LenHigh;
22805             offset = kLenNumLowSymbols + kLenNumMidSymbols;
22806             limit = (1 << kLenNumHighBits);
22807           }
22808         }
22809         TREE_DECODE(probLen, limit, len);
22810         len += offset;
22811       }
22812 
22813       if (state >= kNumStates)
22814       {
22815         uint32_t distance;
22816         prob = probs + PosSlot +
22817             ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
22818         TREE_6_DECODE(prob, distance);
22819         if (distance >= kStartPosModelIndex)
22820         {
22821           unsigned posSlot = (unsigned)distance;
22822           int numDirectBits = (int)(((distance >> 1) - 1));
22823           distance = (2 | (distance & 1));
22824           if (posSlot < kEndPosModelIndex)
22825           {
22826             distance <<= numDirectBits;
22827             prob = probs + SpecPos + distance - posSlot - 1;
22828             {
22829               uint32_t mask = 1;
22830               unsigned i = 1;
22831               do
22832               {
22833                 GET_BIT2(prob + i, i, ; , distance |= mask);
22834                 mask <<= 1;
22835               }
22836               while (--numDirectBits != 0);
22837             }
22838           }
22839           else
22840           {
22841             numDirectBits -= kNumAlignBits;
22842             do
22843             {
22844               NORMALIZE
22845               range >>= 1;
22846 
22847               {
22848                 uint32_t t;
22849                 code -= range;
22850                 t = (0 - ((uint32_t)code >> 31)); /* (uint32_t)((int32_t)code >> 31) */
22851                 distance = (distance << 1) + (t + 1);
22852                 code += range & t;
22853               }
22854               /*
22855               distance <<= 1;
22856               if (code >= range)
22857               {
22858                 code -= range;
22859                 distance |= 1;
22860               }
22861               */
22862             }
22863             while (--numDirectBits != 0);
22864             prob = probs + Align;
22865             distance <<= kNumAlignBits;
22866             {
22867               unsigned i = 1;
22868               GET_BIT2(prob + i, i, ; , distance |= 1);
22869               GET_BIT2(prob + i, i, ; , distance |= 2);
22870               GET_BIT2(prob + i, i, ; , distance |= 4);
22871               GET_BIT2(prob + i, i, ; , distance |= 8);
22872             }
22873             if (distance == (uint32_t)0xFFFFFFFF)
22874             {
22875               len += kMatchSpecLenStart;
22876               state -= kNumStates;
22877               break;
22878             }
22879           }
22880         }
22881         rep3 = rep2;
22882         rep2 = rep1;
22883         rep1 = rep0;
22884         rep0 = distance + 1;
22885         if (checkDicSize == 0)
22886         {
22887           if (distance >= processedPos)
22888             return SZ_ERROR_DATA;
22889         }
22890         else if (distance >= checkDicSize)
22891           return SZ_ERROR_DATA;
22892         state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
22893       }
22894 
22895       len += kMatchMinLen;
22896 
22897       if (limit == dicPos)
22898         return SZ_ERROR_DATA;
22899       {
22900         size_t rem = limit - dicPos;
22901         unsigned curLen = ((rem < len) ? (unsigned)rem : len);
22902         size_t pos = (dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0);
22903 
22904         processedPos += curLen;
22905 
22906         len -= curLen;
22907         if (pos + curLen <= dicBufSize)
22908         {
22909           uint8_t *dest = dic + dicPos;
22910           ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
22911           const uint8_t *lim = dest + curLen;
22912           dicPos += curLen;
22913           do
22914             *(dest) = (uint8_t)*(dest + src);
22915           while (++dest != lim);
22916         }
22917         else
22918         {
22919           do
22920           {
22921             dic[dicPos++] = dic[pos];
22922             if (++pos == dicBufSize)
22923               pos = 0;
22924           }
22925           while (--curLen != 0);
22926         }
22927       }
22928     }
22929   }
22930   while (dicPos < limit && buf < bufLimit);
22931   NORMALIZE;
22932   p->buf = buf;
22933   p->range = range;
22934   p->code = code;
22935   p->remainLen = len;
22936   p->dicPos = dicPos;
22937   p->processedPos = processedPos;
22938   p->reps[0] = rep0;
22939   p->reps[1] = rep1;
22940   p->reps[2] = rep2;
22941   p->reps[3] = rep3;
22942   p->state = state;
22943 
22944   return SZ_OK;
22945 }
22946 
LzmaDec_WriteRem(CLzmaDec * p,size_t limit)22947 static void LzmaDec_WriteRem(CLzmaDec *p, size_t limit)
22948 {
22949   if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart)
22950   {
22951     uint8_t *dic = p->dic;
22952     size_t dicPos = p->dicPos;
22953     size_t dicBufSize = p->dicBufSize;
22954     unsigned len = p->remainLen;
22955     uint32_t rep0 = p->reps[0];
22956     if (limit - dicPos < len)
22957       len = (unsigned)(limit - dicPos);
22958 
22959     if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len)
22960       p->checkDicSize = p->prop.dicSize;
22961 
22962     p->processedPos += len;
22963     p->remainLen -= len;
22964     while (len-- != 0)
22965     {
22966       dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)];
22967       dicPos++;
22968     }
22969     p->dicPos = dicPos;
22970   }
22971 }
22972 
LzmaDec_DecodeReal2(CLzmaDec * p,size_t limit,const uint8_t * bufLimit)22973 static int LzmaDec_DecodeReal2(CLzmaDec *p, size_t limit, const uint8_t *bufLimit)
22974 {
22975   do
22976   {
22977     size_t limit2 = limit;
22978     if (p->checkDicSize == 0)
22979     {
22980       uint32_t rem = p->prop.dicSize - p->processedPos;
22981       if (limit - p->dicPos > rem)
22982         limit2 = p->dicPos + rem;
22983     }
22984     RINOK(LzmaDec_DecodeReal(p, limit2, bufLimit));
22985     if (p->processedPos >= p->prop.dicSize)
22986       p->checkDicSize = p->prop.dicSize;
22987     LzmaDec_WriteRem(p, limit);
22988   }
22989   while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart);
22990 
22991   if (p->remainLen > kMatchSpecLenStart)
22992   {
22993     p->remainLen = kMatchSpecLenStart;
22994   }
22995   return 0;
22996 }
22997 
/* Result of the speculative (dummy) decode in LzmaDec_TryDummy. */
enum ELzmaDummy
{
  DUMMY_ERROR, /* unexpected end of input stream */
  DUMMY_LIT,   /* a literal would be decoded */
  DUMMY_MATCH, /* a match with a newly coded distance would be decoded */
  DUMMY_REP    /* a repeated-distance match would be decoded */
};
23005 
/* Speculatively decodes exactly one symbol from buf/inSize WITHOUT mutating
   the decoder: the *_CHECK macro variants read probabilities but never
   update them, and the dictionary is untouched.  Used by LzmaDec_DecodeToDic
   to decide whether the buffered input suffices for a whole symbol.
   Returns DUMMY_ERROR if the input ran out mid-symbol, otherwise which kind
   of symbol would be decoded. */
static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const uint8_t *buf, size_t inSize)
{
  uint32_t range = p->range;
  uint32_t code = p->code;
  const uint8_t *bufLimit = buf + inSize;
  CLzmaProb *probs = p->probs;
  unsigned state = p->state;
  ELzmaDummy res;

  {
    CLzmaProb *prob;
    uint32_t bound;
    unsigned ttt;
    unsigned posState = (p->processedPos) & ((1 << p->prop.pb) - 1);

    prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
    IF_BIT_0_CHECK(prob)
    {
      UPDATE_0_CHECK

      /* if (bufLimit - buf >= 7) return DUMMY_LIT; */

      /* Literal: select the probability table from position (lp bits) and
         the previous byte (lc bits), then decode the 8 symbol bits. */
      prob = probs + Literal;
      if (p->checkDicSize != 0 || p->processedPos != 0)
        prob += (LZMA_LIT_SIZE *
          ((((p->processedPos) & ((1 << (p->prop.lp)) - 1)) << p->prop.lc) +
          (p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc))));

      if (state < kNumLitStates)
      {
        unsigned symbol = 1;
        do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100);
      }
      else
      {
        /* After a match, the literal is decoded relative to the byte at
           distance reps[0] (the "match byte"). */
        unsigned matchByte = p->dic[p->dicPos - p->reps[0] +
            ((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)];
        unsigned offs = 0x100;
        unsigned symbol = 1;
        do
        {
          unsigned bit;
          CLzmaProb *probLit;
          matchByte <<= 1;
          bit = (matchByte & offs);
          probLit = prob + offs + bit + symbol;
          GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit)
        }
        while (symbol < 0x100);
      }
      res = DUMMY_LIT;
    }
    else
    {
      unsigned len;
      UPDATE_1_CHECK;

      prob = probs + IsRep + state;
      IF_BIT_0_CHECK(prob)
      {
        /* New match: a length and a freshly coded distance follow. */
        UPDATE_0_CHECK;
        state = 0;
        prob = probs + LenCoder;
        res = DUMMY_MATCH;
      }
      else
      {
        /* Repeated match: one of the four stored distances is reused. */
        UPDATE_1_CHECK;
        res = DUMMY_REP;
        prob = probs + IsRepG0 + state;
        IF_BIT_0_CHECK(prob)
        {
          UPDATE_0_CHECK;
          prob = probs + IsRep0Long + (state << kNumPosBitsMax) + posState;
          IF_BIT_0_CHECK(prob)
          {
            /* "Short rep": single byte at distance rep0 — symbol complete. */
            UPDATE_0_CHECK;
            NORMALIZE_CHECK;
            return DUMMY_REP;
          }
          else
          {
            UPDATE_1_CHECK;
          }
        }
        else
        {
          UPDATE_1_CHECK;
          prob = probs + IsRepG1 + state;
          IF_BIT_0_CHECK(prob)
          {
            UPDATE_0_CHECK;
          }
          else
          {
            UPDATE_1_CHECK;
            prob = probs + IsRepG2 + state;
            IF_BIT_0_CHECK(prob)
            {
              UPDATE_0_CHECK;
            }
            else
            {
              UPDATE_1_CHECK;
            }
          }
        }
        state = kNumStates;
        prob = probs + RepLenCoder;
      }
      {
        /* Length decoding: pick the low/mid/high length tree. */
        unsigned limit, offset;
        CLzmaProb *probLen = prob + LenChoice;
        IF_BIT_0_CHECK(probLen)
        {
          UPDATE_0_CHECK;
          probLen = prob + LenLow + (posState << kLenNumLowBits);
          offset = 0;
          limit = 1 << kLenNumLowBits;
        }
        else
        {
          UPDATE_1_CHECK;
          probLen = prob + LenChoice2;
          IF_BIT_0_CHECK(probLen)
          {
            UPDATE_0_CHECK;
            probLen = prob + LenMid + (posState << kLenNumMidBits);
            offset = kLenNumLowSymbols;
            limit = 1 << kLenNumMidBits;
          }
          else
          {
            UPDATE_1_CHECK;
            probLen = prob + LenHigh;
            offset = kLenNumLowSymbols + kLenNumMidSymbols;
            limit = 1 << kLenNumHighBits;
          }
        }
        TREE_DECODE_CHECK(probLen, limit, len);
        len += offset;
      }

      if (state < 4)
      {
        /* New match (state was set to 0 above): decode the distance slot
           and any direct/align bits so the input-length check is exact. */
        unsigned posSlot;
        prob = probs + PosSlot +
            ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) <<
            kNumPosSlotBits);
        TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot);
        if (posSlot >= kStartPosModelIndex)
        {
          int numDirectBits = ((posSlot >> 1) - 1);

          /* if (bufLimit - buf >= 8) return DUMMY_MATCH; */

          if (posSlot < kEndPosModelIndex)
          {
            prob = probs + SpecPos + ((2 | (posSlot & 1)) << numDirectBits) - posSlot - 1;
          }
          else
          {
            /* Direct bits come straight from the range coder. */
            numDirectBits -= kNumAlignBits;
            do
            {
              NORMALIZE_CHECK
              range >>= 1;
              code -= range & (((code - range) >> 31) - 1);
              /* if (code >= range) code -= range; */
            }
            while (--numDirectBits != 0);
            prob = probs + Align;
            numDirectBits = kNumAlignBits;
          }
          {
            unsigned i = 1;
            do
            {
              GET_BIT_CHECK(prob + i, i);
            }
            while (--numDirectBits != 0);
          }
        }
      }
    }
  }
  NORMALIZE_CHECK;
  return res;
}
23195 
LzmaDec_InitRc(CLzmaDec * p,const uint8_t * data)23196 static void LzmaDec_InitRc(CLzmaDec *p, const uint8_t *data)
23197 {
23198   p->code = ((uint32_t)data[1] << 24) | ((uint32_t)data[2] << 16) | ((uint32_t)data[3] << 8) | ((uint32_t)data[4]);
23199   p->range = 0xFFFFFFFF;
23200   p->needFlush = 0;
23201 }
23202 
LzmaDec_InitDicAndState(CLzmaDec * p,bool initDic,bool initState)23203 void LzmaDec_InitDicAndState(CLzmaDec *p, bool initDic, bool initState)
23204 {
23205   p->needFlush = 1;
23206   p->remainLen = 0;
23207   p->tempBufSize = 0;
23208 
23209   if (initDic)
23210   {
23211     p->processedPos = 0;
23212     p->checkDicSize = 0;
23213     p->needInitState = 1;
23214   }
23215   if (initState)
23216     p->needInitState = 1;
23217 }
23218 
/* Fully resets the decoder: dictionary position, state machine, rep
   distances and range coder are all (lazily) re-initialized. */
void LzmaDec_Init(CLzmaDec *p)
{
  p->dicPos = 0;
  LzmaDec_InitDicAndState(p, true, true);
}
23224 
LzmaDec_InitStateReal(CLzmaDec * p)23225 static void LzmaDec_InitStateReal(CLzmaDec *p)
23226 {
23227   uint32_t numProbs = Literal + ((uint32_t)LZMA_LIT_SIZE << (p->prop.lc + p->prop.lp));
23228   uint32_t i;
23229   CLzmaProb *probs = p->probs;
23230   for (i = 0; i < numProbs; i++)
23231     probs[i] = kBitModelTotal >> 1;
23232   p->reps[0] = p->reps[1] = p->reps[2] = p->reps[3] = 1;
23233   p->state = 0;
23234   p->needInitState = 0;
23235 }
23236 
/* Decodes from src (at most *srcLen bytes) into the decoder's dictionary,
   up to dicLimit.  On return *srcLen holds the number of input bytes
   consumed and *status describes why decoding stopped; finishMode selects
   whether the caller expects the stream to end exactly at dicLimit.
   Input too short to decode a whole symbol is stashed in tempBuf and
   *status becomes LZMA_STATUS_NEEDS_MORE_INPUT. */
SRes LzmaDec_DecodeToDic(CLzmaDec *p, size_t dicLimit, const uint8_t *src, size_t *srcLen,
    ELzmaFinishMode finishMode, ELzmaStatus *status)
{
  size_t inSize = *srcLen;
  (*srcLen) = 0;
  LzmaDec_WriteRem(p, dicLimit);

  *status = LZMA_STATUS_NOT_SPECIFIED;

  /* remainLen == kMatchSpecLenStart marks "end-of-stream marker decoded". */
  while (p->remainLen != kMatchSpecLenStart)
  {
      int checkEndMarkNow;

      if (p->needFlush != 0)
      {
        /* Accumulate the RC_INIT_SIZE range-coder header bytes first. */
        for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--)
          p->tempBuf[p->tempBufSize++] = *src++;
        if (p->tempBufSize < RC_INIT_SIZE)
        {
          *status = LZMA_STATUS_NEEDS_MORE_INPUT;
          return SZ_OK;
        }
        /* A valid LZMA stream starts with a zero byte. */
        if (p->tempBuf[0] != 0)
          return SZ_ERROR_DATA;

        LzmaDec_InitRc(p, p->tempBuf);
        p->tempBufSize = 0;
      }

      checkEndMarkNow = 0;
      if (p->dicPos >= dicLimit)
      {
        if (p->remainLen == 0 && p->code == 0)
        {
          *status = LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK;
          return SZ_OK;
        }
        if (finishMode == LZMA_FINISH_ANY)
        {
          *status = LZMA_STATUS_NOT_FINISHED;
          return SZ_OK;
        }
        if (p->remainLen != 0)
        {
          *status = LZMA_STATUS_NOT_FINISHED;
          return SZ_ERROR_DATA;
        }
        /* Output is full but the caller demands the end mark: only an
           end-of-stream marker may legally follow. */
        checkEndMarkNow = 1;
      }

      if (p->needInitState)
        LzmaDec_InitStateReal(p);

      if (p->tempBufSize == 0)
      {
        /* No leftover partial symbol: decode directly from src. */
        size_t processed;
        const uint8_t *bufLimit;
        if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow)
        {
          /* Too little input to guarantee a whole symbol: dry-run first. */
          int dummyRes = LzmaDec_TryDummy(p, src, inSize);
          if (dummyRes == DUMMY_ERROR)
          {
            memcpy(p->tempBuf, src, inSize);
            p->tempBufSize = (unsigned)inSize;
            (*srcLen) += inSize;
            *status = LZMA_STATUS_NEEDS_MORE_INPUT;
            return SZ_OK;
          }
          if (checkEndMarkNow && dummyRes != DUMMY_MATCH)
          {
            *status = LZMA_STATUS_NOT_FINISHED;
            return SZ_ERROR_DATA;
          }
          bufLimit = src;
        }
        else
          bufLimit = src + inSize - LZMA_REQUIRED_INPUT_MAX;
        p->buf = src;
        if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0)
          return SZ_ERROR_DATA;
        processed = (size_t)(p->buf - src);
        (*srcLen) += processed;
        src += processed;
        inSize -= processed;
      }
      else
      {
        /* Finish the partial symbol stashed in tempBuf, topping it up
           from src first. */
        unsigned rem = p->tempBufSize, lookAhead = 0;
        while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize)
          p->tempBuf[rem++] = src[lookAhead++];
        p->tempBufSize = rem;
        if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow)
        {
          int dummyRes = LzmaDec_TryDummy(p, p->tempBuf, rem);
          if (dummyRes == DUMMY_ERROR)
          {
            (*srcLen) += lookAhead;
            *status = LZMA_STATUS_NEEDS_MORE_INPUT;
            return SZ_OK;
          }
          if (checkEndMarkNow && dummyRes != DUMMY_MATCH)
          {
            *status = LZMA_STATUS_NOT_FINISHED;
            return SZ_ERROR_DATA;
          }
        }
        p->buf = p->tempBuf;
        if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0)
          return SZ_ERROR_DATA;
        /* Only count the src bytes actually consumed beyond the old tempBuf. */
        lookAhead -= (rem - (unsigned)(p->buf - p->tempBuf));
        (*srcLen) += lookAhead;
        src += lookAhead;
        inSize -= lookAhead;
        p->tempBufSize = 0;
      }
  }
  /* End-of-stream marker reached: a zero code confirms a well-formed stream. */
  if (p->code == 0)
    *status = LZMA_STATUS_FINISHED_WITH_MARK;
  return (p->code == 0) ? SZ_OK : SZ_ERROR_DATA;
}
23357 
/* Convenience wrapper: decodes into the caller's dest buffer by running
   LzmaDec_DecodeToDic against the internal circular dictionary and copying
   the newly produced bytes out after each pass.  *srcLen and *destLen are
   in-out: capacities on entry, bytes consumed/produced on return. */
SRes LzmaDec_DecodeToBuf(CLzmaDec *p, uint8_t *dest, size_t *destLen, const uint8_t *src, size_t *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status)
{
  size_t outSize = *destLen;
  size_t inSize = *srcLen;
  *srcLen = *destLen = 0;
  for (;;)
  {
    size_t inSizeCur = inSize, outSizeCur, dicPos;
    ELzmaFinishMode curFinishMode;
    SRes res;
    if (p->dicPos == p->dicBufSize)
      p->dicPos = 0;  /* dictionary full: wrap around */
    dicPos = p->dicPos;
    if (outSize > p->dicBufSize - dicPos)
    {
      /* More output requested than fits before the wrap point: decode up
         to the end of the dictionary and loop for the rest. */
      outSizeCur = p->dicBufSize;
      curFinishMode = LZMA_FINISH_ANY;
    }
    else
    {
      outSizeCur = dicPos + outSize;
      curFinishMode = finishMode;
    }

    res = LzmaDec_DecodeToDic(p, outSizeCur, src, &inSizeCur, curFinishMode, status);
    src += inSizeCur;
    inSize -= inSizeCur;
    *srcLen += inSizeCur;
    outSizeCur = p->dicPos - dicPos;  /* bytes produced in this pass */
    memcpy(dest, p->dic + dicPos, outSizeCur);
    dest += outSizeCur;
    outSize -= outSizeCur;
    *destLen += outSizeCur;
    if (res != 0)
      return res;
    if (outSizeCur == 0 || outSize == 0)
      return SZ_OK;
  }
}
23397 
/* Releases the probability model; safe to call when already freed
   (ISzAlloc::Free accepts 0). */
void LzmaDec_FreeProbs(CLzmaDec *p, ISzAlloc *alloc)
{
  alloc->Free(alloc, p->probs);
  p->probs = 0;
}
23403 
/* Releases the dictionary buffer; safe to call when already freed. */
static void LzmaDec_FreeDict(CLzmaDec *p, ISzAlloc *alloc)
{
  alloc->Free(alloc, p->dic);
  p->dic = 0;
}
23409 
/* Releases everything the decoder allocated (probability model and
   dictionary). */
void LzmaDec_Free(CLzmaDec *p, ISzAlloc *alloc)
{
  LzmaDec_FreeProbs(p, alloc);
  LzmaDec_FreeDict(p, alloc);
}
23415 
LzmaProps_Decode(CLzmaProps * p,const uint8_t * data,unsigned size)23416 SRes LzmaProps_Decode(CLzmaProps *p, const uint8_t *data, unsigned size)
23417 {
23418   uint32_t dicSize;
23419   uint8_t d;
23420 
23421   if (size < LZMA_PROPS_SIZE)
23422     return SZ_ERROR_UNSUPPORTED;
23423   else
23424     dicSize = data[1] | ((uint32_t)data[2] << 8) | ((uint32_t)data[3] << 16) | ((uint32_t)data[4] << 24);
23425 
23426   if (dicSize < LZMA_DIC_MIN)
23427     dicSize = LZMA_DIC_MIN;
23428   p->dicSize = dicSize;
23429 
23430   d = data[0];
23431   if (d >= (9 * 5 * 5))
23432     return SZ_ERROR_UNSUPPORTED;
23433 
23434   p->lc = d % 9;
23435   d /= 9;
23436   p->pb = d / 5;
23437   p->lp = d % 5;
23438 
23439   return SZ_OK;
23440 }
23441 
LzmaDec_AllocateProbs2(CLzmaDec * p,const CLzmaProps * propNew,ISzAlloc * alloc)23442 static SRes LzmaDec_AllocateProbs2(CLzmaDec *p, const CLzmaProps *propNew, ISzAlloc *alloc)
23443 {
23444   uint32_t numProbs = LzmaProps_GetNumProbs(propNew);
23445   if (p->probs == 0 || numProbs != p->numProbs)
23446   {
23447     LzmaDec_FreeProbs(p, alloc);
23448     p->probs = (CLzmaProb *)alloc->Alloc(alloc, numProbs * sizeof(CLzmaProb));
23449     p->numProbs = numProbs;
23450     if (p->probs == 0)
23451       return SZ_ERROR_MEM;
23452   }
23453   return SZ_OK;
23454 }
23455 
LzmaDec_AllocateProbs(CLzmaDec * p,const uint8_t * props,unsigned propsSize,ISzAlloc * alloc)23456 SRes LzmaDec_AllocateProbs(CLzmaDec *p, const uint8_t *props, unsigned propsSize, ISzAlloc *alloc)
23457 {
23458   CLzmaProps propNew;
23459   RINOK(LzmaProps_Decode(&propNew, props, propsSize));
23460   RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc));
23461   p->prop = propNew;
23462   return SZ_OK;
23463 }
23464 
LzmaDec_Allocate(CLzmaDec * p,const uint8_t * props,unsigned propsSize,ISzAlloc * alloc)23465 SRes LzmaDec_Allocate(CLzmaDec *p, const uint8_t *props, unsigned propsSize, ISzAlloc *alloc)
23466 {
23467   CLzmaProps propNew;
23468   size_t dicBufSize;
23469   RINOK(LzmaProps_Decode(&propNew, props, propsSize));
23470   RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc));
23471   dicBufSize = propNew.dicSize;
23472   if (p->dic == 0 || dicBufSize != p->dicBufSize)
23473   {
23474     LzmaDec_FreeDict(p, alloc);
23475     p->dic = (uint8_t *)alloc->Alloc(alloc, dicBufSize);
23476     if (p->dic == 0)
23477     {
23478       LzmaDec_FreeProbs(p, alloc);
23479       return SZ_ERROR_MEM;
23480     }
23481   }
23482   p->dicBufSize = dicBufSize;
23483   p->prop = propNew;
23484   return SZ_OK;
23485 }
23486 
/* One-shot decode: decompresses src into dest using a temporary decoder
   whose dictionary IS the destination buffer (no separate dictionary
   memory is allocated).  *srcLen and *destLen are in-out: capacities on
   entry, bytes consumed/produced on return. */
SRes LzmaDecode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t *srcLen,
    const uint8_t *propData, unsigned propSize, ELzmaFinishMode finishMode,
    ELzmaStatus *status, ISzAlloc *alloc)
{
  CLzmaDec p;
  SRes res;
  size_t inSize = *srcLen;
  size_t outSize = *destLen;
  *srcLen = *destLen = 0;
  if (inSize < RC_INIT_SIZE)
    return SZ_ERROR_INPUT_EOF;

  LzmaDec_Construct(&p);
  res = LzmaDec_AllocateProbs(&p, propData, propSize, alloc);
  if (res != 0)
    return res;
  /* Decode straight into dest by using it as the dictionary buffer;
     dest is caller-owned, so only the probs are freed below. */
  p.dic = dest;
  p.dicBufSize = outSize;

  LzmaDec_Init(&p);

  *srcLen = inSize;
  res = LzmaDec_DecodeToDic(&p, outSize, src, srcLen, finishMode, status);

  /* A one-shot call cannot supply more input later. */
  if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT)
    res = SZ_ERROR_INPUT_EOF;

  (*destLen) = p.dicPos;
  LzmaDec_FreeProbs(&p, alloc);
  return res;
}
23518 
23519 } // namespace lzma
23520 // End of LZMA compression library by Igor Pavlov
23521 
23522 #ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H
23523 #define UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H
// Allocation hook for the bundled LZMA code: ignores the context pointer and
// draws memory from the C++ heap.
static void *LzmaAlloc(void* /*p*/, size_t size) { return static_cast<void*>(new char[size]); }
// Matching release hook: address must come from LzmaAlloc (or be null,
// which delete[] treats as a no-op).
static void LzmaFree(void* /*p*/, void *address) { delete[] static_cast<char*>(address); }
// Allocator vtable handed to the LZMA decode/encode routines in this file.
static lzma::ISzAlloc lzmaAllocator = { LzmaAlloc, LzmaFree };
23527 #endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H
23528 
// Reads one compressed chunk from `is` and decompresses it into `data`.
// On-disk layout (raw little-endian integers, per the static_assert in
// common.h): uint32 uncompressed_len, uint32 compressed_len, uint32
// poor_crc, LZMA_PROPS_SIZE property bytes, then the LZMA stream itself.
// Returns false on any read failure, checksum mismatch or decode error.
bool compressor::load(istream& is, binary_decoder& data) {
  uint32_t uncompressed_len, compressed_len, poor_crc;
  unsigned char props_encoded[LZMA_PROPS_SIZE];

  if (!is.read((char *) &uncompressed_len, sizeof(uncompressed_len))) return false;
  if (!is.read((char *) &compressed_len, sizeof(compressed_len))) return false;
  if (!is.read((char *) &poor_crc, sizeof(poor_crc))) return false;
  // "poor" checksum: a fixed linear combination of the two lengths, not a
  // CRC of the payload — it only guards against header corruption.
  if (poor_crc != uncompressed_len * 19991 + compressed_len * 199999991 + 1234567890) return false;
  if (!is.read((char *) props_encoded, sizeof(props_encoded))) return false;

  vector<unsigned char> compressed(compressed_len);
  if (!is.read((char *) compressed.data(), compressed_len)) return false;

  lzma::ELzmaStatus status;
  size_t uncompressed_size = uncompressed_len, compressed_size = compressed_len;
  // data.fill(n) presumably returns a writable n-byte buffer inside the
  // decoder that LzmaDecode writes into directly — confirm in binary_decoder.
  auto res = lzma::LzmaDecode(data.fill(uncompressed_len), &uncompressed_size, compressed.data(), &compressed_size, props_encoded, LZMA_PROPS_SIZE, lzma::LZMA_FINISH_ANY, &status, &lzmaAllocator);
  if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false;

  return true;
}
23549 
23550 } // namespace utils
23551 
23552 /////////
23553 // File: utils/compressor_save.cpp
23554 /////////
23555 
23556 // This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>.
23557 //
23558 // Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
23559 // Mathematics and Physics, Charles University in Prague, Czech Republic.
23560 //
23561 // This Source Code Form is subject to the terms of the Mozilla Public
23562 // License, v. 2.0. If a copy of the MPL was not distributed with this
23563 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
23564 
23565 namespace utils {
23566 
23567 // Start of LZMA compression library by Igor Pavlov
23568 namespace lzma {
23569 
23570 // Types.h -- Basic types
23571 // 2010-10-09 : Igor Pavlov : Public domain
23572 #ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H
23573 #define UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H
23574 
23575 #define SZ_OK 0
23576 
23577 #define SZ_ERROR_DATA 1
23578 #define SZ_ERROR_MEM 2
23579 #define SZ_ERROR_CRC 3
23580 #define SZ_ERROR_UNSUPPORTED 4
23581 #define SZ_ERROR_PARAM 5
23582 #define SZ_ERROR_INPUT_EOF 6
23583 #define SZ_ERROR_OUTPUT_EOF 7
23584 #define SZ_ERROR_READ 8
23585 #define SZ_ERROR_WRITE 9
23586 #define SZ_ERROR_PROGRESS 10
23587 #define SZ_ERROR_FAIL 11
23588 #define SZ_ERROR_THREAD 12
23589 
23590 #define SZ_ERROR_ARCHIVE 16
23591 #define SZ_ERROR_NO_ARCHIVE 17
23592 
23593 typedef int SRes;
23594 
23595 #ifndef RINOK
23596 #define RINOK(x) { int __result__ = (x); if (__result__ != 0) return __result__; }
23597 #endif
23598 
23599 /* The following interfaces use first parameter as pointer to structure */
23600 
struct IByteIn
{
  uint8_t (*Read)(void *p); /* reads one byte, returns 0 in case of EOF or error */
};

struct IByteOut
{
  void (*Write)(void *p, uint8_t b); /* writes one byte */
};

struct ISeqInStream
{
  SRes (*Read)(void *p, void *buf, size_t *size);
    /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream.
       (output(*size) < input(*size)) is allowed */
};

/* it can return SZ_ERROR_INPUT_EOF */
SRes SeqInStream_Read(ISeqInStream *stream, void *buf, size_t size);
SRes SeqInStream_Read2(ISeqInStream *stream, void *buf, size_t size, SRes errorType);
SRes SeqInStream_ReadByte(ISeqInStream *stream, uint8_t *buf);

struct ISeqOutStream
{
  size_t (*Write)(void *p, const void *buf, size_t size);
    /* Returns: result - the number of actually written bytes.
       (result < size) means error */
};

/* Seek origins, mirroring SEEK_SET/SEEK_CUR/SEEK_END. */
enum ESzSeek
{
  SZ_SEEK_SET = 0,
  SZ_SEEK_CUR = 1,
  SZ_SEEK_END = 2
};
23636 
23637 struct ISeekInStream
23638 {
23639   SRes (*Read)(void *p, void *buf, size_t *size);  /* same as ISeqInStream::Read */
23640   SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin);
23641 };
23642 
// Input stream with lookahead: Look exposes buffered data without consuming
// it; Skip consumes previously looked-at bytes; Read/Seek bypass the buffer.
struct ILookInStream
{
  SRes (*Look)(void *p, const void **buf, size_t *size);
    /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream.
       (output(*size) > input(*size)) is not allowed
       (output(*size) < input(*size)) is allowed */
  SRes (*Skip)(void *p, size_t offset);
    /* offset must be <= output(*size) of Look */

  SRes (*Read)(void *p, void *buf, size_t *size);
    /* reads directly (without buffer). It's same as ISeqInStream::Read */
  SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin);
};
23656 
23657 SRes LookInStream_LookRead(ILookInStream *stream, void *buf, size_t *size);
23658 SRes LookInStream_SeekTo(ILookInStream *stream, uint64_t offset);
23659 
23660 /* reads via ILookInStream::Read */
23661 SRes LookInStream_Read2(ILookInStream *stream, void *buf, size_t size, SRes errorType);
23662 SRes LookInStream_Read(ILookInStream *stream, void *buf, size_t size);
23663 
23664 #define LookToRead_BUF_SIZE (1 << 14)
23665 
// Buffering adapter that presents an ISeekInStream as an ILookInStream;
// buf[pos..size) holds data already fetched from realStream.
struct CLookToRead
{
  ILookInStream s;  // embedded vtable; the interface pointer is the struct pointer (see note above)
  ISeekInStream *realStream;
  size_t pos;   // read offset within buf
  size_t size;  // valid bytes in buf
  uint8_t buf[LookToRead_BUF_SIZE];
};
23674 
23675 void LookToRead_CreateVTable(CLookToRead *p, int lookahead);
23676 void LookToRead_Init(CLookToRead *p);
23677 
// Adapter exposing an ILookInStream through the plain ISeqInStream interface
// (vtable installed by SecToLook_CreateVTable below).
struct CSecToLook
{
  ISeqInStream s;  // embedded vtable; must remain the first member
  ILookInStream *realStream;
};
23683 
23684 void SecToLook_CreateVTable(CSecToLook *p);
23685 
// Like CSecToLook, but the vtable (SecToRead_CreateVTable) presumably wires
// reads to ILookInStream::Read directly — implementation not visible here.
struct CSecToRead
{
  ISeqInStream s;  // embedded vtable; must remain the first member
  ILookInStream *realStream;
};
23691 
23692 void SecToRead_CreateVTable(CSecToRead *p);
23693 
// Progress-reporting callback; returning non-SZ_OK aborts the operation.
struct ICompressProgress
{
  SRes (*Progress)(void *p, uint64_t inSize, uint64_t outSize);
    /* Returns: result. (result != SZ_OK) means break.
       Value (uint64_t)(int64_t)-1 for size means unknown value. */
};
23700 
// Pluggable allocator used throughout the LZMA code.
struct ISzAlloc
{
  void *(*Alloc)(void *p, size_t size);
  void (*Free)(void *p, void *address); /* address can be 0 */
};
23706 
23707 #define IAlloc_Alloc(p, size) (p)->Alloc((p), size)
23708 #define IAlloc_Free(p, a) (p)->Free((p), a)
23709 
23710 #endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H
23711 
23712 // LzHash.h -- HASH functions for LZ algorithms
23713 // 2009-02-07 : Igor Pavlov : Public domain
23714 
23715 #define kHash2Size (1 << 10)
23716 #define kHash3Size (1 << 16)
23717 #define kHash4Size (1 << 20)
23718 
23719 #define kFix3HashSize (kHash2Size)
23720 #define kFix4HashSize (kHash2Size + kHash3Size)
23721 #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
23722 
23723 #define HASH2_CALC hashValue = cur[0] | ((uint32_t)cur[1] << 8);
23724 
23725 #define HASH3_CALC { \
23726   uint32_t temp = p->crc[cur[0]] ^ cur[1]; \
23727   hash2Value = temp & (kHash2Size - 1); \
23728   hashValue = (temp ^ ((uint32_t)cur[2] << 8)) & p->hashMask; }
23729 
23730 #define HASH4_CALC { \
23731   uint32_t temp = p->crc[cur[0]] ^ cur[1]; \
23732   hash2Value = temp & (kHash2Size - 1); \
23733   hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); \
23734   hashValue = (temp ^ ((uint32_t)cur[2] << 8) ^ (p->crc[cur[3]] << 5)) & p->hashMask; }
23735 
23736 #define HASH5_CALC { \
23737   uint32_t temp = p->crc[cur[0]] ^ cur[1]; \
23738   hash2Value = temp & (kHash2Size - 1); \
23739   hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); \
23740   hash4Value = (temp ^ ((uint32_t)cur[2] << 8) ^ (p->crc[cur[3]] << 5)); \
23741   hashValue = (hash4Value ^ (p->crc[cur[4]] << 3)) & p->hashMask; \
23742   hash4Value &= (kHash4Size - 1); }
23743 
23744 /* #define HASH_ZIP_CALC hashValue = ((cur[0] | ((uint32_t)cur[1] << 8)) ^ p->crc[cur[2]]) & 0xFFFF; */
23745 #define HASH_ZIP_CALC hashValue = ((cur[2] | ((uint32_t)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
23746 
23747 #define MT_HASH2_CALC \
23748   hash2Value = (p->crc[cur[0]] ^ cur[1]) & (kHash2Size - 1);
23749 
23750 #define MT_HASH3_CALC { \
23751   uint32_t temp = p->crc[cur[0]] ^ cur[1]; \
23752   hash2Value = temp & (kHash2Size - 1); \
23753   hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); }
23754 
23755 #define MT_HASH4_CALC { \
23756   uint32_t temp = p->crc[cur[0]] ^ cur[1]; \
23757   hash2Value = temp & (kHash2Size - 1); \
23758   hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); \
23759   hash4Value = (temp ^ ((uint32_t)cur[2] << 8) ^ (p->crc[cur[3]] << 5)) & (kHash4Size - 1); }
23760 
23761 // LzFind.h -- Match finder for LZ algorithms
23762 // 2009-04-22 : Igor Pavlov : Public domain
23763 
23764 typedef uint32_t CLzRef;
23765 
// State of the LZ match finder: a sliding input window plus hash/son tables
// shared by the hash-chain (Hc*) and binary-tree (Bt*) searchers below.
struct CMatchFinder
{
  uint8_t *buffer;    // current read position inside the window
  uint32_t pos;       // absolute stream position of *buffer
  uint32_t posLimit;  // pos at which MatchFinder_CheckLimits must run
  uint32_t streamPos; // absolute position up to which data has been read
  uint32_t lenLimit;  // longest match length usable at the current pos

  uint32_t cyclicBufferPos;
  uint32_t cyclicBufferSize; /* it must be = (historySize + 1) */

  uint32_t matchMaxLen;
  CLzRef *hash;       // hash heads: fixed 2/3/4-byte tables followed by the main table
  CLzRef *son;        // chain links (Hc mode) or child-link pairs (Bt mode)
  uint32_t hashMask;  // mask applied by the HASH*_CALC macros
  uint32_t cutValue;  // maximum nodes visited per match search

  uint8_t *bufferBase;     // start of the allocated window block
  ISeqInStream *stream;    // source of input bytes (unused in direct-input mode)
  int streamEndWasReached;

  uint32_t blockSize;
  uint32_t keepSizeBefore; // history bytes kept behind pos when the window slides
  uint32_t keepSizeAfter;  // lookahead bytes kept ahead of pos

  uint32_t numHashBytes;   // how many input bytes feed the main hash (2..5)
  int directInput;         // nonzero: window aliases caller-owned memory, no copying
  size_t directInputRem;   // bytes of direct input not yet consumed
  int btMode;              // nonzero: binary-tree finder (son holds 2 links per position)
  int bigHash;
  uint32_t historySize;
  uint32_t fixedHashSize;  // total size of the small fixed 2/3/4-byte hash tables
  uint32_t hashSizeSum;    // total entries in 'hash' (fixed tables + main table)
  uint32_t numSons;        // total entries in 'son'
  SRes result;             // sticky error code from stream reads
  uint32_t crc[256];       // CRC-32 table built in MatchFinder_Construct, used by HASH*_CALC
};
23803 
23804 #define Inline_MatchFinder_GetPointerToCurrentPos(p) ((p)->buffer)
23805 #define Inline_MatchFinder_GetIndexByte(p, index) ((p)->buffer[(int32_t)(index)])
23806 
23807 #define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos)
23808 
23809 int MatchFinder_NeedMove(CMatchFinder *p);
23810 uint8_t *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p);
23811 void MatchFinder_MoveBlock(CMatchFinder *p);
23812 void MatchFinder_ReadIfRequired(CMatchFinder *p);
23813 
23814 void MatchFinder_Construct(CMatchFinder *p);
23815 
23816 /* Conditions:
23817      historySize <= 3 GB
23818      keepAddBufferBefore + matchMaxLen + keepAddBufferAfter < 511MB
23819 */
23820 int MatchFinder_Create(CMatchFinder *p, uint32_t historySize,
23821     uint32_t keepAddBufferBefore, uint32_t matchMaxLen, uint32_t keepAddBufferAfter,
23822     ISzAlloc *alloc);
23823 void MatchFinder_Free(CMatchFinder *p, ISzAlloc *alloc);
23824 void MatchFinder_Normalize3(uint32_t subValue, CLzRef *items, uint32_t numItems);
23825 void MatchFinder_ReduceOffsets(CMatchFinder *p, uint32_t subValue);
23826 
23827 uint32_t * GetMatchesSpec1(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *buffer, CLzRef *son,
23828     uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t _cutValue,
23829     uint32_t *distances, uint32_t maxLen);
23830 
23831 /*
23832 Conditions:
23833   Mf_GetNumAvailableBytes_Func must be called before each Mf_GetMatchLen_Func.
23834   Mf_GetPointerToCurrentPos_Func's result must be used only before any other function
23835 */
23836 
23837 typedef void (*Mf_Init_Func)(void *object);
23838 typedef uint8_t (*Mf_GetIndexByte_Func)(void *object, int32_t index);
23839 typedef uint32_t (*Mf_GetNumAvailableBytes_Func)(void *object);
23840 typedef const uint8_t * (*Mf_GetPointerToCurrentPos_Func)(void *object);
23841 typedef uint32_t (*Mf_GetMatches_Func)(void *object, uint32_t *distances);
23842 typedef void (*Mf_Skip_Func)(void *object, uint32_t);
23843 
// Virtual table over a match finder, filled by MatchFinder_CreateVTable.
struct IMatchFinder
{
  Mf_Init_Func Init;
  Mf_GetIndexByte_Func GetIndexByte;
  Mf_GetNumAvailableBytes_Func GetNumAvailableBytes;
  Mf_GetPointerToCurrentPos_Func GetPointerToCurrentPos;
  Mf_GetMatches_Func GetMatches; // fills (length, distance-1) pairs; returns count of uint32s written
  Mf_Skip_Func Skip;             // advances N positions, updating tables without reporting matches
};
23853 
23854 void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable);
23855 
23856 void MatchFinder_Init(CMatchFinder *p);
23857 uint32_t Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances);
23858 uint32_t Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances);
23859 void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num);
23860 void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num);
23861 
23862 // LzFind.c -- Match finder for LZ algorithms
23863 // 2009-04-22 : Igor Pavlov : Public domain
23864 
23865 #define kEmptyHashValue 0
23866 #define kMaxValForNormalize ((uint32_t)0xFFFFFFFF)
23867 #define kNormalizeStepMin (1 << 10) /* it must be power of 2 */
23868 #define kNormalizeMask (~(kNormalizeStepMin - 1))
23869 #define kMaxHistorySize ((uint32_t)3 << 30)
23870 
23871 #define kStartMaxLen 3
23872 
LzInWindow_Free(CMatchFinder * p,ISzAlloc * alloc)23873 static void LzInWindow_Free(CMatchFinder *p, ISzAlloc *alloc)
23874 {
23875   if (!p->directInput)
23876   {
23877     alloc->Free(alloc, p->bufferBase);
23878     p->bufferBase = 0;
23879   }
23880 }
23881 
23882 /* keepSizeBefore + keepSizeAfter + keepSizeReserv must be < 4G) */
23883 
LzInWindow_Create(CMatchFinder * p,uint32_t keepSizeReserv,ISzAlloc * alloc)23884 static int LzInWindow_Create(CMatchFinder *p, uint32_t keepSizeReserv, ISzAlloc *alloc)
23885 {
23886   uint32_t blockSize = p->keepSizeBefore + p->keepSizeAfter + keepSizeReserv;
23887   if (p->directInput)
23888   {
23889     p->blockSize = blockSize;
23890     return 1;
23891   }
23892   if (p->bufferBase == 0 || p->blockSize != blockSize)
23893   {
23894     LzInWindow_Free(p, alloc);
23895     p->blockSize = blockSize;
23896     p->bufferBase = (uint8_t *)alloc->Alloc(alloc, (size_t)blockSize);
23897   }
23898   return (p->bufferBase != 0);
23899 }
23900 
// Out-of-line counterpart of Inline_MatchFinder_GetPointerToCurrentPos.
uint8_t *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; }
// Byte at signed offset 'index' relative to the current window position.
uint8_t MatchFinder_GetIndexByte(CMatchFinder *p, int32_t index) { return p->buffer[index]; }
23903 
// Number of bytes already read into the window but not yet consumed.
uint32_t MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return p->streamPos - p->pos; }
23905 
// Rebases the absolute position counters by subValue (window normalization);
// the hash/son tables are rebased separately via MatchFinder_Normalize3.
void MatchFinder_ReduceOffsets(CMatchFinder *p, uint32_t subValue)
{
  p->posLimit -= subValue;
  p->pos -= subValue;
  p->streamPos -= subValue;
}
23912 
// Refills the window from p->stream (or consumes the direct-input budget)
// until more than keepSizeAfter bytes are buffered past pos, EOF is hit,
// the buffer is full, or a read error is recorded in p->result.
static void MatchFinder_ReadBlock(CMatchFinder *p)
{
  if (p->streamEndWasReached || p->result != SZ_OK)
    return;
  if (p->directInput)
  {
    // Direct-input mode: nothing is copied; just advance streamPos by what
    // remains, capped so the 32-bit position counter cannot overflow.
    uint32_t curSize = 0xFFFFFFFF - p->streamPos;
    if (curSize > p->directInputRem)
      curSize = (uint32_t)p->directInputRem;
    p->directInputRem -= curSize;
    p->streamPos += curSize;
    if (p->directInputRem == 0)
      p->streamEndWasReached = 1;
    return;
  }
  for (;;)
  {
    uint8_t *dest = p->buffer + (p->streamPos - p->pos); // end of valid data
    size_t size = (p->bufferBase + p->blockSize - dest); // free space left
    if (size == 0)
      return;
    p->result = p->stream->Read(p->stream, dest, &size);
    if (p->result != SZ_OK)
      return;
    if (size == 0)
    {
      p->streamEndWasReached = 1; // zero-byte read signals end of stream
      return;
    }
    p->streamPos += (uint32_t)size;
    if (p->streamPos - p->pos > p->keepSizeAfter)
      return; // enough lookahead buffered
  }
}
23947 
// Slides the window: moves the last keepSizeBefore bytes of history plus all
// unread lookahead to the start of the allocation, then rebases 'buffer'.
void MatchFinder_MoveBlock(CMatchFinder *p)
{
  memmove(p->bufferBase,
    p->buffer - p->keepSizeBefore,
    (size_t)(p->streamPos - p->pos + p->keepSizeBefore));
  p->buffer = p->bufferBase + p->keepSizeBefore;
}
23955 
MatchFinder_NeedMove(CMatchFinder * p)23956 int MatchFinder_NeedMove(CMatchFinder *p)
23957 {
23958   if (p->directInput)
23959     return 0;
23960   /* if (p->streamEndWasReached) return 0; */
23961   return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
23962 }
23963 
MatchFinder_ReadIfRequired(CMatchFinder * p)23964 void MatchFinder_ReadIfRequired(CMatchFinder *p)
23965 {
23966   if (p->streamEndWasReached)
23967     return;
23968   if (p->keepSizeAfter >= p->streamPos - p->pos)
23969     MatchFinder_ReadBlock(p);
23970 }
23971 
// Slides the window if necessary, then refills it.
static void MatchFinder_CheckAndMoveAndRead(CMatchFinder *p)
{
  if (MatchFinder_NeedMove(p))
    MatchFinder_MoveBlock(p);
  MatchFinder_ReadBlock(p);
}
23978 
MatchFinder_SetDefaultSettings(CMatchFinder * p)23979 static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
23980 {
23981   p->cutValue = 32;
23982   p->btMode = 1;
23983   p->numHashBytes = 4;
23984   p->bigHash = 0;
23985 }
23986 
23987 #define kCrcPoly 0xEDB88320
23988 
MatchFinder_Construct(CMatchFinder * p)23989 void MatchFinder_Construct(CMatchFinder *p)
23990 {
23991   uint32_t i;
23992   p->bufferBase = 0;
23993   p->directInput = 0;
23994   p->hash = 0;
23995   MatchFinder_SetDefaultSettings(p);
23996 
23997   for (i = 0; i < 256; i++)
23998   {
23999     uint32_t r = i;
24000     int j;
24001     for (j = 0; j < 8; j++)
24002       r = (r >> 1) ^ (kCrcPoly & ~((r & 1) - 1));
24003     p->crc[i] = r;
24004   }
24005 }
24006 
// Frees the combined hash/son allocation ('son' points into 'hash').
static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAlloc *alloc)
{
  alloc->Free(alloc, p->hash);
  p->hash = 0;
}
24012 
// Releases all resources owned by the match finder (tables and window).
void MatchFinder_Free(CMatchFinder *p, ISzAlloc *alloc)
{
  MatchFinder_FreeThisClassMemory(p, alloc);
  LzInWindow_Free(p, alloc);
}
24018 
AllocRefs(uint32_t num,ISzAlloc * alloc)24019 static CLzRef* AllocRefs(uint32_t num, ISzAlloc *alloc)
24020 {
24021   size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
24022   if (sizeInBytes / sizeof(CLzRef) != num)
24023     return 0;
24024   return (CLzRef *)alloc->Alloc(alloc, sizeInBytes);
24025 }
24026 
// Sizes and allocates the window plus the hash/son tables for the requested
// dictionary (historySize). Returns 1 on success; on failure the finder is
// freed and 0 is returned.
int MatchFinder_Create(CMatchFinder *p, uint32_t historySize,
    uint32_t keepAddBufferBefore, uint32_t matchMaxLen, uint32_t keepAddBufferAfter,
    ISzAlloc *alloc)
{
  uint32_t sizeReserv;
  if (historySize > kMaxHistorySize)
  {
    MatchFinder_Free(p, alloc);
    return 0;
  }
  // Extra slack beyond the dictionary so the window has to slide rarely.
  sizeReserv = historySize >> 1;
  if (historySize > ((uint32_t)2 << 30))
    sizeReserv = historySize >> 2;
  sizeReserv += (keepAddBufferBefore + matchMaxLen + keepAddBufferAfter) / 2 + (1 << 19);

  p->keepSizeBefore = historySize + keepAddBufferBefore + 1;
  p->keepSizeAfter = matchMaxLen + keepAddBufferAfter;
  /* we need one additional byte, since we use MoveBlock after pos++ and before dictionary using */
  if (LzInWindow_Create(p, sizeReserv, alloc))
  {
    uint32_t newCyclicBufferSize = historySize + 1;
    uint32_t hs;
    p->matchMaxLen = matchMaxLen;
    {
      // Main hash size: for 2-byte hashing a fixed 64K table; otherwise a
      // power of two derived from historySize, at least 0x10000 entries and
      // bounded for 3-byte hashing by 1 << 24.
      p->fixedHashSize = 0;
      if (p->numHashBytes == 2)
        hs = (1 << 16) - 1;
      else
      {
        hs = historySize - 1;
        hs |= (hs >> 1);
        hs |= (hs >> 2);
        hs |= (hs >> 4);
        hs |= (hs >> 8);
        hs >>= 1;
        hs |= 0xFFFF; /* don't change it! It's required for Deflate */
        if (hs > (1 << 24))
        {
          if (p->numHashBytes == 3)
            hs = (1 << 24) - 1;
          else
            hs >>= 1;
        }
      }
      p->hashMask = hs;
      hs++;
      // The small fixed 2/3/4-byte sub-hash tables precede the main table.
      if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size;
      if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size;
      if (p->numHashBytes > 4) p->fixedHashSize += kHash4Size;
      hs += p->fixedHashSize;
    }

    {
      uint32_t prevSize = p->hashSizeSum + p->numSons;
      uint32_t newSize;
      p->historySize = historySize;
      p->hashSizeSum = hs;
      p->cyclicBufferSize = newCyclicBufferSize;
      // Binary-tree mode keeps two child links per window position.
      p->numSons = (p->btMode ? newCyclicBufferSize * 2 : newCyclicBufferSize);
      newSize = p->hashSizeSum + p->numSons;
      if (p->hash != 0 && prevSize == newSize)
        return 1; // existing allocation already has the required size
      MatchFinder_FreeThisClassMemory(p, alloc);
      p->hash = AllocRefs(newSize, alloc);
      if (p->hash != 0)
      {
        p->son = p->hash + p->hashSizeSum; // son shares the single allocation
        return 1;
      }
    }
  }
  MatchFinder_Free(p, alloc);
  return 0;
}
24101 
// Recomputes posLimit — how far pos may advance before CheckLimits must run
// (bounded by normalization, cyclic-buffer wrap, and buffered data) — and
// lenLimit, the longest match usable at the current position.
static void MatchFinder_SetLimits(CMatchFinder *p)
{
  uint32_t limit = kMaxValForNormalize - p->pos; // steps until normalization needed
  uint32_t limit2 = p->cyclicBufferSize - p->cyclicBufferPos; // steps until buffer wrap
  if (limit2 < limit)
    limit = limit2;
  limit2 = p->streamPos - p->pos;
  if (limit2 <= p->keepSizeAfter)
  {
    // Lookahead is running out: advance a single byte at a time so the
    // refill check in CheckLimits fires promptly.
    if (limit2 > 0)
      limit2 = 1;
  }
  else
    limit2 -= p->keepSizeAfter;
  if (limit2 < limit)
    limit = limit2;
  {
    uint32_t lenLimit = p->streamPos - p->pos;
    if (lenLimit > p->matchMaxLen)
      lenLimit = p->matchMaxLen;
    p->lenLimit = lenLimit;
  }
  p->posLimit = p->pos + limit;
}
24126 
MatchFinder_Init(CMatchFinder * p)24127 void MatchFinder_Init(CMatchFinder *p)
24128 {
24129   uint32_t i;
24130   for (i = 0; i < p->hashSizeSum; i++)
24131     p->hash[i] = kEmptyHashValue;
24132   p->cyclicBufferPos = 0;
24133   p->buffer = p->bufferBase;
24134   p->pos = p->streamPos = p->cyclicBufferSize;
24135   p->result = SZ_OK;
24136   p->streamEndWasReached = 0;
24137   MatchFinder_ReadBlock(p);
24138   MatchFinder_SetLimits(p);
24139 }
24140 
// Offset to subtract during normalization: the oldest position still inside
// the dictionary, rounded down to a multiple of kNormalizeStepMin.
static uint32_t MatchFinder_GetSubValue(CMatchFinder *p)
{
  return (p->pos - p->historySize - 1) & kNormalizeMask;
}
24145 
MatchFinder_Normalize3(uint32_t subValue,CLzRef * items,uint32_t numItems)24146 void MatchFinder_Normalize3(uint32_t subValue, CLzRef *items, uint32_t numItems)
24147 {
24148   uint32_t i;
24149   for (i = 0; i < numItems; i++)
24150   {
24151     uint32_t value = items[i];
24152     if (value <= subValue)
24153       value = kEmptyHashValue;
24154     else
24155       value -= subValue;
24156     items[i] = value;
24157   }
24158 }
24159 
// Full normalization: rebases both tables (hash and son share one
// allocation) and the position counters by the same sub-value.
static void MatchFinder_Normalize(CMatchFinder *p)
{
  uint32_t subValue = MatchFinder_GetSubValue(p);
  MatchFinder_Normalize3(subValue, p->hash, p->hashSizeSum + p->numSons);
  MatchFinder_ReduceOffsets(p, subValue);
}
24166 
// Invoked when pos hits posLimit: normalizes before 32-bit position
// overflow, refills (and possibly slides) the window, wraps the cyclic
// buffer index, and recomputes the limits.
static void MatchFinder_CheckLimits(CMatchFinder *p)
{
  if (p->pos == kMaxValForNormalize)
    MatchFinder_Normalize(p);
  if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos)
    MatchFinder_CheckAndMoveAndRead(p);
  if (p->cyclicBufferPos == p->cyclicBufferSize)
    p->cyclicBufferPos = 0;
  MatchFinder_SetLimits(p);
}
24177 
// Hash-chain search: walks the chain starting at curMatch, writing
// (length, distance - 1) pairs of strictly increasing length into
// 'distances'. Returns one past the last value written.
static uint32_t * Hc_GetMatchesSpec(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *cur, CLzRef *son,
    uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t cutValue,
    uint32_t *distances, uint32_t maxLen)
{
  son[_cyclicBufferPos] = curMatch; // link the current position into the chain
  for (;;)
  {
    uint32_t delta = pos - curMatch;
    if (cutValue-- == 0 || delta >= _cyclicBufferSize)
      return distances; // search budget spent, or candidate left the window
    {
      const uint8_t *pb = cur - delta;
      // Follow the chain (index into son is cyclic).
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      // Cheap pre-check: first byte and the byte at the current best length.
      if (pb[maxLen] == cur[maxLen] && *pb == *cur)
      {
        uint32_t len = 0;
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        if (maxLen < len)
        {
          *distances++ = maxLen = len;
          *distances++ = delta - 1;
          if (len == lenLimit)
            return distances; // cannot improve further
        }
      }
    }
  }
}
24208 
// Binary-tree search: records (length, distance - 1) pairs of increasing
// length like Hc_GetMatchesSpec, but maintains two child links per window
// position in 'son' (at indices 2i and 2i+1), re-rooting the tree at the
// current position as it descends. Returns one past the last value written.
uint32_t * GetMatchesSpec1(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *cur, CLzRef *son,
    uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t cutValue,
    uint32_t *distances, uint32_t maxLen)
{
  CLzRef *ptr0 = son + (_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + (_cyclicBufferPos << 1);
  uint32_t len0 = 0, len1 = 0; // guaranteed common-prefix length on each branch
  for (;;)
  {
    uint32_t delta = pos - curMatch;
    if (cutValue-- == 0 || delta >= _cyclicBufferSize)
    {
      *ptr0 = *ptr1 = kEmptyHashValue; // terminate both branches here
      return distances;
    }
    {
      CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const uint8_t *pb = cur - delta;
      // Comparison can start at the shorter of the two branch prefixes.
      uint32_t len = (len0 < len1 ? len0 : len1);
      if (pb[len] == cur[len])
      {
        if (++len != lenLimit && pb[len] == cur[len])
          while (++len != lenLimit)
            if (pb[len] != cur[len])
              break;
        if (maxLen < len)
        {
          *distances++ = maxLen = len;
          *distances++ = delta - 1;
          if (len == lenLimit)
          {
            // Full-length match: splice its children in and stop.
            *ptr1 = pair[0];
            *ptr0 = pair[1];
            return distances;
          }
        }
      }
      // Descend toward the side the current suffix sorts into.
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        ptr1 = pair + 1;
        curMatch = *ptr1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        ptr0 = pair;
        curMatch = *ptr0;
        len0 = len;
      }
    }
  }
}
24263 
// Same tree traversal as GetMatchesSpec1, but only updates the 'son' links —
// no match pairs are recorded. Used when positions are skipped.
static void SkipMatchesSpec(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *cur, CLzRef *son,
    uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t cutValue)
{
  CLzRef *ptr0 = son + (_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + (_cyclicBufferPos << 1);
  uint32_t len0 = 0, len1 = 0; // guaranteed common-prefix length on each branch
  for (;;)
  {
    uint32_t delta = pos - curMatch;
    if (cutValue-- == 0 || delta >= _cyclicBufferSize)
    {
      *ptr0 = *ptr1 = kEmptyHashValue; // terminate both branches here
      return;
    }
    {
      CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const uint8_t *pb = cur - delta;
      uint32_t len = (len0 < len1 ? len0 : len1);
      if (pb[len] == cur[len])
      {
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        {
          if (len == lenLimit)
          {
            // Full-length match: splice its children in and stop.
            *ptr1 = pair[0];
            *ptr0 = pair[1];
            return;
          }
        }
      }
      // Descend toward the side the current suffix sorts into.
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        ptr1 = pair + 1;
        curMatch = *ptr1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        ptr0 = pair;
        curMatch = *ptr0;
        len0 = len;
      }
    }
  }
}
24313 
24314 #define MOVE_POS \
24315   ++p->cyclicBufferPos; \
24316   p->buffer++; \
24317   if (++p->pos == p->posLimit) MatchFinder_CheckLimits(p);
24318 
24319 #define MOVE_POS_RET MOVE_POS return offset;
24320 
// Advances the window by one byte (cyclic index, buffer pointer, absolute
// pos), running MatchFinder_CheckLimits when posLimit is reached.
static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; }
24322 
24323 #define GET_MATCHES_HEADER2(minLen, ret_op) \
24324   uint32_t lenLimit; uint32_t hashValue; const uint8_t *cur; uint32_t curMatch; \
24325   lenLimit = p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \
24326   cur = p->buffer;
24327 
24328 #define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return 0)
24329 #define SKIP_HEADER(minLen)        GET_MATCHES_HEADER2(minLen, continue)
24330 
24331 #define MF_PARAMS(p) p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue
24332 
24333 #define GET_MATCHES_FOOTER(offset, maxLen) \
24334   offset = (uint32_t)(GetMatchesSpec1(lenLimit, curMatch, MF_PARAMS(p), \
24335   distances + offset, maxLen) - distances); MOVE_POS_RET;
24336 
24337 #define SKIP_FOOTER \
24338   SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); MOVE_POS;
24339 
// GetMatches for the 2-byte-hash binary-tree finder: look up and replace the
// hash head, then search the tree (minimum reported length is 2).
static uint32_t Bt2_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t offset;
  GET_MATCHES_HEADER(2)
  HASH2_CALC;
  curMatch = p->hash[hashValue];
  p->hash[hashValue] = p->pos;
  offset = 0;
  GET_MATCHES_FOOTER(offset, 1)
}
24350 
// GetMatches for the Deflate-style (zip) 3-byte hash, binary-tree variant.
uint32_t Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t offset;
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC;
  curMatch = p->hash[hashValue];
  p->hash[hashValue] = p->pos;
  offset = 0;
  GET_MATCHES_FOOTER(offset, 2)
}
24361 
// GetMatches for the 3-byte-hash binary-tree finder. The 2-byte sub-hash is
// consulted first so a near, short match can be reported before the tree
// search extends it.
static uint32_t Bt3_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t hash2Value, delta2, maxLen, offset;
  GET_MATCHES_HEADER(3)

  HASH3_CALC;

  delta2 = p->pos - p->hash[hash2Value];
  curMatch = p->hash[kFix3HashSize + hashValue];

  p->hash[hash2Value] =
  p->hash[kFix3HashSize + hashValue] = p->pos;

  maxLen = 2;
  offset = 0;
  if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur)
  {
    // Extend the 2-byte candidate as far as it goes.
    for (; maxLen != lenLimit; maxLen++)
      if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen])
        break;
    distances[0] = maxLen;
    distances[1] = delta2 - 1;
    offset = 2;
    if (maxLen == lenLimit)
    {
      // Already maximal: just update the tree and return this single pair.
      SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p));
      MOVE_POS_RET;
    }
  }
  GET_MATCHES_FOOTER(offset, maxLen)
}
24393 
// GetMatches for the 4-byte-hash binary-tree finder. The 2- and 3-byte
// sub-hashes supply cheap short candidates before the tree search.
static uint32_t Bt4_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t hash2Value, hash3Value, delta2, delta3, maxLen, offset;
  GET_MATCHES_HEADER(4)

  HASH4_CALC;

  delta2 = p->pos - p->hash[                hash2Value];
  delta3 = p->pos - p->hash[kFix3HashSize + hash3Value];
  curMatch = p->hash[kFix4HashSize + hashValue];

  p->hash[                hash2Value] =
  p->hash[kFix3HashSize + hash3Value] =
  p->hash[kFix4HashSize + hashValue] = p->pos;

  maxLen = 1;
  offset = 0;
  if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur)
  {
    distances[0] = maxLen = 2;
    distances[1] = delta2 - 1;
    offset = 2;
  }
  if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur)
  {
    maxLen = 3;
    distances[offset + 1] = delta3 - 1;
    offset += 2;
    delta2 = delta3;
  }
  if (offset != 0)
  {
    // Extend whichever sub-hash candidate survived; overwrite its length.
    for (; maxLen != lenLimit; maxLen++)
      if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen])
        break;
    distances[offset - 2] = maxLen;
    if (maxLen == lenLimit)
    {
      // Already maximal: just update the tree and return what we have.
      SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p));
      MOVE_POS_RET;
    }
  }
  if (maxLen < 3)
    maxLen = 3;
  GET_MATCHES_FOOTER(offset, maxLen)
}
24440 
// GetMatches for the 4-byte-hash hash-chain finder; mirrors
// Bt4_MatchFinder_GetMatches but finishes with a chain walk instead of a
// tree search.
static uint32_t Hc4_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t hash2Value, hash3Value, delta2, delta3, maxLen, offset;
  GET_MATCHES_HEADER(4)

  HASH4_CALC;

  delta2 = p->pos - p->hash[                hash2Value];
  delta3 = p->pos - p->hash[kFix3HashSize + hash3Value];
  curMatch = p->hash[kFix4HashSize + hashValue];

  p->hash[                hash2Value] =
  p->hash[kFix3HashSize + hash3Value] =
  p->hash[kFix4HashSize + hashValue] = p->pos;

  maxLen = 1;
  offset = 0;
  if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur)
  {
    distances[0] = maxLen = 2;
    distances[1] = delta2 - 1;
    offset = 2;
  }
  if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur)
  {
    maxLen = 3;
    distances[offset + 1] = delta3 - 1;
    offset += 2;
    delta2 = delta3;
  }
  if (offset != 0)
  {
    // Extend whichever sub-hash candidate survived; overwrite its length.
    for (; maxLen != lenLimit; maxLen++)
      if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen])
        break;
    distances[offset - 2] = maxLen;
    if (maxLen == lenLimit)
    {
      // Already maximal: just link this position into the chain and return.
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET;
    }
  }
  if (maxLen < 3)
    maxLen = 3;
  offset = (uint32_t)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p),
    distances + offset, maxLen) - (distances));
  MOVE_POS_RET
}
24489 
// GetMatches for the Deflate-style (zip) 3-byte hash, hash-chain variant.
uint32_t Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t offset;
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC;
  curMatch = p->hash[hashValue];
  p->hash[hashValue] = p->pos;
  offset = (uint32_t)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p),
    distances, 2) - (distances));
  MOVE_POS_RET
}
24501 
// Skips 'num' positions for the 2-byte-hash tree finder, keeping the hash
// heads and tree links up to date without reporting matches.
static void Bt2_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    SKIP_HEADER(2)
    HASH2_CALC;
    curMatch = p->hash[hashValue];
    p->hash[hashValue] = p->pos;
    SKIP_FOOTER
  }
  while (--num != 0);
}
24514 
// Skip for the zip-hash binary-tree variant.
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    SKIP_HEADER(3)
    HASH_ZIP_CALC;
    curMatch = p->hash[hashValue];
    p->hash[hashValue] = p->pos;
    SKIP_FOOTER
  }
  while (--num != 0);
}
24527 
// Skip for the 3-byte-hash tree finder; updates the 2-byte sub-hash too.
static void Bt3_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    uint32_t hash2Value;
    SKIP_HEADER(3)
    HASH3_CALC;
    curMatch = p->hash[kFix3HashSize + hashValue];
    p->hash[hash2Value] =
    p->hash[kFix3HashSize + hashValue] = p->pos;
    SKIP_FOOTER
  }
  while (--num != 0);
}
24542 
// Skip variant for binary-tree mode with a 4-byte hash.  All three hash
// tables (2-, 3- and 4-byte, at offsets 0 / kFix3HashSize / kFix4HashSize)
// are refreshed for every skipped position.  Precondition: num != 0.
static void Bt4_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    uint32_t hash2Value, hash3Value;
    SKIP_HEADER(4)
    HASH4_CALC;
    curMatch = p->hash[kFix4HashSize + hashValue];
    p->hash[                hash2Value] =
    p->hash[kFix3HashSize + hash3Value] = p->pos;
    p->hash[kFix4HashSize + hashValue] = p->pos;
    SKIP_FOOTER
  }
  while (--num != 0);
}
24558 
// Skip variant for hash-chain mode with a 4-byte hash.  Unlike the Bt_*
// skips, the chain link is written directly (son[cyclicBufferPos]) instead
// of going through SKIP_FOOTER's tree insertion.  Precondition: num != 0.
static void Hc4_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    uint32_t hash2Value, hash3Value;
    SKIP_HEADER(4)
    HASH4_CALC;
    curMatch = p->hash[kFix4HashSize + hashValue];
    p->hash[                hash2Value] =
    p->hash[kFix3HashSize + hash3Value] =
    p->hash[kFix4HashSize + hashValue] = p->pos;
    p->son[p->cyclicBufferPos] = curMatch;  // link current position into the chain
    MOVE_POS
  }
  while (--num != 0);
}
24575 
// Skip variant for hash-chain mode with the zip-style 3-byte hash; mirrors
// Hc4_MatchFinder_Skip but with a single hash table.  Precondition: num != 0.
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    SKIP_HEADER(3)
    HASH_ZIP_CALC;
    curMatch = p->hash[hashValue];
    p->hash[hashValue] = p->pos;
    p->son[p->cyclicBufferPos] = curMatch;  // link current position into the chain
    MOVE_POS
  }
  while (--num != 0);
}
24589 
MatchFinder_CreateVTable(CMatchFinder * p,IMatchFinder * vTable)24590 void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable)
24591 {
24592   vTable->Init = (Mf_Init_Func)MatchFinder_Init;
24593   vTable->GetIndexByte = (Mf_GetIndexByte_Func)MatchFinder_GetIndexByte;
24594   vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes;
24595   vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos;
24596   if (!p->btMode)
24597   {
24598     vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches;
24599     vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip;
24600   }
24601   else if (p->numHashBytes == 2)
24602   {
24603     vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches;
24604     vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip;
24605   }
24606   else if (p->numHashBytes == 3)
24607   {
24608     vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches;
24609     vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip;
24610   }
24611   else
24612   {
24613     vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches;
24614     vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip;
24615   }
24616 }
24617 
24618 //  LzmaEnc.h -- LZMA Encoder
24619 // 2009-02-07 : Igor Pavlov : Public domain
24620 
24621 #define LZMA_PROPS_SIZE 5
24622 
// User-visible encoder configuration.  Fields left at -1 (or 0 for
// dictSize/mc) mean "unset"; LzmaEncProps_Normalize replaces them with
// level-derived defaults before use.
struct CLzmaEncProps
{
  int level;       /*  0 <= level <= 9 */
  uint32_t dictSize; /* (1 << 12) <= dictSize <= (1 << 27) for 32-bit version
                      (1 << 12) <= dictSize <= (1 << 30) for 64-bit version
                       default = (1 << 24) */
  int lc;          /* 0 <= lc <= 8, default = 3 */
  int lp;          /* 0 <= lp <= 4, default = 0 */
  int pb;          /* 0 <= pb <= 4, default = 2 */
  int algo;        /* 0 - fast, 1 - normal, default = 1 */
  int fb;          /* 5 <= fb <= 273, default = 32 */
  int btMode;      /* 0 - hashChain Mode, 1 - binTree mode - normal, default = 1 */
  int numHashBytes; /* 2, 3 or 4, default = 4 */
  uint32_t mc;        /* 1 <= mc <= (1 << 30), default = 32 */
  unsigned writeEndMark;  /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */
  int numThreads;  /* 1 or 2, default = 2 */
};
24640 
24641 void LzmaEncProps_Init(CLzmaEncProps *p);
24642 void LzmaEncProps_Normalize(CLzmaEncProps *p);
24643 uint32_t LzmaEncProps_GetDictSize(const CLzmaEncProps *props2);
24644 
24645 /* ---------- CLzmaEncHandle Interface ---------- */
24646 
24647 /* LzmaEnc_* functions can return the following exit codes:
24648 Returns:
24649   SZ_OK           - OK
24650   SZ_ERROR_MEM    - Memory allocation error
  SZ_ERROR_PARAM  - Incorrect parameter in props
24652   SZ_ERROR_WRITE  - Write callback error.
24653   SZ_ERROR_PROGRESS - some break from progress callback
24654   SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version)
24655 */
24656 
24657 typedef void * CLzmaEncHandle;
24658 
24659 CLzmaEncHandle LzmaEnc_Create(ISzAlloc *alloc);
24660 void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAlloc *alloc, ISzAlloc *allocBig);
24661 SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props);
24662 SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, uint8_t *properties, size_t *size);
24663 SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream,
24664     ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig);
24665 SRes LzmaEnc_MemEncode(CLzmaEncHandle p, uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen,
24666     int writeEndMark, ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig);
24667 
24668 /* ---------- One Call Interface ---------- */
24669 
24670 /* LzmaEncode
24671 Return code:
24672   SZ_OK               - OK
24673   SZ_ERROR_MEM        - Memory allocation error
  SZ_ERROR_PARAM      - Incorrect parameter
24675   SZ_ERROR_OUTPUT_EOF - output buffer overflow
24676   SZ_ERROR_THREAD     - errors in multithreading functions (only for Mt version)
24677 */
24678 
24679 SRes LzmaEncode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen,
24680     const CLzmaEncProps *props, uint8_t *propsEncoded, size_t *propsSize, int writeEndMark,
24681     ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig);
24682 
24683 // LzmaEnc.c -- LZMA Encoder
24684 // 2010-04-16 : Igor Pavlov : Public domain
24685 
24686 #define kBlockSizeMax ((1 << LZMA_NUM_BLOCK_SIZE_BITS) - 1)
24687 
24688 #define kBlockSize (9 << 10)
24689 #define kUnpackBlockSize (1 << 18)
24690 #define kMatchArraySize (1 << 21)
24691 #define kMatchRecordMaxSize ((LZMA_MATCH_LEN_MAX * 2 + 3) * LZMA_MATCH_LEN_MAX)
24692 
24693 #define kNumMaxDirectBits (31)
24694 
24695 #define kNumTopBits 24
24696 #define kTopValue ((uint32_t)1 << kNumTopBits)
24697 
24698 #define kNumBitModelTotalBits 11
24699 #define kBitModelTotal (1 << kNumBitModelTotalBits)
24700 #define kNumMoveBits 5
24701 #define kProbInitValue (kBitModelTotal >> 1)
24702 
24703 #define kNumMoveReducingBits 4
24704 #define kNumBitPriceShiftBits 4
24705 #define kBitPrice (1 << kNumBitPriceShiftBits)
24706 
LzmaEncProps_Init(CLzmaEncProps * p)24707 void LzmaEncProps_Init(CLzmaEncProps *p)
24708 {
24709   p->level = 5;
24710   p->dictSize = p->mc = 0;
24711   p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1;
24712   p->writeEndMark = 0;
24713 }
24714 
// Replaces every "unset" property (negative, or 0 for dictSize/mc) with its
// default derived from the compression level.  Order matters: fb and btMode
// must be resolved before mc, which is computed from both.
void LzmaEncProps_Normalize(CLzmaEncProps *p)
{
  int level = p->level;
  if (level < 0) level = 5;   // default compression level
  p->level = level;
  // Dictionary grows with level: 16K..16M for levels 0-5, then 32M / 64M.
  if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26)));
  if (p->lc < 0) p->lc = 3;
  if (p->lp < 0) p->lp = 0;
  if (p->pb < 0) p->pb = 2;
  if (p->algo < 0) p->algo = (level < 5 ? 0 : 1);   // fast algorithm for low levels
  if (p->fb < 0) p->fb = (level < 7 ? 32 : 64);
  if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1);
  if (p->numHashBytes < 0) p->numHashBytes = 4;
  // Match-finder cycles scale with fb; halved in hash-chain mode.
  if (p->mc == 0)  p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1);
  if (p->numThreads < 0)
    p->numThreads = 1;
}
24732 
LzmaEncProps_GetDictSize(const CLzmaEncProps * props2)24733 uint32_t LzmaEncProps_GetDictSize(const CLzmaEncProps *props2)
24734 {
24735   CLzmaEncProps props = *props2;
24736   LzmaEncProps_Normalize(&props);
24737   return props.dictSize;
24738 }
24739 
24740 /* #define LZMA_LOG_BSR */
24741 /* Define it for Intel's CPU */
24742 
24743 #ifdef LZMA_LOG_BSR
24744 
24745 #define kDicLogSizeMaxCompress 30
24746 
24747 #define BSR2_RET(pos, res) { unsigned long i; _BitScanReverse(&i, (pos)); res = (i + i) + ((pos >> (i - 1)) & 1); }
24748 
// Maps a match distance to its position slot via the hardware bit-scan path
// (this branch is only compiled when LZMA_LOG_BSR is defined; BSR2_RET
// wraps _BitScanReverse, so it is MSVC/x86-specific).
uint32_t GetPosSlot1(uint32_t pos)
{
  uint32_t res;
  BSR2_RET(pos, res);
  return res;
}
24755 #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); }
24756 #define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); }
24757 
24758 #else
24759 
24760 //#define kNumLogBits (9 + (int)sizeof(size_t) / 2)
24761 #define kNumLogBits (9 + (int)sizeof(uint32_t) / 2)
24762 #define kDicLogSizeMaxCompress ((kNumLogBits - 1) * 2 + 7)
24763 
LzmaEnc_FastPosInit(uint8_t * g_FastPos)24764 void LzmaEnc_FastPosInit(uint8_t *g_FastPos)
24765 {
24766   int c = 2, slotFast;
24767   g_FastPos[0] = 0;
24768   g_FastPos[1] = 1;
24769 
24770   for (slotFast = 2; slotFast < kNumLogBits * 2; slotFast++)
24771   {
24772     uint32_t k = (1 << ((slotFast >> 1) - 1));
24773     uint32_t j;
24774     for (j = 0; j < k; j++, c++)
24775       g_FastPos[c] = (uint8_t)slotFast;
24776   }
24777 }
24778 
24779 #define BSR2_RET(pos, res) { uint32_t i = 6 + ((kNumLogBits - 1) & \
24780   (0 - (((((uint32_t)1 << (kNumLogBits + 6)) - 1) - pos) >> 31))); \
24781   res = p->g_FastPos[pos >> i] + (i * 2); }
24782 /*
24783 #define BSR2_RET(pos, res) { res = (pos < (1 << (kNumLogBits + 6))) ? \
24784   p->g_FastPos[pos >> 6] + 12 : \
24785   p->g_FastPos[pos >> (6 + kNumLogBits - 1)] + (6 + (kNumLogBits - 1)) * 2; }
24786 */
24787 
24788 #define GetPosSlot1(pos) p->g_FastPos[pos]
24789 #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); }
24790 #define GetPosSlot(pos, res) { if (pos < kNumFullDistances) res = p->g_FastPos[pos]; else BSR2_RET(pos, res); }
24791 
24792 #endif
24793 
24794 #define LZMA_NUM_REPS 4
24795 
24796 typedef unsigned CState;
24797 
// One node of the encoder's optimal-parse table (CLzmaEnc::opt).  Stores the
// cheapest price found so far to reach this position plus the back-pointers
// needed to reconstruct the chosen literal/match sequence.  Exact field
// semantics are defined by the optimal-parse code later in this bundle.
struct COptimal
{
  uint32_t price;     // cheapest encoding price found so far for this node

  CState state;       // coder state after reaching this node
  int prev1IsChar;    // nonzero if the preceding step was a literal
  int prev2;          // nonzero if a two-step transition was recorded

  uint32_t posPrev2;  // position/back of the earlier step when prev2 is set
  uint32_t backPrev2;

  uint32_t posPrev;   // index of the predecessor node
  uint32_t backPrev;  // distance (or rep index) used to reach this node
  uint32_t backs[LZMA_NUM_REPS];  // the four repeat distances at this node
};
24813 
24814 #define kNumOpts (1 << 12)
24815 
24816 #define kNumLenToPosStates 4
24817 #define kNumPosSlotBits 6
24818 #define kDicLogSizeMin 0
24819 #define kDicLogSizeMax 32
24820 #define kDistTableSizeMax (kDicLogSizeMax * 2)
24821 
24822 #define kNumAlignBits 4
24823 #define kAlignTableSize (1 << kNumAlignBits)
24824 #define kAlignMask (kAlignTableSize - 1)
24825 
24826 #define kStartPosModelIndex 4
24827 #define kEndPosModelIndex 14
24828 #define kNumPosModels (kEndPosModelIndex - kStartPosModelIndex)
24829 
24830 #define kNumFullDistances (1 << (kEndPosModelIndex >> 1))
24831 
24832 #ifdef _LZMA_PROB32
24833 #define CLzmaProb uint32_t
24834 #else
24835 #define CLzmaProb uint16_t
24836 #endif
24837 
24838 #define LZMA_PB_MAX 4
24839 #define LZMA_LC_MAX 8
24840 #define LZMA_LP_MAX 4
24841 
24842 #define LZMA_NUM_PB_STATES_MAX (1 << LZMA_PB_MAX)
24843 
24844 #define kLenNumLowBits 3
24845 #define kLenNumLowSymbols (1 << kLenNumLowBits)
24846 #define kLenNumMidBits 3
24847 #define kLenNumMidSymbols (1 << kLenNumMidBits)
24848 #define kLenNumHighBits 8
24849 #define kLenNumHighSymbols (1 << kLenNumHighBits)
24850 
24851 #define kLenNumSymbolsTotal (kLenNumLowSymbols + kLenNumMidSymbols + kLenNumHighSymbols)
24852 
24853 #define LZMA_MATCH_LEN_MIN 2
24854 #define LZMA_MATCH_LEN_MAX (LZMA_MATCH_LEN_MIN + kLenNumSymbolsTotal - 1)
24855 
24856 #define kNumStates 12
24857 
// Probability model for match-length coding (see LenEnc_Encode): choice and
// choice2 select between the low / mid / high bit-trees; low and mid are
// additionally indexed by the position state.
struct CLenEnc
{
  CLzmaProb choice;
  CLzmaProb choice2;
  CLzmaProb low[LZMA_NUM_PB_STATES_MAX << kLenNumLowBits];
  CLzmaProb mid[LZMA_NUM_PB_STATES_MAX << kLenNumMidBits];
  CLzmaProb high[kLenNumHighSymbols];
};
24866 
// CLenEnc plus cached per-posState price tables.  counters[posState] is
// decremented on each encode (LenEnc_Encode2) and triggers a price-table
// refresh via LenPriceEnc_UpdateTable when it reaches zero.
struct CLenPriceEnc
{
  CLenEnc p;
  uint32_t prices[LZMA_NUM_PB_STATES_MAX][kLenNumSymbolsTotal];
  uint32_t tableSize;
  uint32_t counters[LZMA_NUM_PB_STATES_MAX];
};
24874 
// Binary range-encoder state: 64-bit `low` with the cache/cacheSize
// carry-propagation machinery (see RangeEnc_ShiftLow), a buffered output
// window [bufBase, bufLim) flushed to outStream, and a sticky error code.
struct CRangeEnc
{
  uint32_t range;       // current coding interval width
  uint8_t cache;        // byte held back until its carry is known
  uint64_t low;
  uint64_t cacheSize;   // number of pending bytes (cache + 0xFF fillers)
  uint8_t *buf;         // write cursor within the output buffer
  uint8_t *bufLim;
  uint8_t *bufBase;
  ISeqOutStream *outStream;
  uint64_t processed;   // bytes already flushed to outStream
  SRes res;             // sticky error from RangeEnc_FlushStream
};
24888 
// Snapshot of the encoder's adaptive probability models, rep distances and
// coder state, filled by LzmaEnc_SaveState and re-applied by
// LzmaEnc_RestoreState.  litProbs points at a separately allocated array of
// 0x300 << lclp probabilities (see those functions).
struct CSaveState
{
  CLzmaProb *litProbs;

  CLzmaProb isMatch[kNumStates][LZMA_NUM_PB_STATES_MAX];
  CLzmaProb isRep[kNumStates];
  CLzmaProb isRepG0[kNumStates];
  CLzmaProb isRepG1[kNumStates];
  CLzmaProb isRepG2[kNumStates];
  CLzmaProb isRep0Long[kNumStates][LZMA_NUM_PB_STATES_MAX];

  CLzmaProb posSlotEncoder[kNumLenToPosStates][1 << kNumPosSlotBits];
  CLzmaProb posEncoders[kNumFullDistances - kEndPosModelIndex];
  CLzmaProb posAlignEncoder[1 << kNumAlignBits];

  CLenPriceEnc lenEnc;
  CLenPriceEnc repLenEnc;

  uint32_t reps[LZMA_NUM_REPS];
  uint32_t state;
};
24910 
// Full LZMA encoder state, accessed through the opaque CLzmaEncHandle.
// Groups (top to bottom): match-finder interface and backing object,
// optimal-parse working set, price/lookup tables, the adaptive probability
// models (mirrored in CSaveState), the range encoder, and run bookkeeping.
struct CLzmaEnc
{
  // Match finder: vtable filled by MatchFinder_CreateVTable plus the object
  // the function pointers operate on.
  IMatchFinder matchFinder;
  void *matchFinderObj;

  CMatchFinder matchFinderBase;

  uint32_t optimumEndIndex;
  uint32_t optimumCurrentIndex;

  uint32_t longestMatchLength;
  uint32_t numPairs;
  uint32_t numAvail;      // bytes available at the current position
  COptimal opt[kNumOpts]; // optimal-parse lattice

  #ifndef LZMA_LOG_BSR
  // Distance -> position-slot lookup, filled by LzmaEnc_FastPosInit.
  uint8_t g_FastPos[1 << kNumLogBits];
  #endif

  uint32_t ProbPrices[kBitModelTotal >> kNumMoveReducingBits]; // bit-price table (LzmaEnc_InitPriceTables)
  uint32_t matches[LZMA_MATCH_LEN_MAX * 2 + 2 + 1];            // (len, dist-1) pairs from GetMatches
  uint32_t numFastBytes;
  uint32_t additionalOffset;  // how far the match finder has run ahead of the coder
  uint32_t reps[LZMA_NUM_REPS];
  uint32_t state;

  // Cached price tables, refreshed periodically.
  uint32_t posSlotPrices[kNumLenToPosStates][kDistTableSizeMax];
  uint32_t distancesPrices[kNumLenToPosStates][kNumFullDistances];
  uint32_t alignPrices[kAlignTableSize];
  uint32_t alignPriceCount;

  uint32_t distTableSize;

  unsigned lc, lp, pb;        // literal-context / literal-pos / pos bits
  unsigned lpMask, pbMask;

  // Adaptive probability models (same layout as CSaveState).
  CLzmaProb *litProbs;        // 0x300 << lclp literal probabilities

  CLzmaProb isMatch[kNumStates][LZMA_NUM_PB_STATES_MAX];
  CLzmaProb isRep[kNumStates];
  CLzmaProb isRepG0[kNumStates];
  CLzmaProb isRepG1[kNumStates];
  CLzmaProb isRepG2[kNumStates];
  CLzmaProb isRep0Long[kNumStates][LZMA_NUM_PB_STATES_MAX];

  CLzmaProb posSlotEncoder[kNumLenToPosStates][1 << kNumPosSlotBits];
  CLzmaProb posEncoders[kNumFullDistances - kEndPosModelIndex];
  CLzmaProb posAlignEncoder[1 << kNumAlignBits];

  CLenPriceEnc lenEnc;
  CLenPriceEnc repLenEnc;

  unsigned lclp;              // lc + lp

  bool fastMode;

  CRangeEnc rc;

  bool writeEndMark;
  uint64_t nowPos64;
  uint32_t matchPriceCount;
  bool finished;
  bool multiThread;

  SRes result;
  uint32_t dictSize;
  uint32_t matchFinderCycles;

  int needInit;

  CSaveState saveState;       // rollback snapshot (LzmaEnc_SaveState/RestoreState)
};
24983 
// Copies the complete adaptive-model state (probabilities, rep distances,
// coder state) into p->saveState so a later LzmaEnc_RestoreState can roll
// the encoder back.  litProbs is copied by element count (0x300 << lclp),
// not by struct assignment, because it is a separately allocated array.
// Must be kept field-for-field in sync with LzmaEnc_RestoreState.
void LzmaEnc_SaveState(CLzmaEncHandle pp)
{
  CLzmaEnc *p = (CLzmaEnc *)pp;
  CSaveState *dest = &p->saveState;
  int i;
  dest->lenEnc = p->lenEnc;
  dest->repLenEnc = p->repLenEnc;
  dest->state = p->state;

  for (i = 0; i < kNumStates; i++)
  {
    memcpy(dest->isMatch[i], p->isMatch[i], sizeof(p->isMatch[i]));
    memcpy(dest->isRep0Long[i], p->isRep0Long[i], sizeof(p->isRep0Long[i]));
  }
  for (i = 0; i < kNumLenToPosStates; i++)
    memcpy(dest->posSlotEncoder[i], p->posSlotEncoder[i], sizeof(p->posSlotEncoder[i]));
  memcpy(dest->isRep, p->isRep, sizeof(p->isRep));
  memcpy(dest->isRepG0, p->isRepG0, sizeof(p->isRepG0));
  memcpy(dest->isRepG1, p->isRepG1, sizeof(p->isRepG1));
  memcpy(dest->isRepG2, p->isRepG2, sizeof(p->isRepG2));
  memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders));
  memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder));
  memcpy(dest->reps, p->reps, sizeof(p->reps));
  memcpy(dest->litProbs, p->litProbs, (0x300 << p->lclp) * sizeof(CLzmaProb));
}
25009 
// Mirror image of LzmaEnc_SaveState: re-applies the snapshot held in
// saveState to the live encoder.  Note the deliberately swapped naming
// (dest = encoder, p = snapshot) so the memcpy block reads identically to
// the save path.
void LzmaEnc_RestoreState(CLzmaEncHandle pp)
{
  CLzmaEnc *dest = (CLzmaEnc *)pp;
  const CSaveState *p = &dest->saveState;
  int i;
  dest->lenEnc = p->lenEnc;
  dest->repLenEnc = p->repLenEnc;
  dest->state = p->state;

  for (i = 0; i < kNumStates; i++)
  {
    memcpy(dest->isMatch[i], p->isMatch[i], sizeof(p->isMatch[i]));
    memcpy(dest->isRep0Long[i], p->isRep0Long[i], sizeof(p->isRep0Long[i]));
  }
  for (i = 0; i < kNumLenToPosStates; i++)
    memcpy(dest->posSlotEncoder[i], p->posSlotEncoder[i], sizeof(p->posSlotEncoder[i]));
  memcpy(dest->isRep, p->isRep, sizeof(p->isRep));
  memcpy(dest->isRepG0, p->isRepG0, sizeof(p->isRepG0));
  memcpy(dest->isRepG1, p->isRepG1, sizeof(p->isRepG1));
  memcpy(dest->isRepG2, p->isRepG2, sizeof(p->isRepG2));
  memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders));
  memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder));
  memcpy(dest->reps, p->reps, sizeof(p->reps));
  memcpy(dest->litProbs, p->litProbs, (0x300 << dest->lclp) * sizeof(CLzmaProb));
}
25035 
// Validates and installs properties into the encoder.  The input is first
// normalized, then rejected with SZ_ERROR_PARAM if lc/lp/pb or dictSize
// exceed what this build can compress (kDicLogSizeMaxCompress is bounded by
// the fast-pos table size).
SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2)
{
  CLzmaEnc *p = (CLzmaEnc *)pp;
  CLzmaEncProps props = *props2;
  LzmaEncProps_Normalize(&props);

  if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX ||
      props.dictSize > ((uint32_t)1 << kDicLogSizeMaxCompress) || props.dictSize > ((uint32_t)1 << 30))
    return SZ_ERROR_PARAM;
  p->dictSize = props.dictSize;
  p->matchFinderCycles = props.mc;
  {
    // Clamp fast bytes into the encodable match-length range [5, 273].
    unsigned fb = props.fb;
    if (fb < 5)
      fb = 5;
    if (fb > LZMA_MATCH_LEN_MAX)
      fb = LZMA_MATCH_LEN_MAX;
    p->numFastBytes = fb;
  }
  p->lc = props.lc;
  p->lp = props.lp;
  p->pb = props.pb;
  p->fastMode = (props.algo == 0);
  p->matchFinderBase.btMode = props.btMode;
  {
    // Hash width: hash-chain mode always uses 4 bytes; binary-tree mode
    // honours a requested 2 or 3, defaulting to 4.
    uint32_t numHashBytes = 4;
    if (props.btMode)
    {
      if (props.numHashBytes < 2)
        numHashBytes = 2;
      else if (props.numHashBytes < 4)
        numHashBytes = props.numHashBytes;
    }
    p->matchFinderBase.numHashBytes = numHashBytes;
  }

  p->matchFinderBase.cutValue = props.mc;

  p->writeEndMark = props.writeEndMark;

  return SZ_OK;
}
25078 
// LZMA coder state-machine transition tables: the next of the 12 states
// after emitting a literal, a match, a rep-match, or a short rep from each
// current state (states < 7 are "literal" states, see IsCharState below).
static const int kLiteralNextStates[kNumStates] = {0, 0, 0, 0, 1, 2, 3, 4,  5,  6,   4, 5};
static const int kMatchNextStates[kNumStates]   = {7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10};
static const int kRepNextStates[kNumStates]     = {8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11};
static const int kShortRepNextStates[kNumStates]= {9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11};
25083 
25084 #define IsCharState(s) ((s) < 7)
25085 
25086 #define GetLenToPosState(len) (((len) < kNumLenToPosStates + 1) ? (len) - 2 : kNumLenToPosStates - 1)
25087 
25088 #define kInfinityPrice (1 << 30)
25089 
RangeEnc_Construct(CRangeEnc * p)25090 static void RangeEnc_Construct(CRangeEnc *p)
25091 {
25092   p->outStream = 0;
25093   p->bufBase = 0;
25094 }
25095 
25096 #define RangeEnc_GetProcessed(p) ((p)->processed + ((p)->buf - (p)->bufBase) + (p)->cacheSize)
25097 
25098 #define RC_BUF_SIZE (1 << 16)
// Lazily allocates the 64 KiB output buffer; idempotent (keeps an existing
// buffer).  Returns 1 on success, 0 on allocation failure.
static int RangeEnc_Alloc(CRangeEnc *p, ISzAlloc *alloc)
{
  if (p->bufBase == 0)
  {
    p->bufBase = (uint8_t *)alloc->Alloc(alloc, RC_BUF_SIZE);
    if (p->bufBase == 0)
      return 0;
    p->bufLim = p->bufBase + RC_BUF_SIZE;
  }
  return 1;
}
25110 
// Releases the output buffer; resetting bufBase to 0 keeps the function
// (and a later RangeEnc_Alloc) safe to call again.
static void RangeEnc_Free(CRangeEnc *p, ISzAlloc *alloc)
{
  alloc->Free(alloc, p->bufBase);
  p->bufBase = 0;
}
25116 
// Resets the range coder to its canonical start state (full range, empty
// low accumulator, one pending cache byte) and rewinds the output buffer.
// Requires RangeEnc_Alloc to have succeeded (buf = bufBase).
static void RangeEnc_Init(CRangeEnc *p)
{
  /* Stream.Init(); */
  p->low = 0;
  p->range = 0xFFFFFFFF;
  p->cacheSize = 1;
  p->cache = 0;

  p->buf = p->bufBase;

  p->processed = 0;
  p->res = SZ_OK;
}
25130 
// Writes the buffered bytes to the output stream and rewinds the buffer.
// A short write latches SZ_ERROR_WRITE in p->res; once an error is set the
// function becomes a no-op, so later encoding work is silently discarded.
static void RangeEnc_FlushStream(CRangeEnc *p)
{
  size_t num;
  if (p->res != SZ_OK)
    return;
  num = p->buf - p->bufBase;
  if (num != p->outStream->Write(p->outStream, p->bufBase, num))
    p->res = SZ_ERROR_WRITE;
  p->processed += num;
  p->buf = p->bufBase;
}
25142 
// Shifts one byte out of the 33-bit `low` accumulator with classic range
// coder carry handling: a byte equal to 0xFF cannot be emitted yet (a later
// carry could still increment it), so it is counted in cacheSize and the
// run is flushed only once a non-0xFF byte or a carry (bit 32 of low)
// resolves it.
static void RangeEnc_ShiftLow(CRangeEnc *p)
{
  if ((uint32_t)p->low < (uint32_t)0xFF000000 || (int)(p->low >> 32) != 0)
  {
    // Carry is now decided: emit the cached byte plus the carry bit, then
    // cacheSize-1 filler bytes (0xFF + carry).
    uint8_t temp = p->cache;
    do
    {
      uint8_t *buf = p->buf;
      *buf++ = (uint8_t)(temp + (uint8_t)(p->low >> 32));
      p->buf = buf;
      if (buf == p->bufLim)
        RangeEnc_FlushStream(p);
      temp = 0xFF;
    }
    while (--p->cacheSize != 0);
    p->cache = (uint8_t)((uint32_t)p->low >> 24);   // hold back the new top byte
  }
  p->cacheSize++;
  p->low = (uint32_t)p->low << 8;   // drop the shifted byte, keep 32 bits
}
25163 
RangeEnc_FlushData(CRangeEnc * p)25164 static void RangeEnc_FlushData(CRangeEnc *p)
25165 {
25166   int i;
25167   for (i = 0; i < 5; i++)
25168     RangeEnc_ShiftLow(p);
25169 }
25170 
// Encodes `numBits` bits of `value` (MSB first) with fixed probability 1/2:
// each bit simply halves the range, adding the half to `low` when the bit
// is 1, renormalizing whenever range drops below kTopValue.
// Precondition: numBits >= 1 (do/while decrements before testing).
static void RangeEnc_EncodeDirectBits(CRangeEnc *p, uint32_t value, int numBits)
{
  do
  {
    p->range >>= 1;
    // Branch-free: mask is all-ones exactly when the current bit is 1.
    p->low += p->range & (0 - ((value >> --numBits) & 1));
    if (p->range < kTopValue)
    {
      p->range <<= 8;
      RangeEnc_ShiftLow(p);
    }
  }
  while (numBits != 0);
}
25185 
// Encodes one bit with the adaptive 11-bit probability model *prob.  The
// interval is split proportionally to P(bit = 0); the model is then nudged
// towards the observed bit by 1/2^kNumMoveBits and the range renormalized.
static void RangeEnc_EncodeBit(CRangeEnc *p, CLzmaProb *prob, uint32_t symbol)
{
  uint32_t ttt = *prob;
  uint32_t newBound = (p->range >> kNumBitModelTotalBits) * ttt;
  if (symbol == 0)
  {
    p->range = newBound;                          // take the low sub-interval
    ttt += (kBitModelTotal - ttt) >> kNumMoveBits;  // move probability towards 0
  }
  else
  {
    p->low += newBound;                           // take the high sub-interval
    p->range -= newBound;
    ttt -= ttt >> kNumMoveBits;                   // move probability towards 1
  }
  *prob = (CLzmaProb)ttt;
  if (p->range < kTopValue)
  {
    p->range <<= 8;
    RangeEnc_ShiftLow(p);
  }
}
25208 
// Encodes an 8-bit literal as a bit-tree walk.  OR-ing in 0x100 plants a
// sentinel bit so the loop runs exactly 8 times; (symbol >> 8) is the
// 1-prefixed context of already-coded bits, indexing probs like a binary
// tree.
static void LitEnc_Encode(CRangeEnc *p, CLzmaProb *probs, uint32_t symbol)
{
  symbol |= 0x100;
  do
  {
    RangeEnc_EncodeBit(p, probs + (symbol >> 8), (symbol >> 7) & 1);
    symbol <<= 1;
  }
  while (symbol < 0x10000);
}
25219 
// Literal coding in "matched" mode (used after a match): while the byte at
// the last match distance agrees with the bits coded so far, its next bit
// selects one of two extra probability sets (offs != 0); on the first
// mismatch offs collapses to 0 and coding continues with the plain set.
static void LitEnc_EncodeMatched(CRangeEnc *p, CLzmaProb *probs, uint32_t symbol, uint32_t matchByte)
{
  uint32_t offs = 0x100;
  symbol |= 0x100;
  do
  {
    matchByte <<= 1;
    RangeEnc_EncodeBit(p, probs + (offs + (matchByte & offs) + (symbol >> 8)), (symbol >> 7) & 1);
    symbol <<= 1;
    offs &= ~(matchByte ^ symbol);   // stays 0x100 only while bits agree
  }
  while (symbol < 0x10000);
}
25233 
// Precomputes the bit-price table used by the GET_PRICE* macros: for each
// quantized probability (midpoints of 1 << kNumMoveReducingBits buckets) it
// approximates -log2(prob) in 1/16-bit units via kCyclesBits rounds of
// squaring-and-normalizing (each round doubles the accumulated bit count).
void LzmaEnc_InitPriceTables(uint32_t *ProbPrices)
{
  uint32_t i;
  for (i = (1 << kNumMoveReducingBits) / 2; i < kBitModelTotal; i += (1 << kNumMoveReducingBits))
  {
    const int kCyclesBits = kNumBitPriceShiftBits;
    uint32_t w = i;
    uint32_t bitCount = 0;
    int j;
    for (j = 0; j < kCyclesBits; j++)
    {
      w = w * w;            // squaring doubles the exponent being measured
      bitCount <<= 1;
      while (w >= ((uint32_t)1 << 16))
      {
        w >>= 1;            // renormalize to 16 bits, counting shifts
        bitCount++;
      }
    }
    ProbPrices[i >> kNumMoveReducingBits] = ((kNumBitModelTotalBits << kCyclesBits) - 15 - bitCount);
  }
}
25256 
25257 #define GET_PRICE(prob, symbol) \
25258   p->ProbPrices[((prob) ^ (((-(int)(symbol))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits];
25259 
25260 #define GET_PRICEa(prob, symbol) \
25261   ProbPrices[((prob) ^ ((-((int)(symbol))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits];
25262 
25263 #define GET_PRICE_0(prob) p->ProbPrices[(prob) >> kNumMoveReducingBits]
25264 #define GET_PRICE_1(prob) p->ProbPrices[((prob) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits]
25265 
25266 #define GET_PRICE_0a(prob) ProbPrices[(prob) >> kNumMoveReducingBits]
25267 #define GET_PRICE_1a(prob) ProbPrices[((prob) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits]
25268 
// Price (in ProbPrices units) of coding an 8-bit literal through the bit
// tree — the costing counterpart of LitEnc_Encode, using the same
// sentinel-driven traversal.
static uint32_t LitEnc_GetPrice(const CLzmaProb *probs, uint32_t symbol, uint32_t *ProbPrices)
{
  uint32_t price = 0;
  symbol |= 0x100;
  do
  {
    price += GET_PRICEa(probs[symbol >> 8], (symbol >> 7) & 1);
    symbol <<= 1;
  }
  while (symbol < 0x10000);
  return price;
}
25281 
// Price of coding a literal in "matched" mode — the costing counterpart of
// LitEnc_EncodeMatched, tracking the same offs agreement mask.
static uint32_t LitEnc_GetPriceMatched(const CLzmaProb *probs, uint32_t symbol, uint32_t matchByte, uint32_t *ProbPrices)
{
  uint32_t price = 0;
  uint32_t offs = 0x100;
  symbol |= 0x100;
  do
  {
    matchByte <<= 1;
    price += GET_PRICEa(probs[offs + (matchByte & offs) + (symbol >> 8)], (symbol >> 7) & 1);
    symbol <<= 1;
    offs &= ~(matchByte ^ symbol);
  }
  while (symbol < 0x10000);
  return price;
}
25297 
RcTree_Encode(CRangeEnc * rc,CLzmaProb * probs,int numBitLevels,uint32_t symbol)25298 static void RcTree_Encode(CRangeEnc *rc, CLzmaProb *probs, int numBitLevels, uint32_t symbol)
25299 {
25300   uint32_t m = 1;
25301   int i;
25302   for (i = numBitLevels; i != 0;)
25303   {
25304     uint32_t bit;
25305     i--;
25306     bit = (symbol >> i) & 1;
25307     RangeEnc_EncodeBit(rc, probs + m, bit);
25308     m = (m << 1) | bit;
25309   }
25310 }
25311 
// Encodes `symbol` low-bit-first through a bit tree; LZMA stores distance
// alignment bits in this reversed order (see posAlignEncoder usage).
static void RcTree_ReverseEncode(CRangeEnc *rc, CLzmaProb *probs, int numBitLevels, uint32_t symbol)
{
  uint32_t m = 1;
  int i;
  for (i = 0; i < numBitLevels; i++)
  {
    uint32_t bit = symbol & 1;
    RangeEnc_EncodeBit(rc, probs + m, bit);
    m = (m << 1) | bit;
    symbol >>= 1;
  }
}
25324 
// Price of encoding `symbol` MSB-first through the bit tree.  The sentinel
// bit (1 << numBitLevels) lets the loop walk the tree bottom-up: at each
// step probs[symbol >> 1] is the parent node and (symbol & 1) the bit taken.
static uint32_t RcTree_GetPrice(const CLzmaProb *probs, int numBitLevels, uint32_t symbol, uint32_t *ProbPrices)
{
  uint32_t price = 0;
  symbol |= (1 << numBitLevels);
  while (symbol != 1)
  {
    price += GET_PRICEa(probs[symbol >> 1], symbol & 1);
    symbol >>= 1;
  }
  return price;
}
25336 
// Price counterpart of RcTree_ReverseEncode: walks the tree top-down while
// consuming `symbol` low-bit-first.
static uint32_t RcTree_ReverseGetPrice(const CLzmaProb *probs, int numBitLevels, uint32_t symbol, uint32_t *ProbPrices)
{
  uint32_t price = 0;
  uint32_t m = 1;
  int i;
  for (i = numBitLevels; i != 0; i--)
  {
    uint32_t bit = symbol & 1;
    symbol >>= 1;
    price += GET_PRICEa(probs[m], bit);
    m = (m << 1) | bit;
  }
  return price;
}
25351 
// Resets every probability in the length coder to the neutral value
// kProbInitValue (i.e. P = 1/2).
static void LenEnc_Init(CLenEnc *p)
{
  unsigned i;
  p->choice = p->choice2 = kProbInitValue;
  for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumLowBits); i++)
    p->low[i] = kProbInitValue;
  for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumMidBits); i++)
    p->mid[i] = kProbInitValue;
  for (i = 0; i < kLenNumHighSymbols; i++)
    p->high[i] = kProbInitValue;
}
25363 
// Encodes a 0-based length symbol (length - LZMA_MATCH_LEN_MIN):
//   choice = 0            -> low tree,  symbols [0, 8),  indexed by posState
//   choice = 1, choice2=0 -> mid tree,  symbols [8, 16), indexed by posState
//   choice = 1, choice2=1 -> high tree, symbols [16, 272)
static void LenEnc_Encode(CLenEnc *p, CRangeEnc *rc, uint32_t symbol, uint32_t posState)
{
  if (symbol < kLenNumLowSymbols)
  {
    RangeEnc_EncodeBit(rc, &p->choice, 0);
    RcTree_Encode(rc, p->low + (posState << kLenNumLowBits), kLenNumLowBits, symbol);
  }
  else
  {
    RangeEnc_EncodeBit(rc, &p->choice, 1);
    if (symbol < kLenNumLowSymbols + kLenNumMidSymbols)
    {
      RangeEnc_EncodeBit(rc, &p->choice2, 0);
      RcTree_Encode(rc, p->mid + (posState << kLenNumMidBits), kLenNumMidBits, symbol - kLenNumLowSymbols);
    }
    else
    {
      RangeEnc_EncodeBit(rc, &p->choice2, 1);
      RcTree_Encode(rc, p->high, kLenNumHighBits, symbol - kLenNumLowSymbols - kLenNumMidSymbols);
    }
  }
}
25386 
// Recomputes prices[0 .. numSymbols) for one posState.  Each symbol's price
// is the cost of the choice bit(s) selecting its range (a0 = low, b0 = mid,
// b1 = high) plus the price of its sub-tree; returns early once numSymbols
// entries are filled.
static void LenEnc_SetPrices(CLenEnc *p, uint32_t posState, uint32_t numSymbols, uint32_t *prices, uint32_t *ProbPrices)
{
  uint32_t a0 = GET_PRICE_0a(p->choice);
  uint32_t a1 = GET_PRICE_1a(p->choice);
  uint32_t b0 = a1 + GET_PRICE_0a(p->choice2);
  uint32_t b1 = a1 + GET_PRICE_1a(p->choice2);
  uint32_t i = 0;
  for (i = 0; i < kLenNumLowSymbols; i++)
  {
    if (i >= numSymbols)
      return;
    prices[i] = a0 + RcTree_GetPrice(p->low + (posState << kLenNumLowBits), kLenNumLowBits, i, ProbPrices);
  }
  for (; i < kLenNumLowSymbols + kLenNumMidSymbols; i++)
  {
    if (i >= numSymbols)
      return;
    prices[i] = b0 + RcTree_GetPrice(p->mid + (posState << kLenNumMidBits), kLenNumMidBits, i - kLenNumLowSymbols, ProbPrices);
  }
  for (; i < numSymbols; i++)
    prices[i] = b1 + RcTree_GetPrice(p->high, kLenNumHighBits, i - kLenNumLowSymbols - kLenNumMidSymbols, ProbPrices);
}
25409 
// Refreshes the cached price table for one posState and re-arms its counter
// so the next refresh happens after tableSize encodes (see LenEnc_Encode2).
static void LenPriceEnc_UpdateTable(CLenPriceEnc *p, uint32_t posState, uint32_t *ProbPrices)
{
  LenEnc_SetPrices(&p->p, posState, p->tableSize, p->prices[posState], ProbPrices);
  p->counters[posState] = p->tableSize;
}
25415 
// Refreshes the cached price tables for all position states at once (used
// after (re)initialization).
static void LenPriceEnc_UpdateTables(CLenPriceEnc *p, uint32_t numPosStates, uint32_t *ProbPrices)
{
  uint32_t posState;
  for (posState = 0; posState < numPosStates; posState++)
    LenPriceEnc_UpdateTable(p, posState, ProbPrices);
}
25422 
/* Encodes a length symbol and, when price updates are enabled (non-fast mode),
   decrements the posState counter, refreshing its price table at zero. */
static void LenEnc_Encode2(CLenPriceEnc *p, CRangeEnc *rc, uint32_t symbol, uint32_t posState, bool updatePrice, uint32_t *ProbPrices)
{
  LenEnc_Encode(&p->p, rc, symbol, posState);
  if (!updatePrice)
    return;
  if (--p->counters[posState] == 0)
    LenPriceEnc_UpdateTable(p, posState, ProbPrices);
}
25430 
/* Skips `num` input bytes in the match finder, tracking how far the encoder
   has read ahead of the position being coded (additionalOffset). */
static void MovePos(CLzmaEnc *p, uint32_t num)
{
  if (num == 0)
    return;
  p->additionalOffset += num;
  p->matchFinder.Skip(p->matchFinderObj, num);
}
25439 
/* Advances the match finder one byte, filling p->matches with (length, distance)
   pairs for the current position.  Returns the longest match length found and
   stores the number of pair entries in *numDistancePairsRes. */
static uint32_t ReadMatchDistances(CLzmaEnc *p, uint32_t *numDistancePairsRes)
{
  uint32_t lenRes = 0, numPairs;
  p->numAvail = p->matchFinder.GetNumAvailableBytes(p->matchFinderObj);
  numPairs = p->matchFinder.GetMatches(p->matchFinderObj, p->matches);
  if (numPairs > 0)
  {
    /* matches[] is a flat list of pairs; the last pair is the longest match. */
    lenRes = p->matches[numPairs - 2];
    if (lenRes == p->numFastBytes)
    {
      /* The match finder stops searching at numFastBytes; extend the longest
         match by direct byte comparison, capped at LZMA_MATCH_LEN_MAX. */
      const uint8_t *pby = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;
      uint32_t distance = p->matches[numPairs - 1] + 1;
      uint32_t numAvail = p->numAvail;
      if (numAvail > LZMA_MATCH_LEN_MAX)
        numAvail = LZMA_MATCH_LEN_MAX;
      {
        const uint8_t *pby2 = pby - distance;
        for (; lenRes < numAvail && pby[lenRes] == pby2[lenRes]; lenRes++);
      }
    }
  }
  /* One more byte consumed from the match finder than has been encoded. */
  p->additionalOffset++;
  *numDistancePairsRes = numPairs;
  return lenRes;
}
25465 
/* Mark an optimal-parse node as a plain literal (backPrev = -1, no literal prefix). */
#define MakeAsChar(p) (p)->backPrev = (uint32_t)(-1); (p)->prev1IsChar = false;
/* Mark an optimal-parse node as a length-1 rep0 match ("short rep"). */
#define MakeAsShortRep(p) (p)->backPrev = 0; (p)->prev1IsChar = false;
/* True when the node was stored via MakeAsShortRep (backPrev == 0). */
#define IsShortRep(p) ((p)->backPrev == 0)
25469 
/* Price of a length-1 rep0 match ("short rep"): the cost of signalling
   rep-distance 0 plus the cost of the "length is 1" flag. */
static uint32_t GetRepLen1Price(CLzmaEnc *p, uint32_t state, uint32_t posState)
{
  uint32_t price = GET_PRICE_0(p->isRepG0[state]);
  price += GET_PRICE_0(p->isRep0Long[state][posState]);
  return price;
}
25476 
/* Price of selecting rep-distance slot `repIndex` (0..3) — the isRepG0/G1/G2
   decision bits — excluding the length price. */
static uint32_t GetPureRepPrice(CLzmaEnc *p, uint32_t repIndex, uint32_t state, uint32_t posState)
{
  if (repIndex == 0)
  {
    /* rep0 with length > 1: isRepG0=0 then isRep0Long=1. */
    return GET_PRICE_0(p->isRepG0[state]) + GET_PRICE_1(p->isRep0Long[state][posState]);
  }
  {
    uint32_t price = GET_PRICE_1(p->isRepG0[state]);
    if (repIndex == 1)
      return price + GET_PRICE_0(p->isRepG1[state]);
    /* repIndex 2 or 3: isRepG1=1, then isRepG2 selects between them. */
    price += GET_PRICE_1(p->isRepG1[state]);
    return price + GET_PRICE(p->isRepG2[state], repIndex - 2);
  }
}
25498 
/* Full price of a rep match: length price (from the rep length coder) plus
   the rep-slot selection price. */
static uint32_t GetRepPrice(CLzmaEnc *p, uint32_t repIndex, uint32_t len, uint32_t state, uint32_t posState)
{
  uint32_t lenPrice = p->repLenEnc.prices[posState][len - LZMA_MATCH_LEN_MIN];
  return lenPrice + GetPureRepPrice(p, repIndex, state, posState);
}
25504 
/* Walks the optimal-parse node chain backwards from `cur`, reversing the
   posPrev/backPrev links in place so the chosen path can be replayed forwards.
   Sets *backRes to the first decision's backPrev and returns the index of the
   first node boundary (stored in optimumCurrentIndex). */
static uint32_t Backward(CLzmaEnc *p, uint32_t *backRes, uint32_t cur)
{
  uint32_t posMem = p->opt[cur].posPrev;
  uint32_t backMem = p->opt[cur].backPrev;
  p->optimumEndIndex = cur;
  do
  {
    if (p->opt[cur].prev1IsChar)
    {
      /* The step into `cur` was "literal (+ optional match) then match":
         materialize the implicit literal node at posMem. */
      MakeAsChar(&p->opt[posMem])
      p->opt[posMem].posPrev = posMem - 1;
      if (p->opt[cur].prev2)
      {
        /* Two-step prefix: the node before the literal was a match/rep. */
        p->opt[posMem - 1].prev1IsChar = false;
        p->opt[posMem - 1].posPrev = p->opt[cur].posPrev2;
        p->opt[posMem - 1].backPrev = p->opt[cur].backPrev2;
      }
    }
    {
      /* Reverse one link: predecessor now points forward to `cur`. */
      uint32_t posPrev = posMem;
      uint32_t backCur = backMem;

      backMem = p->opt[posPrev].backPrev;
      posMem = p->opt[posPrev].posPrev;

      p->opt[posPrev].backPrev = backCur;
      p->opt[posPrev].posPrev = cur;
      cur = posPrev;
    }
  }
  while (cur != 0);
  *backRes = p->opt[0].backPrev;
  p->optimumCurrentIndex  = p->opt[0].posPrev;
  return p->optimumCurrentIndex;
}
25540 
/* Selects the literal-coder probability table for a position and previous
   byte: context = (pos masked by lp bits) concatenated with the top lc bits
   of prevByte; each table holds 0x300 probabilities. */
#define LIT_PROBS(pos, prevByte) (p->litProbs + ((((pos) & p->lpMask) << p->lc) + ((prevByte) >> (8 - p->lc))) * 0x300)
25542 
/* Optimal-parse match selection (normal mode).  Performs a forward dynamic
   program over p->opt[] (price = cheapest known bit cost to reach each
   position), considering literals, short reps, rep matches, normal matches,
   and combined "X + literal + rep0" steps.  Returns the length of the next
   unit to emit and stores its distance code in *backRes
   (-1 = literal, 0..3 = rep index, >=4 = distance + LZMA_NUM_REPS).
   Previously computed decisions are replayed from opt[] until exhausted. */
static uint32_t GetOptimum(CLzmaEnc *p, uint32_t position, uint32_t *backRes)
{
  uint32_t numAvail, mainLen, numPairs, repMaxIndex, i, posState, lenEnd, len, cur;
  uint32_t matchPrice, repMatchPrice, normalMatchPrice;
  uint32_t reps[LZMA_NUM_REPS], repLens[LZMA_NUM_REPS];
  uint32_t *matches;
  const uint8_t *data;
  uint8_t curByte, matchByte;
  /* Replay a decision already computed by a previous call. */
  if (p->optimumEndIndex != p->optimumCurrentIndex)
  {
    const COptimal *opt = &p->opt[p->optimumCurrentIndex];
    uint32_t lenRes = opt->posPrev - p->optimumCurrentIndex;
    *backRes = opt->backPrev;
    p->optimumCurrentIndex = opt->posPrev;
    return lenRes;
  }
  p->optimumCurrentIndex = p->optimumEndIndex = 0;

  if (p->additionalOffset == 0)
    mainLen = ReadMatchDistances(p, &numPairs);
  else
  {
    /* Matches for this position were already read by a previous call. */
    mainLen = p->longestMatchLength;
    numPairs = p->numPairs;
  }

  numAvail = p->numAvail;
  if (numAvail < 2)
  {
    /* Not enough input for any match: emit a literal. */
    *backRes = (uint32_t)(-1);
    return 1;
  }
  if (numAvail > LZMA_MATCH_LEN_MAX)
    numAvail = LZMA_MATCH_LEN_MAX;

  /* Measure how far each of the 4 rep distances matches at this position. */
  data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;
  repMaxIndex = 0;
  for (i = 0; i < LZMA_NUM_REPS; i++)
  {
    uint32_t lenTest;
    const uint8_t *data2;
    reps[i] = p->reps[i];
    data2 = data - (reps[i] + 1);
    if (data[0] != data2[0] || data[1] != data2[1])
    {
      repLens[i] = 0;
      continue;
    }
    for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++);
    repLens[i] = lenTest;
    if (lenTest > repLens[repMaxIndex])
      repMaxIndex = i;
  }
  /* Fast exits: a long-enough rep or normal match is taken immediately. */
  if (repLens[repMaxIndex] >= p->numFastBytes)
  {
    uint32_t lenRes;
    *backRes = repMaxIndex;
    lenRes = repLens[repMaxIndex];
    MovePos(p, lenRes - 1);
    return lenRes;
  }

  matches = p->matches;
  if (mainLen >= p->numFastBytes)
  {
    *backRes = matches[numPairs - 1] + LZMA_NUM_REPS;
    MovePos(p, mainLen - 1);
    return mainLen;
  }
  curByte = *data;
  matchByte = *(data - (reps[0] + 1));

  if (mainLen < 2 && curByte != matchByte && repLens[repMaxIndex] < 2)
  {
    /* Nothing worth matching: literal. */
    *backRes = (uint32_t)-1;
    return 1;
  }

  /* Seed the DP: node 0 is the current state, node 1 the literal step. */
  p->opt[0].state = (CState)p->state;

  posState = (position & p->pbMask);

  {
    const CLzmaProb *probs = LIT_PROBS(position, *(data - 1));
    p->opt[1].price = GET_PRICE_0(p->isMatch[p->state][posState]) +
        (!IsCharState(p->state) ?
          LitEnc_GetPriceMatched(probs, curByte, matchByte, p->ProbPrices) :
          LitEnc_GetPrice(probs, curByte, p->ProbPrices));
  }

  MakeAsChar(&p->opt[1]);

  matchPrice = GET_PRICE_1(p->isMatch[p->state][posState]);
  repMatchPrice = matchPrice + GET_PRICE_1(p->isRep[p->state]);

  if (matchByte == curByte)
  {
    /* A short rep may be cheaper than coding the literal. */
    uint32_t shortRepPrice = repMatchPrice + GetRepLen1Price(p, p->state, posState);
    if (shortRepPrice < p->opt[1].price)
    {
      p->opt[1].price = shortRepPrice;
      MakeAsShortRep(&p->opt[1]);
    }
  }
  lenEnd = ((mainLen >= repLens[repMaxIndex]) ? mainLen : repLens[repMaxIndex]);

  if (lenEnd < 2)
  {
    *backRes = p->opt[1].backPrev;
    return 1;
  }

  p->opt[1].posPrev = 0;
  for (i = 0; i < LZMA_NUM_REPS; i++)
    p->opt[0].backs[i] = reps[i];

  /* Initialize all candidate end positions to "unreached". */
  len = lenEnd;
  do
    p->opt[len--].price = kInfinityPrice;
  while (len >= 2);

  /* Seed rep-match candidates of every length from 2 up to each repLen. */
  for (i = 0; i < LZMA_NUM_REPS; i++)
  {
    uint32_t repLen = repLens[i];
    uint32_t price;
    if (repLen < 2)
      continue;
    price = repMatchPrice + GetPureRepPrice(p, i, p->state, posState);
    do
    {
      uint32_t curAndLenPrice = price + p->repLenEnc.prices[posState][repLen - 2];
      COptimal *opt = &p->opt[repLen];
      if (curAndLenPrice < opt->price)
      {
        opt->price = curAndLenPrice;
        opt->posPrev = 0;
        opt->backPrev = i;
        opt->prev1IsChar = false;
      }
    }
    while (--repLen >= 2);
  }

  normalMatchPrice = matchPrice + GET_PRICE_0(p->isRep[p->state]);

  /* Seed normal-match candidates from the match finder's pairs. */
  len = ((repLens[0] >= 2) ? repLens[0] + 1 : 2);
  if (len <= mainLen)
  {
    uint32_t offs = 0;
    while (len > matches[offs])
      offs += 2;
    for (; ; len++)
    {
      COptimal *opt;
      uint32_t distance = matches[offs + 1];

      uint32_t curAndLenPrice = normalMatchPrice + p->lenEnc.prices[posState][len - LZMA_MATCH_LEN_MIN];
      uint32_t lenToPosState = GetLenToPosState(len);
      if (distance < kNumFullDistances)
        curAndLenPrice += p->distancesPrices[lenToPosState][distance];
      else
      {
        uint32_t slot;
        GetPosSlot2(distance, slot);
        curAndLenPrice += p->alignPrices[distance & kAlignMask] + p->posSlotPrices[lenToPosState][slot];
      }
      opt = &p->opt[len];
      if (curAndLenPrice < opt->price)
      {
        opt->price = curAndLenPrice;
        opt->posPrev = 0;
        opt->backPrev = distance + LZMA_NUM_REPS;
        opt->prev1IsChar = false;
      }
      if (len == matches[offs])
      {
        offs += 2;
        if (offs == numPairs)
          break;
      }
    }
  }

  cur = 0;

  /* Main DP loop: advance position by position, relaxing reachable nodes. */
  for (;;)
  {
    uint32_t numAvailFull, newLen, numPairs, posPrev, state, posState, startLen;
    uint32_t curPrice, curAnd1Price, matchPrice, repMatchPrice;
    bool nextIsChar;
    uint8_t curByte, matchByte;
    const uint8_t *data;
    COptimal *curOpt;
    COptimal *nextOpt;

    cur++;
    if (cur == lenEnd)
      return Backward(p, backRes, cur);

    newLen = ReadMatchDistances(p, &numPairs);
    if (newLen >= p->numFastBytes)
    {
      /* Long match found: stop extending the DP and commit the path so far. */
      p->numPairs = numPairs;
      p->longestMatchLength = newLen;
      return Backward(p, backRes, cur);
    }
    position++;
    /* Reconstruct the coder state and rep distances at node `cur` by
       examining how the cheapest path arrived here. */
    curOpt = &p->opt[cur];
    posPrev = curOpt->posPrev;
    if (curOpt->prev1IsChar)
    {
      posPrev--;
      if (curOpt->prev2)
      {
        state = p->opt[curOpt->posPrev2].state;
        if (curOpt->backPrev2 < LZMA_NUM_REPS)
          state = kRepNextStates[state];
        else
          state = kMatchNextStates[state];
      }
      else
        state = p->opt[posPrev].state;
      state = kLiteralNextStates[state];
    }
    else
      state = p->opt[posPrev].state;
    if (posPrev == cur - 1)
    {
      /* Single-byte step: either a short rep or a literal. */
      if (IsShortRep(curOpt))
        state = kShortRepNextStates[state];
      else
        state = kLiteralNextStates[state];
    }
    else
    {
      uint32_t pos;
      const COptimal *prevOpt;
      if (curOpt->prev1IsChar && curOpt->prev2)
      {
        posPrev = curOpt->posPrev2;
        pos = curOpt->backPrev2;
        state = kRepNextStates[state];
      }
      else
      {
        pos = curOpt->backPrev;
        if (pos < LZMA_NUM_REPS)
          state = kRepNextStates[state];
        else
          state = kMatchNextStates[state];
      }
      prevOpt = &p->opt[posPrev];
      if (pos < LZMA_NUM_REPS)
      {
        /* Rep match: move the used rep distance to the front of reps[]. */
        uint32_t i;
        reps[0] = prevOpt->backs[pos];
        for (i = 1; i <= pos; i++)
          reps[i] = prevOpt->backs[i - 1];
        for (; i < LZMA_NUM_REPS; i++)
          reps[i] = prevOpt->backs[i];
      }
      else
      {
        /* Normal match: its distance becomes rep0, others shift down. */
        uint32_t i;
        reps[0] = (pos - LZMA_NUM_REPS);
        for (i = 1; i < LZMA_NUM_REPS; i++)
          reps[i] = prevOpt->backs[i - 1];
      }
    }
    curOpt->state = (CState)state;

    curOpt->backs[0] = reps[0];
    curOpt->backs[1] = reps[1];
    curOpt->backs[2] = reps[2];
    curOpt->backs[3] = reps[3];

    curPrice = curOpt->price;
    nextIsChar = false;
    data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;
    curByte = *data;
    matchByte = *(data - (reps[0] + 1));

    posState = (position & p->pbMask);

    /* Relax node cur+1 via a literal step. */
    curAnd1Price = curPrice + GET_PRICE_0(p->isMatch[state][posState]);
    {
      const CLzmaProb *probs = LIT_PROBS(position, *(data - 1));
      curAnd1Price +=
        (!IsCharState(state) ?
          LitEnc_GetPriceMatched(probs, curByte, matchByte, p->ProbPrices) :
          LitEnc_GetPrice(probs, curByte, p->ProbPrices));
    }

    nextOpt = &p->opt[cur + 1];

    if (curAnd1Price < nextOpt->price)
    {
      nextOpt->price = curAnd1Price;
      nextOpt->posPrev = cur;
      MakeAsChar(nextOpt);
      nextIsChar = true;
    }

    matchPrice = curPrice + GET_PRICE_1(p->isMatch[state][posState]);
    repMatchPrice = matchPrice + GET_PRICE_1(p->isRep[state]);

    /* Relax node cur+1 via a short rep (unless an equal short rep exists). */
    if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0))
    {
      uint32_t shortRepPrice = repMatchPrice + GetRepLen1Price(p, state, posState);
      if (shortRepPrice <= nextOpt->price)
      {
        nextOpt->price = shortRepPrice;
        nextOpt->posPrev = cur;
        MakeAsShortRep(nextOpt);
        nextIsChar = true;
      }
    }
    numAvailFull = p->numAvail;
    {
      /* Do not extend past the opt[] table capacity. */
      uint32_t temp = kNumOpts - 1 - cur;
      if (temp < numAvailFull)
        numAvailFull = temp;
    }

    if (numAvailFull < 2)
      continue;
    numAvail = (numAvailFull <= p->numFastBytes ? numAvailFull : p->numFastBytes);

    if (!nextIsChar && matchByte != curByte) /* speed optimization */
    {
      /* try Literal + rep0 */
      uint32_t temp;
      uint32_t lenTest2;
      const uint8_t *data2 = data - (reps[0] + 1);
      uint32_t limit = p->numFastBytes + 1;
      if (limit > numAvailFull)
        limit = numAvailFull;

      for (temp = 1; temp < limit && data[temp] == data2[temp]; temp++);
      lenTest2 = temp - 1;
      if (lenTest2 >= 2)
      {
        uint32_t state2 = kLiteralNextStates[state];
        uint32_t posStateNext = (position + 1) & p->pbMask;
        uint32_t nextRepMatchPrice = curAnd1Price +
            GET_PRICE_1(p->isMatch[state2][posStateNext]) +
            GET_PRICE_1(p->isRep[state2]);
        /* for (; lenTest2 >= 2; lenTest2--) */
        {
          uint32_t curAndLenPrice;
          COptimal *opt;
          uint32_t offset = cur + 1 + lenTest2;
          while (lenEnd < offset)
            p->opt[++lenEnd].price = kInfinityPrice;
          curAndLenPrice = nextRepMatchPrice + GetRepPrice(p, 0, lenTest2, state2, posStateNext);
          opt = &p->opt[offset];
          if (curAndLenPrice < opt->price)
          {
            opt->price = curAndLenPrice;
            opt->posPrev = cur + 1;
            opt->backPrev = 0;
            opt->prev1IsChar = true;
            opt->prev2 = false;
          }
        }
      }
    }

    startLen = 2; /* speed optimization */
    {
    /* Relax nodes reachable via rep matches (and rep + literal + rep0). */
    uint32_t repIndex;
    for (repIndex = 0; repIndex < LZMA_NUM_REPS; repIndex++)
    {
      uint32_t lenTest;
      uint32_t lenTestTemp;
      uint32_t price;
      const uint8_t *data2 = data - (reps[repIndex] + 1);
      if (data[0] != data2[0] || data[1] != data2[1])
        continue;
      for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++);
      while (lenEnd < cur + lenTest)
        p->opt[++lenEnd].price = kInfinityPrice;
      lenTestTemp = lenTest;
      price = repMatchPrice + GetPureRepPrice(p, repIndex, state, posState);
      do
      {
        uint32_t curAndLenPrice = price + p->repLenEnc.prices[posState][lenTest - 2];
        COptimal *opt = &p->opt[cur + lenTest];
        if (curAndLenPrice < opt->price)
        {
          opt->price = curAndLenPrice;
          opt->posPrev = cur;
          opt->backPrev = repIndex;
          opt->prev1IsChar = false;
        }
      }
      while (--lenTest >= 2);
      lenTest = lenTestTemp;

      if (repIndex == 0)
        startLen = lenTest + 1;

      /* if (_maxMode) */
        {
          /* Try rep + one literal + rep0 as a combined three-part step. */
          uint32_t lenTest2 = lenTest + 1;
          uint32_t limit = lenTest2 + p->numFastBytes;
          uint32_t nextRepMatchPrice;
          if (limit > numAvailFull)
            limit = numAvailFull;
          for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++);
          lenTest2 -= lenTest + 1;
          if (lenTest2 >= 2)
          {
            uint32_t state2 = kRepNextStates[state];
            uint32_t posStateNext = (position + lenTest) & p->pbMask;
            uint32_t curAndLenCharPrice =
                price + p->repLenEnc.prices[posState][lenTest - 2] +
                GET_PRICE_0(p->isMatch[state2][posStateNext]) +
                LitEnc_GetPriceMatched(LIT_PROBS(position + lenTest, data[lenTest - 1]),
                    data[lenTest], data2[lenTest], p->ProbPrices);
            state2 = kLiteralNextStates[state2];
            posStateNext = (position + lenTest + 1) & p->pbMask;
            nextRepMatchPrice = curAndLenCharPrice +
                GET_PRICE_1(p->isMatch[state2][posStateNext]) +
                GET_PRICE_1(p->isRep[state2]);

            /* for (; lenTest2 >= 2; lenTest2--) */
            {
              uint32_t curAndLenPrice;
              COptimal *opt;
              uint32_t offset = cur + lenTest + 1 + lenTest2;
              while (lenEnd < offset)
                p->opt[++lenEnd].price = kInfinityPrice;
              curAndLenPrice = nextRepMatchPrice + GetRepPrice(p, 0, lenTest2, state2, posStateNext);
              opt = &p->opt[offset];
              if (curAndLenPrice < opt->price)
              {
                opt->price = curAndLenPrice;
                opt->posPrev = cur + lenTest + 1;
                opt->backPrev = 0;
                opt->prev1IsChar = true;
                opt->prev2 = true;
                opt->posPrev2 = cur;
                opt->backPrev2 = repIndex;
              }
            }
          }
        }
    }
    }
    /* for (uint32_t lenTest = 2; lenTest <= newLen; lenTest++) */
    if (newLen > numAvail)
    {
      /* Clamp the longest match and truncate the pair list accordingly. */
      newLen = numAvail;
      for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2);
      matches[numPairs] = newLen;
      numPairs += 2;
    }
    /* Relax nodes reachable via normal matches (and match + literal + rep0). */
    if (newLen >= startLen)
    {
      uint32_t normalMatchPrice = matchPrice + GET_PRICE_0(p->isRep[state]);
      uint32_t offs, curBack, posSlot;
      uint32_t lenTest;
      while (lenEnd < cur + newLen)
        p->opt[++lenEnd].price = kInfinityPrice;

      offs = 0;
      while (startLen > matches[offs])
        offs += 2;
      curBack = matches[offs + 1];
      GetPosSlot2(curBack, posSlot);
      for (lenTest = /*2*/ startLen; ; lenTest++)
      {
        uint32_t curAndLenPrice = normalMatchPrice + p->lenEnc.prices[posState][lenTest - LZMA_MATCH_LEN_MIN];
        uint32_t lenToPosState = GetLenToPosState(lenTest);
        COptimal *opt;
        if (curBack < kNumFullDistances)
          curAndLenPrice += p->distancesPrices[lenToPosState][curBack];
        else
          curAndLenPrice += p->posSlotPrices[lenToPosState][posSlot] + p->alignPrices[curBack & kAlignMask];

        opt = &p->opt[cur + lenTest];
        if (curAndLenPrice < opt->price)
        {
          opt->price = curAndLenPrice;
          opt->posPrev = cur;
          opt->backPrev = curBack + LZMA_NUM_REPS;
          opt->prev1IsChar = false;
        }

        if (/*_maxMode && */lenTest == matches[offs])
        {
          /* Try Match + Literal + Rep0 */
          const uint8_t *data2 = data - (curBack + 1);
          uint32_t lenTest2 = lenTest + 1;
          uint32_t limit = lenTest2 + p->numFastBytes;
          uint32_t nextRepMatchPrice;
          if (limit > numAvailFull)
            limit = numAvailFull;
          for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++);
          lenTest2 -= lenTest + 1;
          if (lenTest2 >= 2)
          {
            uint32_t state2 = kMatchNextStates[state];
            uint32_t posStateNext = (position + lenTest) & p->pbMask;
            uint32_t curAndLenCharPrice = curAndLenPrice +
                GET_PRICE_0(p->isMatch[state2][posStateNext]) +
                LitEnc_GetPriceMatched(LIT_PROBS(position + lenTest, data[lenTest - 1]),
                    data[lenTest], data2[lenTest], p->ProbPrices);
            state2 = kLiteralNextStates[state2];
            posStateNext = (posStateNext + 1) & p->pbMask;
            nextRepMatchPrice = curAndLenCharPrice +
                GET_PRICE_1(p->isMatch[state2][posStateNext]) +
                GET_PRICE_1(p->isRep[state2]);

            /* for (; lenTest2 >= 2; lenTest2--) */
            {
              uint32_t offset = cur + lenTest + 1 + lenTest2;
              uint32_t curAndLenPrice;
              COptimal *opt;
              while (lenEnd < offset)
                p->opt[++lenEnd].price = kInfinityPrice;
              curAndLenPrice = nextRepMatchPrice + GetRepPrice(p, 0, lenTest2, state2, posStateNext);
              opt = &p->opt[offset];
              if (curAndLenPrice < opt->price)
              {
                opt->price = curAndLenPrice;
                opt->posPrev = cur + lenTest + 1;
                opt->backPrev = 0;
                opt->prev1IsChar = true;
                opt->prev2 = true;
                opt->posPrev2 = cur;
                opt->backPrev2 = curBack + LZMA_NUM_REPS;
              }
            }
          }
          offs += 2;
          if (offs == numPairs)
            break;
          curBack = matches[offs + 1];
          if (curBack >= kNumFullDistances)
            GetPosSlot2(curBack, posSlot);
        }
      }
    }
  }
}
26090 
/* True when bigDist is more than 128x smallDist — heuristic deciding whether
   a slightly shorter match at a much smaller distance is preferable. */
#define ChangePair(smallDist, bigDist) (((bigDist) >> 7) > (smallDist))
26092 
/* Greedy match selection (fast mode).  Picks between literal, rep match and
   normal match using heuristics instead of the optimal-parse DP.  Returns the
   unit length and stores its code in *backRes (-1 = literal, 0..3 = rep index,
   >=4 = distance + LZMA_NUM_REPS). */
static uint32_t GetOptimumFast(CLzmaEnc *p, uint32_t *backRes)
{
  uint32_t numAvail, mainLen, mainDist, numPairs, repIndex, repLen, i;
  const uint8_t *data;
  const uint32_t *matches;

  if (p->additionalOffset == 0)
    mainLen = ReadMatchDistances(p, &numPairs);
  else
  {
    /* Matches for this position were already read by a previous call. */
    mainLen = p->longestMatchLength;
    numPairs = p->numPairs;
  }

  numAvail = p->numAvail;
  *backRes = (uint32_t)-1;
  if (numAvail < 2)
    return 1;
  if (numAvail > LZMA_MATCH_LEN_MAX)
    numAvail = LZMA_MATCH_LEN_MAX;
  data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;

  /* Find the longest rep match; take it at once if >= numFastBytes. */
  repLen = repIndex = 0;
  for (i = 0; i < LZMA_NUM_REPS; i++)
  {
    uint32_t len;
    const uint8_t *data2 = data - (p->reps[i] + 1);
    if (data[0] != data2[0] || data[1] != data2[1])
      continue;
    for (len = 2; len < numAvail && data[len] == data2[len]; len++);
    if (len >= p->numFastBytes)
    {
      *backRes = i;
      MovePos(p, len - 1);
      return len;
    }
    if (len > repLen)
    {
      repIndex = i;
      repLen = len;
    }
  }

  matches = p->matches;
  if (mainLen >= p->numFastBytes)
  {
    *backRes = matches[numPairs - 1] + LZMA_NUM_REPS;
    MovePos(p, mainLen - 1);
    return mainLen;
  }

  mainDist = 0; /* for GCC */
  if (mainLen >= 2)
  {
    mainDist = matches[numPairs - 1];
    /* Prefer a one-byte-shorter match when its distance is >128x smaller. */
    while (numPairs > 2 && mainLen == matches[numPairs - 4] + 1)
    {
      if (!ChangePair(matches[numPairs - 3], mainDist))
        break;
      numPairs -= 2;
      mainLen = matches[numPairs - 2];
      mainDist = matches[numPairs - 1];
    }
    /* A 2-byte match at distance >= 128 is not worth coding. */
    if (mainLen == 2 && mainDist >= 0x80)
      mainLen = 1;
  }

  /* Prefer a rep match when it is nearly as long as the normal match. */
  if (repLen >= 2 && (
        (repLen + 1 >= mainLen) ||
        (repLen + 2 >= mainLen && mainDist >= (1 << 9)) ||
        (repLen + 3 >= mainLen && mainDist >= (1 << 15))))
  {
    *backRes = repIndex;
    MovePos(p, repLen - 1);
    return repLen;
  }

  if (mainLen < 2 || numAvail <= 2)
    return 1;

  /* Peek one position ahead: if the next position has a better match, emit a
     literal now instead of committing to the current match. */
  p->longestMatchLength = ReadMatchDistances(p, &p->numPairs);
  if (p->longestMatchLength >= 2)
  {
    uint32_t newDistance = matches[p->numPairs - 1];
    if ((p->longestMatchLength >= mainLen && newDistance < mainDist) ||
        (p->longestMatchLength == mainLen + 1 && !ChangePair(mainDist, newDistance)) ||
        (p->longestMatchLength > mainLen + 1) ||
        (p->longestMatchLength + 1 >= mainLen && mainLen >= 3 && ChangePair(newDistance, mainDist)))
      return 1;
  }

  /* Also emit a literal if a rep match nearly covers the chosen match. */
  data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;
  for (i = 0; i < LZMA_NUM_REPS; i++)
  {
    uint32_t len, limit;
    const uint8_t *data2 = data - (p->reps[i] + 1);
    if (data[0] != data2[0] || data[1] != data2[1])
      continue;
    limit = mainLen - 1;
    for (len = 2; len < limit && data[len] == data2[len]; len++);
    if (len >= limit)
      return 1;
  }
  *backRes = mainDist + LZMA_NUM_REPS;
  /* -2: one byte was consumed by the look-ahead ReadMatchDistances above. */
  MovePos(p, mainLen - 2);
  return mainLen;
}
26200 
/* Encodes the LZMA end-of-stream marker: a minimal-length match whose encoded
   distance has all bits set (pos slot, direct bits and align bits all ones),
   which no real match can produce. */
static void WriteEndMarker(CLzmaEnc *p, uint32_t posState)
{
  uint32_t len;
  RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 1);
  RangeEnc_EncodeBit(&p->rc, &p->isRep[p->state], 0);
  p->state = kMatchNextStates[p->state];
  len = LZMA_MATCH_LEN_MIN;
  LenEnc_Encode2(&p->lenEnc, &p->rc, len - LZMA_MATCH_LEN_MIN, posState, !p->fastMode, p->ProbPrices);
  /* Highest pos slot, then all-ones direct bits and align bits. */
  RcTree_Encode(&p->rc, p->posSlotEncoder[GetLenToPosState(len)], kNumPosSlotBits, (1 << kNumPosSlotBits) - 1);
  RangeEnc_EncodeDirectBits(&p->rc, (((uint32_t)1 << 30) - 1) >> kNumAlignBits, 30 - kNumAlignBits);
  RcTree_ReverseEncode(&p->rc, p->posAlignEncoder, kNumAlignBits, kAlignMask);
}
26213 
/* Folds the range coder's and match finder's error states into p->result.
   A read error takes precedence over a write error when both are set.
   Marks the encoder finished on any error and returns the combined result. */
static SRes CheckErrors(CLzmaEnc *p)
{
  SRes res = p->result;
  if (res == SZ_OK)
  {
    if (p->rc.res != SZ_OK)
      res = SZ_ERROR_WRITE;
    if (p->matchFinderBase.result != SZ_OK)
      res = SZ_ERROR_READ;
    p->result = res;
    if (res != SZ_OK)
      p->finished = true;
  }
  return res;
}
26226 
/* Finishes the stream: optionally emits the end marker, then flushes the
   range coder's pending bytes and its output stream. */
static SRes Flush(CLzmaEnc *p, uint32_t nowPos)
{
  /* ReleaseMFStream(); */
  p->finished = true;
  if (p->writeEndMark)
  {
    uint32_t posState = nowPos & p->pbMask;
    WriteEndMarker(p, posState);
  }
  RangeEnc_FlushData(&p->rc);
  RangeEnc_FlushStream(&p->rc);
  return CheckErrors(p);
}
26237 
/* Recomputes the cached prices of the reverse-coded align bits (the low
   kNumAlignBits of large distances) and resets the refresh counter. */
static void FillAlignPrices(CLzmaEnc *p)
{
  uint32_t slot = 0;
  for (; slot < kAlignTableSize; slot++)
    p->alignPrices[slot] = RcTree_ReverseGetPrice(p->posAlignEncoder, kNumAlignBits, slot, p->ProbPrices);
  p->alignPriceCount = 0;
}
26245 
/* Recomputes the cached distance price tables: per length-to-pos-state, the
   price of each pos slot, plus (for small distances) the reverse-coded footer
   bits, and (for large slots) the fixed cost of the direct bits. */
static void FillDistancesPrices(CLzmaEnc *p)
{
  uint32_t tempPrices[kNumFullDistances];
  uint32_t i, lenToPosState;
  /* Footer-bit prices for every fully modelled distance. */
  for (i = kStartPosModelIndex; i < kNumFullDistances; i++)
  {
    uint32_t posSlot = GetPosSlot1(i);
    uint32_t footerBits = ((posSlot >> 1) - 1);
    uint32_t base = ((2 | (posSlot & 1)) << footerBits);
    tempPrices[i] = RcTree_ReverseGetPrice(p->posEncoders + base - posSlot - 1, footerBits, i - base, p->ProbPrices);
  }

  for (lenToPosState = 0; lenToPosState < kNumLenToPosStates; lenToPosState++)
  {
    uint32_t posSlot;
    const CLzmaProb *encoder = p->posSlotEncoder[lenToPosState];
    uint32_t *posSlotPrices = p->posSlotPrices[lenToPosState];
    for (posSlot = 0; posSlot < p->distTableSize; posSlot++)
      posSlotPrices[posSlot] = RcTree_GetPrice(encoder, kNumPosSlotBits, posSlot, p->ProbPrices);
    /* Slots beyond the full model also emit direct bits at a fixed price. */
    for (posSlot = kEndPosModelIndex; posSlot < p->distTableSize; posSlot++)
      posSlotPrices[posSlot] += ((((posSlot >> 1) - 1) - kNumAlignBits) << kNumBitPriceShiftBits);

    {
      /* Combined per-distance prices for the small-distance range. */
      uint32_t *distancesPrices = p->distancesPrices[lenToPosState];
      uint32_t i;
      for (i = 0; i < kStartPosModelIndex; i++)
        distancesPrices[i] = posSlotPrices[i];
      for (; i < kNumFullDistances; i++)
        distancesPrices[i] = posSlotPrices[GetPosSlot1(i)] + tempPrices[i];
    }
  }
  p->matchPriceCount = 0;
}
26279 
// Constructs an encoder in place: sets up the range coder and match
// finder, applies the library's default properties, and initializes the
// static price tables.  Literal probability arrays are allocated lazily
// in LzmaEnc_Alloc.
void LzmaEnc_Construct(CLzmaEnc *p)
{
  RangeEnc_Construct(&p->rc);
  MatchFinder_Construct(&p->matchFinderBase);

  {
    // Start from default properties; callers may override them later
    // through LzmaEnc_SetProps.
    CLzmaEncProps props;
    LzmaEncProps_Init(&props);
    LzmaEnc_SetProps(p, &props);
  }

  #ifndef LZMA_LOG_BSR
  LzmaEnc_FastPosInit(p->g_FastPos);
  #endif

  LzmaEnc_InitPriceTables(p->ProbPrices);
  p->litProbs = 0;             // allocated on demand in LzmaEnc_Alloc
  p->saveState.litProbs = 0;
}
26299 
LzmaEnc_Create(ISzAlloc * alloc)26300 CLzmaEncHandle LzmaEnc_Create(ISzAlloc *alloc)
26301 {
26302   void *p;
26303   p = alloc->Alloc(alloc, sizeof(CLzmaEnc));
26304   if (p != 0)
26305     LzmaEnc_Construct((CLzmaEnc *)p);
26306   return p;
26307 }
26308 
LzmaEnc_FreeLits(CLzmaEnc * p,ISzAlloc * alloc)26309 void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAlloc *alloc)
26310 {
26311   alloc->Free(alloc, p->litProbs);
26312   alloc->Free(alloc, p->saveState.litProbs);
26313   p->litProbs = 0;
26314   p->saveState.litProbs = 0;
26315 }
26316 
// Tears down the encoder's owned resources (match-finder window,
// literal probabilities, range-coder buffer).  The CLzmaEnc storage
// itself is not freed here — see LzmaEnc_Destroy.
void LzmaEnc_Destruct(CLzmaEnc *p, ISzAlloc *alloc, ISzAlloc *allocBig)
{
  MatchFinder_Free(&p->matchFinderBase, allocBig);
  LzmaEnc_FreeLits(p, alloc);
  RangeEnc_Free(&p->rc, alloc);
}
26323 
LzmaEnc_Destroy(CLzmaEncHandle p,ISzAlloc * alloc,ISzAlloc * allocBig)26324 void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAlloc *alloc, ISzAlloc *allocBig)
26325 {
26326   LzmaEnc_Destruct((CLzmaEnc *)p, alloc, allocBig);
26327   alloc->Free(alloc, p);
26328 }
26329 
// Encodes one block of input into the range coder's output stream.
// With useLimits set it stops once the output would approach
// maxPackSize or the consumed input maxUnpackSize (LZMA2 chunking);
// otherwise it returns after ~32 KiB of input so the caller can report
// progress.  Returns SZ_OK or an accumulated error code.
static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, bool useLimits, uint32_t maxPackSize, uint32_t maxUnpackSize)
{
  uint32_t nowPos32, startPos32;
  if (p->needInit)
  {
    p->matchFinder.Init(p->matchFinderObj);
    p->needInit = 0;
  }

  if (p->finished)
    return p->result;
  RINOK(CheckErrors(p));

  nowPos32 = (uint32_t)p->nowPos64;
  startPos32 = nowPos32;

  // The very first byte of the stream has no history, so it is always
  // coded as a plain literal here.
  if (p->nowPos64 == 0)
  {
    uint32_t numPairs;
    uint8_t curByte;
    if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0)
      return Flush(p, nowPos32);
    ReadMatchDistances(p, &numPairs);
    RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][0], 0);
    p->state = kLiteralNextStates[p->state];
    curByte = p->matchFinder.GetIndexByte(p->matchFinderObj, 0 - p->additionalOffset);
    LitEnc_Encode(&p->rc, p->litProbs, curByte);
    p->additionalOffset--;
    nowPos32++;
  }

  if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) != 0)
  for (;;)
  {
    uint32_t pos, len, posState;

    // Choose the next symbol: cheap heuristic or optimal price-based
    // parser depending on the configured mode.
    if (p->fastMode)
      len = GetOptimumFast(p, &pos);
    else
      len = GetOptimum(p, nowPos32, &pos);

    posState = nowPos32 & p->pbMask;
    // len == 1 with pos == -1 means "emit a literal".
    if (len == 1 && pos == (uint32_t)-1)
    {
      uint8_t curByte;
      CLzmaProb *probs;
      const uint8_t *data;

      RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 0);
      data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset;
      curByte = *data;
      probs = LIT_PROBS(nowPos32, *(data - 1));
      if (IsCharState(p->state))
        LitEnc_Encode(&p->rc, probs, curByte);
      else
        // After a match, the literal is coded relative to the byte at
        // the most recent match distance (reps[0]).
        LitEnc_EncodeMatched(&p->rc, probs, curByte, *(data - p->reps[0] - 1));
      p->state = kLiteralNextStates[p->state];
    }
    else
    {
      RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 1);
      // pos < LZMA_NUM_REPS selects one of the four recent distances.
      if (pos < LZMA_NUM_REPS)
      {
        RangeEnc_EncodeBit(&p->rc, &p->isRep[p->state], 1);
        if (pos == 0)
        {
          RangeEnc_EncodeBit(&p->rc, &p->isRepG0[p->state], 0);
          // len == 1 at rep0 is the special "short rep" symbol.
          RangeEnc_EncodeBit(&p->rc, &p->isRep0Long[p->state][posState], ((len == 1) ? 0 : 1));
        }
        else
        {
          uint32_t distance = p->reps[pos];
          RangeEnc_EncodeBit(&p->rc, &p->isRepG0[p->state], 1);
          if (pos == 1)
            RangeEnc_EncodeBit(&p->rc, &p->isRepG1[p->state], 0);
          else
          {
            RangeEnc_EncodeBit(&p->rc, &p->isRepG1[p->state], 1);
            RangeEnc_EncodeBit(&p->rc, &p->isRepG2[p->state], pos - 2);
            if (pos == 3)
              p->reps[3] = p->reps[2];
            p->reps[2] = p->reps[1];
          }
          // Move the used distance to the front of the rep list.
          p->reps[1] = p->reps[0];
          p->reps[0] = distance;
        }
        if (len == 1)
          p->state = kShortRepNextStates[p->state];
        else
        {
          LenEnc_Encode2(&p->repLenEnc, &p->rc, len - LZMA_MATCH_LEN_MIN, posState, !p->fastMode, p->ProbPrices);
          p->state = kRepNextStates[p->state];
        }
      }
      else
      {
        // New match: encode length, then distance as slot + extra bits.
        uint32_t posSlot;
        RangeEnc_EncodeBit(&p->rc, &p->isRep[p->state], 0);
        p->state = kMatchNextStates[p->state];
        LenEnc_Encode2(&p->lenEnc, &p->rc, len - LZMA_MATCH_LEN_MIN, posState, !p->fastMode, p->ProbPrices);
        pos -= LZMA_NUM_REPS;
        GetPosSlot(pos, posSlot);
        RcTree_Encode(&p->rc, p->posSlotEncoder[GetLenToPosState(len)], kNumPosSlotBits, posSlot);

        if (posSlot >= kStartPosModelIndex)
        {
          uint32_t footerBits = ((posSlot >> 1) - 1);
          uint32_t base = ((2 | (posSlot & 1)) << footerBits);
          uint32_t posReduced = pos - base;

          if (posSlot < kEndPosModelIndex)
            RcTree_ReverseEncode(&p->rc, p->posEncoders + base - posSlot - 1, footerBits, posReduced);
          else
          {
            // Large distances: direct bits plus the modelled align bits.
            RangeEnc_EncodeDirectBits(&p->rc, posReduced >> kNumAlignBits, footerBits - kNumAlignBits);
            RcTree_ReverseEncode(&p->rc, p->posAlignEncoder, kNumAlignBits, posReduced & kAlignMask);
            p->alignPriceCount++;
          }
        }
        p->reps[3] = p->reps[2];
        p->reps[2] = p->reps[1];
        p->reps[1] = p->reps[0];
        p->reps[0] = pos;
        p->matchPriceCount++;
      }
    }
    p->additionalOffset -= len;
    nowPos32 += len;
    // Only consider stopping when the parser holds no pending symbols.
    if (p->additionalOffset == 0)
    {
      uint32_t processed;
      if (!p->fastMode)
      {
        // Refresh price tables that have gone stale for the optimal parser.
        if (p->matchPriceCount >= (1 << 7))
          FillDistancesPrices(p);
        if (p->alignPriceCount >= kAlignTableSize)
          FillAlignPrices(p);
      }
      if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0)
        break;
      processed = nowPos32 - startPos32;
      if (useLimits)
      {
        // Leave headroom so flushing cannot overrun the caller's limits.
        if (processed + kNumOpts + 300 >= maxUnpackSize ||
            RangeEnc_GetProcessed(&p->rc) + kNumOpts * 2 >= maxPackSize)
          break;
      }
      else if (processed >= (1 << 15))
      {
        p->nowPos64 += nowPos32 - startPos32;
        return CheckErrors(p);
      }
    }
  }
  p->nowPos64 += nowPos32 - startPos32;
  return Flush(p, nowPos32);
}
26487 
// Dictionaries larger than 16 MiB switch the match finder to big-hash mode.
#define kBigHashDicLimit ((uint32_t)1 << 24)

// Allocates the encoder's buffers: the range-coder output buffer, the
// two literal-probability arrays (0x300 probabilities per 2^(lc+lp)
// context), and the match-finder window.
static SRes LzmaEnc_Alloc(CLzmaEnc *p, uint32_t keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig)
{
  uint32_t beforeSize = kNumOpts;
  if (!RangeEnc_Alloc(&p->rc, alloc))
    return SZ_ERROR_MEM;

  {
    unsigned lclp = p->lc + p->lp;
    // Re-allocate the literal tables only on first use or when lc/lp changed.
    if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp)
    {
      LzmaEnc_FreeLits(p, alloc);
      p->litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb));
      p->saveState.litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb));
      if (p->litProbs == 0 || p->saveState.litProbs == 0)
      {
        LzmaEnc_FreeLits(p, alloc);
        return SZ_ERROR_MEM;
      }
      p->lclp = lclp;
    }
  }

  p->matchFinderBase.bigHash = (p->dictSize > kBigHashDicLimit);

  // Grow the keep-before area so keepWindowSize bytes stay reachable.
  if (beforeSize + p->dictSize < keepWindowSize)
    beforeSize = keepWindowSize - p->dictSize;

  {
    if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig))
      return SZ_ERROR_MEM;
    p->matchFinderObj = &p->matchFinderBase;
    MatchFinder_CreateVTable(&p->matchFinderBase, &p->matchFinder);
  }
  return SZ_OK;
}
26525 
// Resets all adaptive coding state to the model's initial values:
// state machine, rep distances, range coder, every probability model,
// the length coders, and the derived position masks.
void LzmaEnc_Init(CLzmaEnc *p)
{
  uint32_t i;
  p->state = 0;
  for (i = 0 ; i < LZMA_NUM_REPS; i++)
    p->reps[i] = 0;

  RangeEnc_Init(&p->rc);

  // Per-state match / rep decision probabilities.
  for (i = 0; i < kNumStates; i++)
  {
    uint32_t j;
    for (j = 0; j < LZMA_NUM_PB_STATES_MAX; j++)
    {
      p->isMatch[i][j] = kProbInitValue;
      p->isRep0Long[i][j] = kProbInitValue;
    }
    p->isRep[i] = kProbInitValue;
    p->isRepG0[i] = kProbInitValue;
    p->isRepG1[i] = kProbInitValue;
    p->isRepG2[i] = kProbInitValue;
  }

  // Literal models: 0x300 probabilities for each (lc, lp) context.
  {
    uint32_t num = 0x300 << (p->lp + p->lc);
    for (i = 0; i < num; i++)
      p->litProbs[i] = kProbInitValue;
  }

  // Position-slot bit trees, one per length class.
  {
    for (i = 0; i < kNumLenToPosStates; i++)
    {
      CLzmaProb *probs = p->posSlotEncoder[i];
      uint32_t j;
      for (j = 0; j < (1 << kNumPosSlotBits); j++)
        probs[j] = kProbInitValue;
    }
  }
  // Footer-bit models for fully modelled distances.
  {
    for (i = 0; i < kNumFullDistances - kEndPosModelIndex; i++)
      p->posEncoders[i] = kProbInitValue;
  }

  LenEnc_Init(&p->lenEnc.p);
  LenEnc_Init(&p->repLenEnc.p);

  for (i = 0; i < (1 << kNumAlignBits); i++)
    p->posAlignEncoder[i] = kProbInitValue;

  p->optimumEndIndex = 0;
  p->optimumCurrentIndex = 0;
  p->additionalOffset = 0;

  // Bit masks derived from the pb / lp property values.
  p->pbMask = (1 << p->pb) - 1;
  p->lpMask = (1 << p->lp) - 1;
}
26582 
LzmaEnc_InitPrices(CLzmaEnc * p)26583 void LzmaEnc_InitPrices(CLzmaEnc *p)
26584 {
26585   if (!p->fastMode)
26586   {
26587     FillDistancesPrices(p);
26588     FillAlignPrices(p);
26589   }
26590 
26591   p->lenEnc.tableSize =
26592   p->repLenEnc.tableSize =
26593       p->numFastBytes + 1 - LZMA_MATCH_LEN_MIN;
26594   LenPriceEnc_UpdateTables(&p->lenEnc, 1 << p->pb, p->ProbPrices);
26595   LenPriceEnc_UpdateTables(&p->repLenEnc, 1 << p->pb, p->ProbPrices);
26596 }
26597 
LzmaEnc_AllocAndInit(CLzmaEnc * p,uint32_t keepWindowSize,ISzAlloc * alloc,ISzAlloc * allocBig)26598 static SRes LzmaEnc_AllocAndInit(CLzmaEnc *p, uint32_t keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig)
26599 {
26600   uint32_t i;
26601   for (i = 0; i < (uint32_t)kDicLogSizeMaxCompress; i++)
26602     if (p->dictSize <= ((uint32_t)1 << i))
26603       break;
26604   p->distTableSize = i * 2;
26605 
26606   p->finished = false;
26607   p->result = SZ_OK;
26608   RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig));
26609   LzmaEnc_Init(p);
26610   LzmaEnc_InitPrices(p);
26611   p->nowPos64 = 0;
26612   return SZ_OK;
26613 }
26614 
LzmaEnc_Prepare(CLzmaEncHandle pp,ISeqOutStream * outStream,ISeqInStream * inStream,ISzAlloc * alloc,ISzAlloc * allocBig)26615 static SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream,
26616     ISzAlloc *alloc, ISzAlloc *allocBig)
26617 {
26618   CLzmaEnc *p = (CLzmaEnc *)pp;
26619   p->matchFinderBase.stream = inStream;
26620   p->needInit = 1;
26621   p->rc.outStream = outStream;
26622   return LzmaEnc_AllocAndInit(p, 0, alloc, allocBig);
26623 }
26624 
LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp,ISeqInStream * inStream,uint32_t keepWindowSize,ISzAlloc * alloc,ISzAlloc * allocBig)26625 SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp,
26626     ISeqInStream *inStream, uint32_t keepWindowSize,
26627     ISzAlloc *alloc, ISzAlloc *allocBig)
26628 {
26629   CLzmaEnc *p = (CLzmaEnc *)pp;
26630   p->matchFinderBase.stream = inStream;
26631   p->needInit = 1;
26632   return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig);
26633 }
26634 
LzmaEnc_SetInputBuf(CLzmaEnc * p,const uint8_t * src,size_t srcLen)26635 static void LzmaEnc_SetInputBuf(CLzmaEnc *p, const uint8_t *src, size_t srcLen)
26636 {
26637   p->matchFinderBase.directInput = 1;
26638   p->matchFinderBase.bufferBase = (uint8_t *)src;
26639   p->matchFinderBase.directInputRem = srcLen;
26640 }
26641 
LzmaEnc_MemPrepare(CLzmaEncHandle pp,const uint8_t * src,size_t srcLen,uint32_t keepWindowSize,ISzAlloc * alloc,ISzAlloc * allocBig)26642 SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const uint8_t *src, size_t srcLen,
26643     uint32_t keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig)
26644 {
26645   CLzmaEnc *p = (CLzmaEnc *)pp;
26646   LzmaEnc_SetInputBuf(p, src, srcLen);
26647   p->needInit = 1;
26648 
26649   return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig);
26650 }
26651 
// Intentionally a no-op in this build: there is nothing to release
// after the final block (kept for API symmetry with the LZMA SDK).
void LzmaEnc_Finish(CLzmaEncHandle /*pp*/)
{
}
26655 
// Adapter exposing a fixed-size memory buffer through the ISeqOutStream
// callback interface; records overflow instead of writing past the end.
struct CSeqOutStreamBuf
{
  ISeqOutStream funcTable;  // must stay first: MyWrite casts the funcTable pointer back to this struct
  uint8_t *data;            // next write position in the buffer
  size_t rem;               // bytes still available
  bool overflow;            // set when a write had to be truncated
};
26663 
MyWrite(void * pp,const void * data,size_t size)26664 static size_t MyWrite(void *pp, const void *data, size_t size)
26665 {
26666   CSeqOutStreamBuf *p = (CSeqOutStreamBuf *)pp;
26667   if (p->rem < size)
26668   {
26669     size = p->rem;
26670     p->overflow = true;
26671   }
26672   memcpy(p->data, data, size);
26673   p->rem -= size;
26674   p->data += size;
26675   return size;
26676 }
26677 
LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp)26678 uint32_t LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp)
26679 {
26680   const CLzmaEnc *p = (CLzmaEnc *)pp;
26681   return p->matchFinder.GetNumAvailableBytes(p->matchFinderObj);
26682 }
26683 
LzmaEnc_GetCurBuf(CLzmaEncHandle pp)26684 const uint8_t *LzmaEnc_GetCurBuf(CLzmaEncHandle pp)
26685 {
26686   const CLzmaEnc *p = (CLzmaEnc *)pp;
26687   return p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset;
26688 }
26689 
// Encodes one size-limited chunk into the memory buffer dest (used by
// the LZMA2 layer).  On entry *destLen is the capacity of dest and
// *unpackSize the maximum input to consume; on return they hold the
// bytes produced and consumed.  reInit resets all adaptive state first.
// Returns SZ_ERROR_OUTPUT_EOF when dest was too small.
SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, bool reInit,
    uint8_t *dest, size_t *destLen, uint32_t desiredPackSize, uint32_t *unpackSize)
{
  CLzmaEnc *p = (CLzmaEnc *)pp;
  uint64_t nowPos64;
  SRes res;
  CSeqOutStreamBuf outStream;

  outStream.funcTable.Write = MyWrite;
  outStream.data = dest;
  outStream.rem = *destLen;
  outStream.overflow = false;

  p->writeEndMark = false;   // chunks are not terminated with an end marker
  p->finished = false;
  p->result = SZ_OK;

  if (reInit)
    LzmaEnc_Init(p);
  LzmaEnc_InitPrices(p);
  nowPos64 = p->nowPos64;    // remember position to compute consumed input
  RangeEnc_Init(&p->rc);
  p->rc.outStream = &outStream.funcTable;

  res = LzmaEnc_CodeOneBlock(p, true, desiredPackSize, *unpackSize);

  *unpackSize = (uint32_t)(p->nowPos64 - nowPos64);
  *destLen -= outStream.rem;
  if (outStream.overflow)
    return SZ_ERROR_OUTPUT_EOF;

  return res;
}
26723 
LzmaEnc_Encode2(CLzmaEnc * p,ICompressProgress * progress)26724 static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress)
26725 {
26726   SRes res = SZ_OK;
26727 
26728   for (;;)
26729   {
26730     res = LzmaEnc_CodeOneBlock(p, false, 0, 0);
26731     if (res != SZ_OK || p->finished != 0)
26732       break;
26733     if (progress != 0)
26734     {
26735       res = progress->Progress(progress, p->nowPos64, RangeEnc_GetProcessed(&p->rc));
26736       if (res != SZ_OK)
26737       {
26738         res = SZ_ERROR_PROGRESS;
26739         break;
26740       }
26741     }
26742   }
26743   LzmaEnc_Finish(p);
26744   return res;
26745 }
26746 
LzmaEnc_Encode(CLzmaEncHandle pp,ISeqOutStream * outStream,ISeqInStream * inStream,ICompressProgress * progress,ISzAlloc * alloc,ISzAlloc * allocBig)26747 SRes LzmaEnc_Encode(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress,
26748     ISzAlloc *alloc, ISzAlloc *allocBig)
26749 {
26750   RINOK(LzmaEnc_Prepare(pp, outStream, inStream, alloc, allocBig));
26751   return LzmaEnc_Encode2((CLzmaEnc *)pp, progress);
26752 }
26753 
LzmaEnc_WriteProperties(CLzmaEncHandle pp,uint8_t * props,size_t * size)26754 SRes LzmaEnc_WriteProperties(CLzmaEncHandle pp, uint8_t *props, size_t *size)
26755 {
26756   CLzmaEnc *p = (CLzmaEnc *)pp;
26757   int i;
26758   uint32_t dictSize = p->dictSize;
26759   if (*size < LZMA_PROPS_SIZE)
26760     return SZ_ERROR_PARAM;
26761   *size = LZMA_PROPS_SIZE;
26762   props[0] = (uint8_t)((p->pb * 5 + p->lp) * 9 + p->lc);
26763 
26764   for (i = 11; i <= 30; i++)
26765   {
26766     if (dictSize <= ((uint32_t)2 << i))
26767     {
26768       dictSize = (2 << i);
26769       break;
26770     }
26771     if (dictSize <= ((uint32_t)3 << i))
26772     {
26773       dictSize = (3 << i);
26774       break;
26775     }
26776   }
26777 
26778   for (i = 0; i < 4; i++)
26779     props[1 + i] = (uint8_t)(dictSize >> (8 * i));
26780   return SZ_OK;
26781 }
26782 
// One-shot in-memory compression with an existing encoder handle:
// prepares the encoder over the src buffer and runs the full encode
// loop, writing at most *destLen bytes into dest.  On return *destLen
// holds the number of bytes produced; SZ_ERROR_OUTPUT_EOF signals that
// dest was too small.
SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen,
    int writeEndMark, ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig)
{
  SRes res;
  CLzmaEnc *p = (CLzmaEnc *)pp;

  CSeqOutStreamBuf outStream;

  LzmaEnc_SetInputBuf(p, src, srcLen);

  outStream.funcTable.Write = MyWrite;
  outStream.data = dest;
  outStream.rem = *destLen;
  outStream.overflow = false;

  p->writeEndMark = writeEndMark;

  p->rc.outStream = &outStream.funcTable;
  res = LzmaEnc_MemPrepare(pp, src, srcLen, 0, alloc, allocBig);
  if (res == SZ_OK)
    res = LzmaEnc_Encode2(p, progress);

  // Bytes written = capacity minus what remained unused.
  *destLen -= outStream.rem;
  if (outStream.overflow)
    return SZ_ERROR_OUTPUT_EOF;
  return res;
}
26810 
LzmaEncode(uint8_t * dest,size_t * destLen,const uint8_t * src,size_t srcLen,const CLzmaEncProps * props,uint8_t * propsEncoded,size_t * propsSize,int writeEndMark,ICompressProgress * progress,ISzAlloc * alloc,ISzAlloc * allocBig)26811 SRes LzmaEncode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen,
26812     const CLzmaEncProps *props, uint8_t *propsEncoded, size_t *propsSize, int writeEndMark,
26813     ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig)
26814 {
26815   CLzmaEnc *p = (CLzmaEnc *)LzmaEnc_Create(alloc);
26816   SRes res;
26817   if (p == 0)
26818     return SZ_ERROR_MEM;
26819 
26820   res = LzmaEnc_SetProps(p, props);
26821   if (res == SZ_OK)
26822   {
26823     res = LzmaEnc_WriteProperties(p, propsEncoded, propsSize);
26824     if (res == SZ_OK)
26825       res = LzmaEnc_MemEncode(p, dest, destLen, src, srcLen,
26826           writeEndMark, progress, alloc, allocBig);
26827   }
26828 
26829   LzmaEnc_Destroy(p, alloc, allocBig);
26830   return res;
26831 }
26832 
26833 } // namespace lzma
26834 // End of LZMA compression library by Igor Pavlov
26835 
26836 #ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H
26837 #define UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H
// Allocation callback for the LZMA library, backed by C++ new[].
static void *LzmaAlloc(void* /*p*/, size_t size)
{
  return new char[size];
}
// Deallocation callback for the LZMA library, matching LzmaAlloc's new[].
static void LzmaFree(void* /*p*/, void *address)
{
  delete[] (char*) address;
}
// File-scope allocator wiring the C-style LZMA allocation callbacks to C++ new[]/delete[].
static lzma::ISzAlloc lzmaAllocator = { LzmaAlloc, LzmaFree };
26841 #endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H
26842 
// Compresses the encoder's buffered data with LZMA and writes it to os
// in the container format: uncompressed size, compressed size, a weak
// checksum of both sizes, the 5 LZMA property bytes, then the payload.
// Returns false on compression failure, oversized data, or I/O error.
bool compressor::save(ostream& os, const binary_encoder& enc) {
  // Worst-case output bound: incompressible data can grow slightly.
  size_t uncompressed_size = enc.data.size(), compressed_size = 2 * enc.data.size() + 100;
  vector<unsigned char> compressed(compressed_size);

  lzma::CLzmaEncProps props;
  lzma::LzmaEncProps_Init(&props);
  unsigned char props_encoded[LZMA_PROPS_SIZE];
  size_t props_encoded_size = LZMA_PROPS_SIZE;

  auto res = lzma::LzmaEncode(compressed.data(), &compressed_size, enc.data.data(), uncompressed_size, &props, props_encoded, &props_encoded_size, 0, nullptr, &lzmaAllocator, &lzmaAllocator);
  if (res != SZ_OK) return false;

  // Not a real CRC — just a cheap integrity check over the two sizes.
  uint32_t poor_crc = uncompressed_size * 19991 + compressed_size * 199999991 + 1234567890;
  // Sizes are stored as 32-bit fields; refuse data that does not fit.
  if (uint32_t(uncompressed_size) != uncompressed_size || uint32_t(compressed_size) != compressed_size) return false;
  // Raw little-endian writes; the file asserts a little-endian host at the top.
  if (!os.write((const char*) &uncompressed_size, sizeof(uint32_t))) return false;
  if (!os.write((const char*) &compressed_size, sizeof(uint32_t))) return false;
  if (!os.write((const char*) &poor_crc, sizeof(uint32_t))) return false;
  if (!os.write((const char*) props_encoded, sizeof(props_encoded))) return false;
  if (!os.write((const char*) compressed.data(), compressed_size)) return false;

  return true;
}
26865 
26866 } // namespace utils
26867 
26868 /////////
26869 // File: version/version.cpp
26870 /////////
26871 
26872 // This file is part of UDPipe <http://github.com/ufal/udpipe/>.
26873 //
26874 // Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
26875 // Mathematics and Physics, Charles University in Prague, Czech Republic.
26876 //
26877 // This Source Code Form is subject to the terms of the Mozilla Public
26878 // License, v. 2.0. If a copy of the MPL was not distributed with this
26879 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
26880 
26881 // Returns current version.
current()26882 version version::current() {
26883   return {1, 2, 0, ""};
26884 }
26885 
// Returns multi-line formatted version and copyright string.
// Assembles the multi-line version banner listing UDPipe and its
// bundled libraries (UniLib, MorphoDiTa, Parsito, plus any caller-
// supplied other_libraries), followed by the copyright notice.
// Fix: MorphoDiTa and Parsito previously printed `unilib.patch`
// instead of their own patch numbers (copy-paste error).
string version::version_and_copyright(const string& other_libraries) {
  ostringstream info;

  auto udpipe = version::current();
  auto unilib = unilib::version::current();
  auto morphodita = morphodita::version::current();
  auto parsito = parsito::version::current();

  info << "UDPipe version " << udpipe.major << '.' << udpipe.minor << '.' << udpipe.patch
       << (udpipe.prerelease.empty() ? "" : "-") << udpipe.prerelease
       << " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch
       << (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease
       << ",\nMorphoDiTa " << morphodita.major << '.' << morphodita.minor << '.' << morphodita.patch
       << (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease
       << ", Parsito " << parsito.major << '.' << parsito.minor << '.' << parsito.patch
       << (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease
       << (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
          "Copyright 2016 by Institute of Formal and Applied Linguistics, Faculty of\n"
          "Mathematics and Physics, Charles University in Prague, Czech Republic.";

  return info.str();
}
26909 
26910 } // namespace udpipe
26911 } // namespace ufal
26912