1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "include_gunit.h"
13 #include "log.h" // for LOG
14 
15 #include "matrix.h"
16 #include "normstrngs.h"
17 #include "pageres.h"
18 #include "ratngs.h"
19 #include "recodebeam.h"
20 #include "unicharcompress.h"
21 #include "unicharset_training_utils.h"
22 
23 #include "helpers.h"
24 
25 namespace tesseract {
26 
// Exclusive upper bound on the unichar ids exercised by the round-trip tests.
constexpr int kNumChars = 100;
// Extra timesteps appended after the encoded characters.
constexpr int kPadding = 64;

// Dictionary test data (English). Each array holds one entry per character
// position; the string arrays are nullptr-terminated and "" stands for the
// null character.
//   Top choices spell:        "Gef s wordsright."
//   Desired phrase:           "Gets words right."
//   Competing dictionary one: "Get swords right."
// The synthetic network errors that make this interesting:
//   * 'f' scores higher than 't' in "Get";
//   * the spaces after "Gef" and after "s" are weak;
//   * the space between "words" and "right" is weak (2nd choice only).
const char *kGWRTops[] = {
    "G", "e", "f", " ",           // Gef_
    "s", " ",                     // s_
    "w", "o", "r", "d", "s", "",  // words<null>
    "r", "i", "g", "h", "t", ".", // right.
    nullptr};
const float kGWRTopScores[] = {
    0.99, 0.85, 0.87, 0.55,             // Gef_
    0.99, 0.65,                         // s_
    0.89, 0.99, 0.99, 0.99, 0.99, 0.95, // words<null>
    0.99, 0.90, 0.90, 0.90, 0.95, 0.75  // right.
};
const char *kGWR2nds[] = {
    "C", "c", "t", "",            // runners-up for Gef_
    "S", "",                      // runners-up for s_
    "W", "O", "t", "h", "S", " ", // runners-up for words<null>
    "t", "I", "9", "b", "f", ",", // runners-up for right.
    nullptr};
const float kGWR2ndScores[] = {
    0.01, 0.10, 0.12, 0.42,             // runners-up for Gef_
    0.01, 0.25,                         // runners-up for s_
    0.10, 0.01, 0.01, 0.01, 0.01, 0.05, // runners-up for words<null>
    0.01, 0.09, 0.09, 0.09, 0.05, 0.25  // runners-up for right.
};

// Dictionary test data (Chinese): seven positions, strong top choices.
const char *kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr};
const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};
const char *kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr};
const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};

// Multi-code sequence test data (Vietnamese): seven positions.
const char *kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr};
const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97};
const char *kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr};
const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
57 
58 class RecodeBeamTest : public ::testing::Test {
59 protected:
SetUp()60   void SetUp() override {
61     std::locale::global(std::locale(""));
62     file::MakeTmpdir();
63   }
64 
RecodeBeamTest()65   RecodeBeamTest() : lstm_dict_(&ccutil_) {}
~RecodeBeamTest()66   ~RecodeBeamTest() override {
67     lstm_dict_.End();
68   }
69 
70   // Loads and compresses the given unicharset.
LoadUnicharset(const std::string & unicharset_name)71   void LoadUnicharset(const std::string &unicharset_name) {
72     std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
73     std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
74     std::string radical_data;
75     CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
76     CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
77     unichar_null_char_ =
78         ccutil_.unicharset.has_special_codes() ? UNICHAR_BROKEN : ccutil_.unicharset.size();
79     std::string radical_str(radical_data.c_str());
80     EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, &radical_str));
81     RecodedCharID code;
82     recoder_.EncodeUnichar(unichar_null_char_, &code);
83     encoded_null_char_ = code(0);
84     // Space should encode as itself.
85     recoder_.EncodeUnichar(UNICHAR_SPACE, &code);
86     EXPECT_EQ(UNICHAR_SPACE, code(0));
87     std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
88     std::string encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
89     std::string encoding_str(&encoding[0], encoding.size());
90     CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
91     LOG(INFO) << "Wrote encoding to:" << output_name << "\n";
92   }
93   // Loads the dictionary.
LoadDict(const std::string & lang)94   void LoadDict(const std::string &lang) {
95     std::string traineddata_name = lang + ".traineddata";
96     std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
97     lstm_dict_.SetupForLoad(nullptr);
98     tesseract::TessdataManager mgr;
99     mgr.Init(traineddata_file.c_str());
100     lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
101     lstm_dict_.FinishLoad();
102   }
103 
104   // Expects the appropriate results from the compressed_  ccutil_.unicharset.
ExpectCorrect(const GENERIC_2D_ARRAY<float> & output,const std::vector<int> & transcription)105   void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output,
106                      const std::vector<int> &transcription) {
107     // Get the utf8 string of the transcription.
108     std::string truth_utf8;
109     for (int i : transcription) {
110       truth_utf8 += ccutil_.unicharset.id_to_unichar(i);
111     }
112     PointerVector<WERD_RES> words;
113     ExpectCorrect(output, truth_utf8, nullptr, &words);
114   }
ExpectCorrect(const GENERIC_2D_ARRAY<float> & output,const std::string & truth_utf8,Dict * dict,PointerVector<WERD_RES> * words)115   void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output, const std::string &truth_utf8,
116                      Dict *dict, PointerVector<WERD_RES> *words) {
117     RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
118     beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
119     // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
120     // beam_search.DebugBeams(ccutil_.unicharset);
121     std::vector<int> labels, xcoords;
122     beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
123     LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
124     // Now decode using recoder_.
125     std::string decoded;
126     int end = 1;
127     for (unsigned start = 0; start < labels.size(); start = end) {
128       RecodedCharID code;
129       unsigned index = start;
130       int uni_id = INVALID_UNICHAR_ID;
131       do {
132         code.Set(code.length(), labels[index++]);
133         uni_id = recoder_.DecodeUnichar(code);
134       } while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&
135                (uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));
136       EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
137       // To the extent of truth_utf8, we expect decoded to match, but if
138       // transcription is shorter, that is OK too, as we may just be testing
139       // that we get a valid sequence when padded with random data.
140       if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) {
141         decoded += ccutil_.unicharset.id_to_unichar(uni_id);
142       }
143       end = index;
144     }
145     EXPECT_EQ(truth_utf8, decoded);
146 
147     // Check that ExtractBestPathAsUnicharIds does the same thing.
148     std::vector<int> unichar_ids;
149     std::vector<float> certainties, ratings;
150     beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,
151                                             &ratings, &xcoords);
152     std::string u_decoded;
153     float total_rating = 0.0f;
154     for (unsigned u = 0; u < unichar_ids.size(); ++u) {
155       // To the extent of truth_utf8, we expect decoded to match, but if
156       // transcription is shorter, that is OK too, as we may just be testing
157       // that we get a valid sequence when padded with random data.
158       if (u_decoded.size() < truth_utf8.size()) {
159         const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
160         total_rating += ratings[u];
161         LOG(INFO) << u << ":u_id=" << unichar_ids[u] << "=" << str << ", c="
162           << certainties[u] << ", r=" << ratings[u] << "r_sum="
163           << total_rating << " @" << xcoords[u] << "\n";
164         if (str[0] == ' ') {
165           total_rating = 0.0f;
166         }
167         u_decoded += str;
168       }
169     }
170     EXPECT_EQ(truth_utf8, u_decoded);
171 
172     // Check that ExtractBestPathAsWords does the same thing.
173     TBOX line_box(0, 0, 100, 10);
174     for (int i = 0; i < 2; ++i) {
175       beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);
176       std::string w_decoded;
177       for (int w = 0; w < words->size(); ++w) {
178         const WERD_RES *word = (*words)[w];
179         if (w_decoded.size() < truth_utf8.size()) {
180           if (!w_decoded.empty() && word->word->space()) {
181             w_decoded += " ";
182           }
183           w_decoded += word->best_choice->unichar_string().c_str();
184         }
185         LOG(INFO) << "Word:" << w << " = " << word->best_choice->unichar_string()
186           << ", c=" << word->best_choice->certainty() << ", r=" << word->best_choice->rating()
187           << ", perm=" << word->best_choice->permuter() << "\n";
188       }
189       std::string w_trunc(w_decoded.data(), truth_utf8.size());
190       if (truth_utf8 != w_trunc) {
191         tesseract::NormalizeUTF8String(
192             tesseract::UnicodeNormMode::kNFKD, tesseract::OCRNorm::kNormalize,
193             tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);
194         w_trunc.assign(w_decoded.data(), truth_utf8.size());
195       }
196       EXPECT_EQ(truth_utf8, w_trunc);
197     }
198   }
199   // Generates easy encoding of the given unichar_ids, and pads with at least
200   // padding of random data.
GenerateRandomPaddedOutputs(const std::vector<int> & unichar_ids,int padding)201   GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(const std::vector<int> &unichar_ids,
202                                                       int padding) {
203     int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
204     int num_codes = recoder_.code_range();
205     GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
206     // Fill with random data.
207     TRand random;
208     for (int t = 0; t < width; ++t) {
209       for (int i = 0; i < num_codes; ++i) {
210         outputs(t, i) = random.UnsignedRand(0.25);
211       }
212     }
213     int t = 0;
214     for (int unichar_id : unichar_ids) {
215       RecodedCharID code;
216       int len = recoder_.EncodeUnichar(unichar_id, &code);
217       EXPECT_NE(0, len);
218       for (int j = 0; j < len; ++j) {
219         // Make the desired answer a clear winner.
220         if (j > 0 && code(j) == code(j - 1)) {
221           // We will collapse adjacent equal codes so put a null in between.
222           outputs(t++, encoded_null_char_) = 1.0f;
223         }
224         outputs(t++, code(j)) = 1.0f;
225       }
226       // Put a 0 as a null char in between.
227       outputs(t++, encoded_null_char_) = 1.0f;
228     }
229     // Normalize the probs.
230     for (int t = 0; t < width; ++t) {
231       double sum = 0.0;
232       for (int i = 0; i < num_codes; ++i) {
233         sum += outputs(t, i);
234       }
235       for (int i = 0; i < num_codes; ++i) {
236         outputs(t, i) /= sum;
237       }
238     }
239 
240     return outputs;
241   }
242   // Encodes a utf8 string (character) as unichar_id, then recodes, and sets
243   // the score for the appropriate sequence of codes, returning the ending t.
EncodeUTF8(const char * utf8_str,float score,int start_t,TRand * random,GENERIC_2D_ARRAY<float> * outputs)244   int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random,
245                  GENERIC_2D_ARRAY<float> *outputs) {
246     int t = start_t;
247     std::vector<int> unichar_ids;
248     EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));
249     if (unichar_ids.empty() || utf8_str[0] == '\0') {
250       unichar_ids.clear();
251       unichar_ids.push_back(unichar_null_char_);
252     }
253     int num_ids = unichar_ids.size();
254     for (int u = 0; u < num_ids; ++u) {
255       RecodedCharID code;
256       int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
257       EXPECT_NE(0, len);
258       for (int i = 0; i < len; ++i) {
259         // Apply the desired score.
260         (*outputs)(t++, code(i)) = score;
261         if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
262           int dups = static_cast<int>(random->UnsignedRand(3.0));
263           for (int d = 0; d < dups; ++d) {
264             // Duplicate the desired score.
265             (*outputs)(t++, code(i)) = score;
266           }
267         }
268       }
269       if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
270         int dups = static_cast<int>(random->UnsignedRand(3.0));
271         for (int d = 0; d < dups; ++d) {
272           // Add a random number of nulls as well.
273           (*outputs)(t++, encoded_null_char_) = score;
274         }
275       }
276     }
277     return t;
278   }
279   // Generates an encoding of the given 4 arrays as synthetic network scores.
280   // uses scores1 for chars1 and scores2 for chars2, and everything else gets
281   // the leftovers shared out equally. Note that empty string encodes as the
282   // null_char_.
GenerateSyntheticOutputs(const char * chars1[],const float scores1[],const char * chars2[],const float scores2[],TRand * random)283   GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char *chars1[], const float scores1[],
284                                                    const char *chars2[], const float scores2[],
285                                                    TRand *random) {
286     int width = 0;
287     while (chars1[width] != nullptr) {
288       ++width;
289     }
290     int padding = width * RecodedCharID::kMaxCodeLen;
291     int num_codes = recoder_.code_range();
292     GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
293     int t = 0;
294     for (int i = 0; i < width; ++i) {
295       // In case there is overlap in the codes between 1st and 2nd choice, it
296       // is better to encode the 2nd choice first.
297       int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
298       int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
299       // Advance t to the max end, setting everything else to the leftovers.
300       int max_t = std::max(end_t1, end_t2);
301       while (t < max_t) {
302         double total_score = 0.0;
303         for (int j = 0; j < num_codes; ++j) {
304           total_score += outputs(t, j);
305         }
306         double null_remainder = (1.0 - total_score) / 2.0;
307         double remainder = null_remainder / (num_codes - 2);
308         if (outputs(t, encoded_null_char_) < null_remainder) {
309           outputs(t, encoded_null_char_) += null_remainder;
310         } else {
311           remainder += remainder;
312         }
313         for (int j = 0; j < num_codes; ++j) {
314           if (outputs(t, j) == 0.0f) {
315             outputs(t, j) = remainder;
316           }
317         }
318         ++t;
319       }
320     }
321     // Fill the rest with null chars.
322     while (t < width + padding) {
323       outputs(t++, encoded_null_char_) = 1.0f;
324     }
325     return outputs;
326   }
327   UnicharCompress recoder_;
328   int unichar_null_char_ = 0;
329   int encoded_null_char_ = 0;
330   CCUtil ccutil_;
331   Dict lstm_dict_;
332 };
333 
TEST_F(RecodeBeamTest, DoesChinese) {
  // Loads the named unicharset and checks that the unichar ids from the first
  // non-special code up to (but excluding) kNumChars survive a round trip
  // through easy, padded network output.
  const auto round_trip = [this](const std::string &unicharset_name) {
    LoadUnicharset(unicharset_name);
    std::vector<int> truth_ids;
    for (int id = SPECIAL_UNICHAR_CODES_COUNT; id < kNumChars; ++id) {
      truth_ids.push_back(id);
    }
    const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
    ExpectCorrect(scores, truth_ids);
  };

  // Traditional Chinese first, then simplified.
  LOG(INFO) << "Testing chi_tra"
            << "\n";
  round_trip("chi_tra.unicharset");
  LOG(INFO) << "Testing chi_sim"
            << "\n";
  round_trip("chi_sim.unicharset");
}
356 
TEST_F(RecodeBeamTest, DoesJapanese) {
  LOG(INFO) << "Testing jpn"
            << "\n";
  LoadUnicharset("jpn.unicharset");
  // The truth is every unichar id from the first non-special code up to (but
  // excluding) kNumChars; easy output for it must decode back unchanged.
  std::vector<int> truth_ids;
  for (int id = SPECIAL_UNICHAR_CODES_COUNT; id < kNumChars; ++id) {
    truth_ids.push_back(id);
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
369 
TEST_F(RecodeBeamTest, DoesKorean) {
  LOG(INFO) << "Testing kor"
            << "\n";
  LoadUnicharset("kor.unicharset");
  // The truth is every unichar id from the first non-special code up to (but
  // excluding) kNumChars; easy output for it must decode back unchanged.
  std::vector<int> truth_ids;
  for (int id = SPECIAL_UNICHAR_CODES_COUNT; id < kNumChars; ++id) {
    truth_ids.push_back(id);
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
382 
TEST_F(RecodeBeamTest, DoesKannada) {
  LOG(INFO) << "Testing kan"
            << "\n";
  LoadUnicharset("kan.unicharset");
  // The truth is every unichar id from the first non-special code up to (but
  // excluding) kNumChars; easy output for it must decode back unchanged.
  std::vector<int> truth_ids;
  for (int id = SPECIAL_UNICHAR_CODES_COUNT; id < kNumChars; ++id) {
    truth_ids.push_back(id);
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
395 
TEST_F(RecodeBeamTest, DoesMarathi) {
  LOG(INFO) << "Testing mar"
            << "\n";
  LoadUnicharset("mar.unicharset");
  // The truth is every unichar id from the first non-special code up to (but
  // excluding) kNumChars; easy output for it must decode back unchanged.
  std::vector<int> truth_ids;
  for (int id = SPECIAL_UNICHAR_CODES_COUNT; id < kNumChars; ++id) {
    truth_ids.push_back(id);
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
408 
TEST_F(RecodeBeamTest, DoesEnglish) {
  LOG(INFO) << "Testing eng"
            << "\n";
  LoadUnicharset("eng.unicharset");
  // The truth is every unichar id from the first non-special code up to (but
  // excluding) kNumChars; easy output for it must decode back unchanged.
  std::vector<int> truth_ids;
  for (int id = SPECIAL_UNICHAR_CODES_COUNT; id < kNumChars; ++id) {
    truth_ids.push_back(id);
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
421 
TEST_F(RecodeBeamTest, DISABLED_EngDictionary) {
  LOG(INFO) << "Testing eng dictionary"
            << "\n";
  LoadUnicharset("eng_beam.unicharset");
  const GENERIC_2D_ARRAY<float> scores =
      GenerateSyntheticOutputs(kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);

  // Without a dictionary the beam search should simply return the
  // concatenation of the top choices.
  std::string top_choice_text;
  for (const char *const *top = kGWRTops; *top != nullptr; ++top) {
    top_choice_text += *top;
  }
  PointerVector<WERD_RES> word_results;
  ExpectCorrect(scores, top_choice_text, nullptr, &word_results);

  // With the dictionary loaded, the same scores should yield the desired
  // phrase instead.
  LoadDict("eng_beam");
  ExpectCorrect(scores, "Gets words right.", &lstm_dict_, &word_results);
}
438 
TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
  LOG(INFO) << "Testing zh_hans dictionary"
            << "\n";
  LoadUnicharset("zh_hans.unicharset");
  const GENERIC_2D_ARRAY<float> scores =
      GenerateSyntheticOutputs(kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);
  PointerVector<WERD_RES> word_results;

  // Pass 1, no dictionary: every character is its own word, chosen purely as
  // the top choice.
  ExpectCorrect(scores, "实学储啬投学生", nullptr, &word_results);
  EXPECT_EQ(7, word_results.size());
  for (int i = 0; i < word_results.size(); ++i) {
    EXPECT_EQ(TOP_CHOICE_PERM, word_results[i]->best_choice->permuter());
  }

  // Pass 2, with the dictionary: the text is unchanged but dictionary words
  // are grouped together.
  LoadDict("zh_hans");
  ExpectCorrect(scores, "实学储啬投学生", &lstm_dict_, &word_results);

  // Expected segmentation: the text of each word and the permuter that
  // should have produced it.
  struct ExpectedWord {
    const char *text;
    int permuter;
  };
  const int kNumWords = 5;
  const ExpectedWord kExpected[kNumWords] = {{"实学", SYSTEM_DAWG_PERM},
                                             {"储", TOP_CHOICE_PERM},
                                             {"啬", TOP_CHOICE_PERM},
                                             {"投", TOP_CHOICE_PERM},
                                             {"学生", SYSTEM_DAWG_PERM}};
  EXPECT_EQ(kNumWords, word_results.size());
  // Only compare the words that are actually present.
  for (int i = 0; i < kNumWords && i < word_results.size(); ++i) {
    EXPECT_STREQ(kExpected[i].text, word_results[i]->best_choice->unichar_string().c_str());
    EXPECT_EQ(kExpected[i].permuter, word_results[i]->best_choice->permuter());
  }
}
468 
469 // Tests that a recoder built with decomposed unicode allows true ctc
470 // arbitrary duplicates and inserted nulls inside the multicode sequence.
TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) {
  LOG(INFO) << "Testing duplicates in multi-code sequences"
            << "\n";
  LoadUnicharset("vie.d.unicharset");
  tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset);

  // Passing a random generator makes the synthetic output contain random
  // duplicates of codes and random extra nulls.
  TRand rng;
  const GENERIC_2D_ARRAY<float> scores =
      GenerateSyntheticOutputs(kViTops, kViTopScores, kVi2nds, kVi2ndScores, &rng);

  // The expected text is the NFKC-normalized form of the phrase.
  std::string expected_text;
  tesseract::NormalizeUTF8String(tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,
                                 tesseract::GraphemeNorm::kNone, "vậy tội", &expected_text);

  PointerVector<WERD_RES> word_results;
  ExpectCorrect(scores, expected_text, nullptr, &word_results);
}
485 
486 } // namespace tesseract
487