1 // Copyright 2017 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 // Purpose: Collection of convenience functions to simplify creation of the
4 //          unicharset, recoder, and dawgs for an LSTM model.
5 
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include "lang_model_helpers.h"
17 
18 #include "dawg.h"
19 #include "fileio.h"
20 #include "tessdatamanager.h"
21 #include "trie.h"
22 #include "unicharcompress.h"
23 
24 #include <cstdlib>
25 
26 #include <sys/stat.h>
27 #include <sys/types.h>
28 
29 #if defined(_WIN32)
30 #  include <direct.h>
31 #endif
32 
33 namespace tesseract {
34 
35 // Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data
36 // to the file, using writer if not null, otherwise, a default writer.
37 // Default writer will overwrite any existing file, but a supplied writer
38 // can do its own thing. If lang is empty, returns true but does nothing.
39 // NOTE that suffix should contain any required . for the filename.
WriteFile(const std::string & output_dir,const std::string & lang,const std::string & suffix,const std::vector<char> & data,FileWriter writer)40 bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix,
41                const std::vector<char> &data, FileWriter writer) {
42   if (lang.empty()) {
43     return true;
44   }
45   std::string dirname = output_dir + "/" + lang;
46   // Attempt to make the directory, but ignore errors, as it may not be a
47   // standard filesystem, and the writer will complain if not successful.
48 #if defined(_WIN32)
49   _mkdir(dirname.c_str());
50 #else
51   mkdir(dirname.c_str(), S_IRWXU | S_IRWXG);
52 #endif
53   std::string filename = dirname + "/" + lang + suffix;
54   if (writer == nullptr) {
55     return SaveDataToFile(data, filename.c_str());
56   } else {
57     return (*writer)(data, filename.c_str());
58   }
59 }
60 
61 // Helper reads a file with optional reader and returns a string.
62 // On failure emits a warning message and returns an empty string.
ReadFile(const std::string & filename,FileReader reader)63 std::string ReadFile(const std::string &filename, FileReader reader) {
64   if (filename.empty()) {
65     return std::string();
66   }
67   std::vector<char> data;
68   bool read_result;
69   if (reader == nullptr) {
70     read_result = LoadDataFromFile(filename.c_str(), &data);
71   } else {
72     read_result = (*reader)(filename.c_str(), &data);
73   }
74   if (read_result) {
75     return std::string(&data[0], data.size());
76   }
77   tprintf("Failed to read data from: %s\n", filename.c_str());
78   return std::string();
79 }
80 
81 // Helper writes the unicharset to file and to the traineddata.
WriteUnicharset(const UNICHARSET & unicharset,const std::string & output_dir,const std::string & lang,FileWriter writer,TessdataManager * traineddata)82 bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir,
83                      const std::string &lang, FileWriter writer, TessdataManager *traineddata) {
84   std::vector<char> unicharset_data;
85   TFile fp;
86   fp.OpenWrite(&unicharset_data);
87   if (!unicharset.save_to_file(&fp)) {
88     return false;
89   }
90   traineddata->OverwriteEntry(TESSDATA_LSTM_UNICHARSET, &unicharset_data[0],
91                               unicharset_data.size());
92   return WriteFile(output_dir, lang, ".unicharset", unicharset_data, writer);
93 }
94 
95 // Helper creates the recoder and writes it to the traineddata, and a human-
96 // readable form to file.
WriteRecoder(const UNICHARSET & unicharset,bool pass_through,const std::string & output_dir,const std::string & lang,FileWriter writer,std::string * radical_table_data,TessdataManager * traineddata)97 bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir,
98                   const std::string &lang, FileWriter writer, std::string *radical_table_data,
99                   TessdataManager *traineddata) {
100   UnicharCompress recoder;
101   // Where the unicharset is carefully setup already to contain a good
102   // compact encoding, use a pass-through recoder that does nothing.
103   // For scripts that have a large number of unicodes (Han, Hangul) we want
104   // to use the recoder to compress the symbol space by re-encoding each
105   // unicode as multiple codes from a smaller 'alphabet' that are related to the
106   // shapes in the character. Hangul Jamo is a perfect example of this.
107   // See the Hangul Syllables section, sub-section "Equivalence" in:
108   // http://www.unicode.org/versions/Unicode10.0.0/ch18.pdf
109   if (pass_through) {
110     recoder.SetupPassThrough(unicharset);
111   } else {
112     int null_char = unicharset.has_special_codes() ? UNICHAR_BROKEN : unicharset.size();
113     tprintf("Null char=%d\n", null_char);
114     if (!recoder.ComputeEncoding(unicharset, null_char, radical_table_data)) {
115       tprintf("Creation of encoded unicharset failed!!\n");
116       return false;
117     }
118   }
119   TFile fp;
120   std::vector<char> recoder_data;
121   fp.OpenWrite(&recoder_data);
122   if (!recoder.Serialize(&fp)) {
123     return false;
124   }
125   traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0], recoder_data.size());
126   std::string encoding = recoder.GetEncodingAsString(unicharset);
127   recoder_data.resize(encoding.length(), 0);
128   memcpy(&recoder_data[0], &encoding[0], encoding.length());
129   std::string suffix;
130   suffix += ".charset_size=" + std::to_string(recoder.code_range());
131   suffix += ".txt";
132   return WriteFile(output_dir, lang, suffix.c_str(), recoder_data, writer);
133 }
134 
135 // Helper builds a dawg from the given words, using the unicharset as coding,
136 // and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.
WriteDawg(const std::vector<std::string> & words,const UNICHARSET & unicharset,Trie::RTLReversePolicy reverse_policy,TessdataType file_type,TessdataManager * traineddata)137 static bool WriteDawg(const std::vector<std::string> &words, const UNICHARSET &unicharset,
138                       Trie::RTLReversePolicy reverse_policy, TessdataType file_type,
139                       TessdataManager *traineddata) {
140   // The first 3 arguments are not used in this case.
141   Trie trie(DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM, unicharset.size(), 0);
142   trie.add_word_list(words, unicharset, reverse_policy);
143   tprintf("Reducing Trie to SquishedDawg\n");
144   std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg());
145   if (dawg == nullptr || dawg->NumEdges() == 0) {
146     return false;
147   }
148   TFile fp;
149   std::vector<char> dawg_data;
150   fp.OpenWrite(&dawg_data);
151   if (!dawg->write_squished_dawg(&fp)) {
152     return false;
153   }
154   traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.size());
155   return true;
156 }
157 
158 // Builds and writes the dawgs, given a set of words, punctuation
159 // patterns, number patterns, to the traineddata. Encoding uses the given
160 // unicharset, and the punc dawgs is reversed if lang_is_rtl.
WriteDawgs(const std::vector<std::string> & words,const std::vector<std::string> & puncs,const std::vector<std::string> & numbers,bool lang_is_rtl,const UNICHARSET & unicharset,TessdataManager * traineddata)161 static bool WriteDawgs(const std::vector<std::string> &words, const std::vector<std::string> &puncs,
162                        const std::vector<std::string> &numbers, bool lang_is_rtl,
163                        const UNICHARSET &unicharset, TessdataManager *traineddata) {
164   if (puncs.empty()) {
165     tprintf("Must have non-empty puncs list to use language models!!\n");
166     return false;
167   }
168   // For each of the dawg types, make the dawg, and write to traineddata.
169   // Dawgs are reversed as follows:
170   // Words: According to the word content.
171   // Puncs: According to lang_is_rtl.
172   // Numbers: Never.
173   // System dawg (main wordlist).
174   if (!words.empty() && !WriteDawg(words, unicharset, Trie::RRP_REVERSE_IF_HAS_RTL,
175                                    TESSDATA_LSTM_SYSTEM_DAWG, traineddata)) {
176     return false;
177   }
178   // punc/punc-dawg.
179   Trie::RTLReversePolicy reverse_policy =
180       lang_is_rtl ? Trie::RRP_FORCE_REVERSE : Trie::RRP_DO_NO_REVERSE;
181   if (!WriteDawg(puncs, unicharset, reverse_policy, TESSDATA_LSTM_PUNC_DAWG, traineddata)) {
182     return false;
183   }
184   // numbers/number-dawg.
185   if (!numbers.empty() && !WriteDawg(numbers, unicharset, Trie::RRP_DO_NO_REVERSE,
186                                      TESSDATA_LSTM_NUMBER_DAWG, traineddata)) {
187     return false;
188   }
189   return true;
190 }
191 
192 // The main function for combine_lang_model.cpp.
193 // Returns EXIT_SUCCESS or EXIT_FAILURE for error.
CombineLangModel(const UNICHARSET & unicharset,const std::string & script_dir,const std::string & version_str,const std::string & output_dir,const std::string & lang,bool pass_through_recoder,const std::vector<std::string> & words,const std::vector<std::string> & puncs,const std::vector<std::string> & numbers,bool lang_is_rtl,FileReader reader,FileWriter writer)194 int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir,
195                      const std::string &version_str, const std::string &output_dir,
196                      const std::string &lang, bool pass_through_recoder,
197                      const std::vector<std::string> &words, const std::vector<std::string> &puncs,
198                      const std::vector<std::string> &numbers, bool lang_is_rtl, FileReader reader,
199                      FileWriter writer) {
200   // Build the traineddata file.
201   TessdataManager traineddata;
202   if (!version_str.empty()) {
203     traineddata.SetVersionString(traineddata.VersionString() + ":" + version_str);
204   }
205   // Unicharset and recoder.
206   if (!WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) {
207     tprintf("Error writing unicharset!!\n");
208     return EXIT_FAILURE;
209   } else {
210     tprintf("Config file is optional, continuing...\n");
211   }
212   // If there is a config file, read it and add to traineddata.
213   std::string config_filename = script_dir + "/" + lang + "/" + lang + ".config";
214   std::string config_file = ReadFile(config_filename, reader);
215   if (config_file.length() > 0) {
216     traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0], config_file.length());
217   }
218   std::string radical_filename = script_dir + "/radical-stroke.txt";
219   std::string radical_data = ReadFile(radical_filename, reader);
220   if (radical_data.empty()) {
221     tprintf("Error reading radical code table %s\n", radical_filename.c_str());
222     return EXIT_FAILURE;
223   }
224   if (!WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer, &radical_data,
225                     &traineddata)) {
226     tprintf("Error writing recoder!!\n");
227   }
228   if (!words.empty() || !puncs.empty() || !numbers.empty()) {
229     if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset, &traineddata)) {
230       tprintf("Error during conversion of wordlists to DAWGs!!\n");
231       return EXIT_FAILURE;
232     }
233   }
234 
235   // Traineddata file.
236   std::vector<char> traineddata_data;
237   traineddata.Serialize(&traineddata_data);
238   if (!WriteFile(output_dir, lang, ".traineddata", traineddata_data, writer)) {
239     tprintf("Error writing output traineddata file!!\n");
240     return EXIT_FAILURE;
241   }
242   return EXIT_SUCCESS;
243 }
244 
245 } // namespace tesseract
246