1 ///////////////////////////////////////////////////////////////////////
2 // File: combine_tessdata.cpp
3 // Description: Creates a unified traineddata file from several
4 // data files produced by the training process.
5 // Author: Daria Antonova
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19
20 #include "commontraining.h" // CheckSharedLibraryVersion
21 #include "lstmrecognizer.h"
22 #include "tessdatamanager.h"
23
24 #include <cerrno>
25 #include <iostream> // std::cout
26
27 using namespace tesseract;
28
list_components(TessdataManager & tm,const char * filename)29 static int list_components(TessdataManager &tm, const char *filename) {
30 // Initialize TessdataManager with the data in the given traineddata file.
31 if (filename != nullptr && !tm.Init(filename)) {
32 tprintf("Failed to read %s\n", filename);
33 return EXIT_FAILURE;
34 }
35 tm.Directory();
36 return EXIT_SUCCESS;
37 }
38
list_network(TessdataManager & tm,const char * filename)39 static int list_network(TessdataManager &tm, const char *filename) {
40 if (filename != nullptr && !tm.Init(filename)) {
41 tprintf("Failed to read %s\n", filename);
42 return EXIT_FAILURE;
43 }
44 tesseract::TFile fp;
45 if (tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
46 tesseract::LSTMRecognizer recognizer;
47 if (!recognizer.DeSerialize(&tm, &fp)) {
48 tprintf("Failed to deserialize LSTM in %s!\n", filename);
49 return EXIT_FAILURE;
50 }
51 std::cout << "LSTM: network=" << recognizer.GetNetwork()
52 << ", int_mode=" << recognizer.IsIntMode()
53 << ", recoding=" << recognizer.IsRecoding()
54 << ", iteration=" << recognizer.training_iteration()
55 << ", sample_iteration=" << recognizer.sample_iteration()
56 << ", null_char=" << recognizer.null_char()
57 << ", learning_rate=" << recognizer.learning_rate()
58 << ", momentum=" << recognizer.GetMomentum()
59 << ", adam_beta=" << recognizer.GetAdamBeta() << '\n';
60
61 std::cout << "Layer Learning Rates: ";
62 auto layers = recognizer.EnumerateLayers();
63 for (const auto &id : layers) {
64 auto layer = recognizer.GetLayer(id);
65 std::cout << id << "(" << layer->name() << ")"
66 << "=" << recognizer.GetLayerLearningRate(id)
67 << (layers[layers.size() - 1] != id ? ", " : "");
68 }
69 std::cout << "\n";
70 }
71 return EXIT_SUCCESS;
72 }
73
74 // Main program to combine/extract/overwrite tessdata components
75 // in [lang].traineddata files.
76 //
77 // To combine all the individual tessdata components (unicharset, DAWGs,
78 // classifier templates, ambiguities, language configs) located at, say,
79 // /home/$USER/temp/eng.* run:
80 //
81 // combine_tessdata /home/$USER/temp/eng.
82 //
83 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
84 //
85 // Specify option -e if you would like to extract individual components
86 // from a combined traineddata file. For example, to extract language config
87 // file and the unicharset from tessdata/eng.traineddata run:
88 //
89 // combine_tessdata -e tessdata/eng.traineddata
90 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
91 //
92 // The desired config file and unicharset will be written to
93 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
94 //
95 // Specify option -o to overwrite individual components of the given
96 // [lang].traineddata file. For example, to overwrite language config
97 // and unichar ambiguities files in tessdata/eng.traineddata use:
98 //
99 // combine_tessdata -o tessdata/eng.traineddata
100 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
101 //
102 // As a result, tessdata/eng.traineddata will contain the new language config
103 // and unichar ambigs, plus all the original DAWGs, classifier teamples, etc.
104 //
105 // Note: the file names of the files to extract to and to overwrite from should
106 // have the appropriate file suffixes (extensions) indicating their tessdata
107 // component type (.unicharset for the unicharset, .unicharambigs for unichar
108 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
109 //
110 // Specify option -u to unpack all the components to the specified path:
111 //
112 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
113 //
114 // This will create /home/$USER/temp/eng.* files with individual tessdata
115 // components from tessdata/eng.traineddata.
116 //
main(int argc,char ** argv)117 int main(int argc, char **argv) {
118 tesseract::CheckSharedLibraryVersion();
119
120 int i;
121 tesseract::TessdataManager tm;
122 if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
123 printf("%s\n", tesseract::TessBaseAPI::Version());
124 return EXIT_SUCCESS;
125 } else if (argc == 2) {
126 printf("Combining tessdata files\n");
127 std::string lang = argv[1];
128 char *last = &argv[1][strlen(argv[1]) - 1];
129 if (*last != '.') {
130 lang += '.';
131 }
132 std::string output_file = lang;
133 output_file += kTrainedDataSuffix;
134 if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) {
135 printf("Error combining tessdata files into %s\n", output_file.c_str());
136 } else {
137 printf("Output %s created successfully.\n", output_file.c_str());
138 }
139 } else if (argc >= 4 &&
140 (strcmp(argv[1], "-e") == 0 || strcmp(argv[1], "-u") == 0)) {
141 // Initialize TessdataManager with the data in the given traineddata file.
142 if (!tm.Init(argv[2])) {
143 tprintf("Failed to read %s\n", argv[2]);
144 return EXIT_FAILURE;
145 }
146 printf("Extracting tessdata components from %s\n", argv[2]);
147 if (strcmp(argv[1], "-e") == 0) {
148 for (i = 3; i < argc; ++i) {
149 errno = 0;
150 if (tm.ExtractToFile(argv[i])) {
151 printf("Wrote %s\n", argv[i]);
152 } else if (errno == 0) {
153 printf(
154 "Not extracting %s, since this component"
155 " is not present\n",
156 argv[i]);
157 return EXIT_FAILURE;
158 } else {
159 printf("Error, could not extract %s: %s\n", argv[i], strerror(errno));
160 return EXIT_FAILURE;
161 }
162 }
163 } else { // extract all the components
164 for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
165 std::string filename = argv[3];
166 char *last = &argv[3][strlen(argv[3]) - 1];
167 if (*last != '.') {
168 filename += '.';
169 }
170 filename += tesseract::kTessdataFileSuffixes[i];
171 errno = 0;
172 if (tm.ExtractToFile(filename.c_str())) {
173 printf("Wrote %s\n", filename.c_str());
174 } else if (errno != 0) {
175 printf("Error, could not extract %s: %s\n", filename.c_str(),
176 strerror(errno));
177 return EXIT_FAILURE;
178 }
179 }
180 }
181 } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
182 // Rename the current traineddata file to a temporary name.
183 const char *new_traineddata_filename = argv[2];
184 std::string traineddata_filename = new_traineddata_filename;
185 traineddata_filename += ".__tmp__";
186 if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) {
187 tprintf("Failed to create a temporary file %s\n",
188 traineddata_filename.c_str());
189 return EXIT_FAILURE;
190 }
191
192 // Initialize TessdataManager with the data in the given traineddata file.
193 tm.Init(traineddata_filename.c_str());
194
195 // Write the updated traineddata file.
196 tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3);
197 } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
198 if (!tm.Init(argv[2])) {
199 tprintf("Failed to read %s\n", argv[2]);
200 return EXIT_FAILURE;
201 }
202 tesseract::TFile fp;
203 if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
204 tprintf("No LSTM Component found in %s!\n", argv[2]);
205 return EXIT_FAILURE;
206 }
207 tesseract::LSTMRecognizer recognizer;
208 if (!recognizer.DeSerialize(&tm, &fp)) {
209 tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
210 return EXIT_FAILURE;
211 }
212 recognizer.ConvertToInt();
213 std::vector<char> lstm_data;
214 fp.OpenWrite(&lstm_data);
215 ASSERT_HOST(recognizer.Serialize(&tm, &fp));
216 tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
217 lstm_data.size());
218 if (!tm.SaveFile(argv[2], nullptr)) {
219 tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
220 return EXIT_FAILURE;
221 }
222 } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
223 return list_components(tm, argv[2]);
224 } else if (argc == 3 && strcmp(argv[1], "-l") == 0) {
225 return list_network(tm, argv[2]);
226 } else if (argc == 3 && strcmp(argv[1], "-dl") == 0) {
227 int result = list_components(tm, argv[2]);
228 if (result == EXIT_SUCCESS) {
229 result = list_network(tm, nullptr);
230 }
231 return result;
232 } else if (argc == 3 && strcmp(argv[1], "-ld") == 0) {
233 int result = list_network(tm, argv[2]);
234 if (result == EXIT_SUCCESS) {
235 result = list_components(tm, nullptr);
236 }
237 return result;
238 } else {
239 printf(
240 "Usage for combining tessdata components:\n"
241 " %s language_data_path_prefix\n"
242 " (e.g. %s tessdata/eng.)\n\n",
243 argv[0], argv[0]);
244 printf(
245 "Usage for extracting tessdata components:\n"
246 " %s -e traineddata_file [output_component_file...]\n"
247 " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
248 argv[0], argv[0]);
249 printf(
250 "Usage for overwriting tessdata components:\n"
251 " %s -o traineddata_file [input_component_file...]\n"
252 " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
253 argv[0], argv[0]);
254 printf(
255 "Usage for unpacking all tessdata components:\n"
256 " %s -u traineddata_file output_path_prefix\n"
257 " (e.g. %s -u eng.traineddata tmp/eng.)\n\n",
258 argv[0], argv[0]);
259 printf(
260 "Usage for listing the network information\n"
261 " %s -l traineddata_file\n"
262 " (e.g. %s -l eng.traineddata)\n\n",
263 argv[0], argv[0]);
264 printf(
265 "Usage for listing directory of components:\n"
266 " %s -d traineddata_file\n\n",
267 argv[0]);
268 printf(
269 "Usage for compacting LSTM component to int:\n"
270 " %s -c traineddata_file\n",
271 argv[0]);
272 return 1;
273 }
274 tm.Directory();
275 return EXIT_SUCCESS;
276 }
277