1 ///////////////////////////////////////////////////////////////////////
2 // File:        combine_tessdata.cpp
3 // Description: Creates a unified traineddata file from several
4 //              data files produced by the training process.
5 // Author:      Daria Antonova
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #include "commontraining.h" // CheckSharedLibraryVersion
21 #include "lstmrecognizer.h"
22 #include "tessdatamanager.h"
23 
24 #include <cerrno>
25 #include <iostream> // std::cout
26 
27 using namespace tesseract;
28 
list_components(TessdataManager & tm,const char * filename)29 static int list_components(TessdataManager &tm, const char *filename) {
30   // Initialize TessdataManager with the data in the given traineddata file.
31   if (filename != nullptr && !tm.Init(filename)) {
32     tprintf("Failed to read %s\n", filename);
33     return EXIT_FAILURE;
34   }
35   tm.Directory();
36   return EXIT_SUCCESS;
37 }
38 
list_network(TessdataManager & tm,const char * filename)39 static int list_network(TessdataManager &tm, const char *filename) {
40   if (filename != nullptr && !tm.Init(filename)) {
41     tprintf("Failed to read %s\n", filename);
42     return EXIT_FAILURE;
43   }
44   tesseract::TFile fp;
45   if (tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
46     tesseract::LSTMRecognizer recognizer;
47     if (!recognizer.DeSerialize(&tm, &fp)) {
48       tprintf("Failed to deserialize LSTM in %s!\n", filename);
49       return EXIT_FAILURE;
50     }
51     std::cout << "LSTM: network=" << recognizer.GetNetwork()
52               << ", int_mode=" << recognizer.IsIntMode()
53               << ", recoding=" << recognizer.IsRecoding()
54               << ", iteration=" << recognizer.training_iteration()
55               << ", sample_iteration=" << recognizer.sample_iteration()
56               << ", null_char=" << recognizer.null_char()
57               << ", learning_rate=" << recognizer.learning_rate()
58               << ", momentum=" << recognizer.GetMomentum()
59               << ", adam_beta=" << recognizer.GetAdamBeta() << '\n';
60 
61     std::cout << "Layer Learning Rates: ";
62     auto layers = recognizer.EnumerateLayers();
63     for (const auto &id : layers) {
64       auto layer = recognizer.GetLayer(id);
65       std::cout << id << "(" << layer->name() << ")"
66                 << "=" << recognizer.GetLayerLearningRate(id)
67                 << (layers[layers.size() - 1] != id ? ", " : "");
68     }
69     std::cout << "\n";
70   }
71   return EXIT_SUCCESS;
72 }
73 
74 // Main program to combine/extract/overwrite tessdata components
75 // in [lang].traineddata files.
76 //
77 // To combine all the individual tessdata components (unicharset, DAWGs,
78 // classifier templates, ambiguities, language configs) located at, say,
79 // /home/$USER/temp/eng.* run:
80 //
81 //   combine_tessdata /home/$USER/temp/eng.
82 //
83 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
84 //
85 // Specify option -e if you would like to extract individual components
86 // from a combined traineddata file. For example, to extract language config
87 // file and the unicharset from tessdata/eng.traineddata run:
88 //
89 //   combine_tessdata -e tessdata/eng.traineddata
90 //   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
91 //
92 // The desired config file and unicharset will be written to
93 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
94 //
95 // Specify option -o to overwrite individual components of the given
96 // [lang].traineddata file. For example, to overwrite language config
97 // and unichar ambiguities files in tessdata/eng.traineddata use:
98 //
99 //   combine_tessdata -o tessdata/eng.traineddata
100 //   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
101 //
// As a result, tessdata/eng.traineddata will contain the new language config
// and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
104 //
105 // Note: the file names of the files to extract to and to overwrite from should
106 // have the appropriate file suffixes (extensions) indicating their tessdata
107 // component type (.unicharset for the unicharset, .unicharambigs for unichar
108 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
109 //
110 // Specify option -u to unpack all the components to the specified path:
111 //
112 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
113 //
114 // This will create  /home/$USER/temp/eng.* files with individual tessdata
115 // components from tessdata/eng.traineddata.
116 //
main(int argc,char ** argv)117 int main(int argc, char **argv) {
118   tesseract::CheckSharedLibraryVersion();
119 
120   int i;
121   tesseract::TessdataManager tm;
122   if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
123     printf("%s\n", tesseract::TessBaseAPI::Version());
124     return EXIT_SUCCESS;
125   } else if (argc == 2) {
126     printf("Combining tessdata files\n");
127     std::string lang = argv[1];
128     char *last = &argv[1][strlen(argv[1]) - 1];
129     if (*last != '.') {
130       lang += '.';
131     }
132     std::string output_file = lang;
133     output_file += kTrainedDataSuffix;
134     if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) {
135       printf("Error combining tessdata files into %s\n", output_file.c_str());
136     } else {
137       printf("Output %s created successfully.\n", output_file.c_str());
138     }
139   } else if (argc >= 4 &&
140              (strcmp(argv[1], "-e") == 0 || strcmp(argv[1], "-u") == 0)) {
141     // Initialize TessdataManager with the data in the given traineddata file.
142     if (!tm.Init(argv[2])) {
143       tprintf("Failed to read %s\n", argv[2]);
144       return EXIT_FAILURE;
145     }
146     printf("Extracting tessdata components from %s\n", argv[2]);
147     if (strcmp(argv[1], "-e") == 0) {
148       for (i = 3; i < argc; ++i) {
149         errno = 0;
150         if (tm.ExtractToFile(argv[i])) {
151           printf("Wrote %s\n", argv[i]);
152         } else if (errno == 0) {
153           printf(
154               "Not extracting %s, since this component"
155               " is not present\n",
156               argv[i]);
157           return EXIT_FAILURE;
158         } else {
159           printf("Error, could not extract %s: %s\n", argv[i], strerror(errno));
160           return EXIT_FAILURE;
161         }
162       }
163     } else { // extract all the components
164       for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
165         std::string filename = argv[3];
166         char *last = &argv[3][strlen(argv[3]) - 1];
167         if (*last != '.') {
168           filename += '.';
169         }
170         filename += tesseract::kTessdataFileSuffixes[i];
171         errno = 0;
172         if (tm.ExtractToFile(filename.c_str())) {
173           printf("Wrote %s\n", filename.c_str());
174         } else if (errno != 0) {
175           printf("Error, could not extract %s: %s\n", filename.c_str(),
176                  strerror(errno));
177           return EXIT_FAILURE;
178         }
179       }
180     }
181   } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
182     // Rename the current traineddata file to a temporary name.
183     const char *new_traineddata_filename = argv[2];
184     std::string traineddata_filename = new_traineddata_filename;
185     traineddata_filename += ".__tmp__";
186     if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) {
187       tprintf("Failed to create a temporary file %s\n",
188               traineddata_filename.c_str());
189       return EXIT_FAILURE;
190     }
191 
192     // Initialize TessdataManager with the data in the given traineddata file.
193     tm.Init(traineddata_filename.c_str());
194 
195     // Write the updated traineddata file.
196     tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3);
197   } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
198     if (!tm.Init(argv[2])) {
199       tprintf("Failed to read %s\n", argv[2]);
200       return EXIT_FAILURE;
201     }
202     tesseract::TFile fp;
203     if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
204       tprintf("No LSTM Component found in %s!\n", argv[2]);
205       return EXIT_FAILURE;
206     }
207     tesseract::LSTMRecognizer recognizer;
208     if (!recognizer.DeSerialize(&tm, &fp)) {
209       tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
210       return EXIT_FAILURE;
211     }
212     recognizer.ConvertToInt();
213     std::vector<char> lstm_data;
214     fp.OpenWrite(&lstm_data);
215     ASSERT_HOST(recognizer.Serialize(&tm, &fp));
216     tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
217                       lstm_data.size());
218     if (!tm.SaveFile(argv[2], nullptr)) {
219       tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
220       return EXIT_FAILURE;
221     }
222   } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
223     return list_components(tm, argv[2]);
224   } else if (argc == 3 && strcmp(argv[1], "-l") == 0) {
225     return list_network(tm, argv[2]);
226   } else if (argc == 3 && strcmp(argv[1], "-dl") == 0) {
227     int result = list_components(tm, argv[2]);
228     if (result == EXIT_SUCCESS) {
229       result = list_network(tm, nullptr);
230     }
231     return result;
232   } else if (argc == 3 && strcmp(argv[1], "-ld") == 0) {
233     int result = list_network(tm, argv[2]);
234     if (result == EXIT_SUCCESS) {
235       result = list_components(tm, nullptr);
236     }
237     return result;
238   } else {
239     printf(
240         "Usage for combining tessdata components:\n"
241         "  %s language_data_path_prefix\n"
242         "  (e.g. %s tessdata/eng.)\n\n",
243         argv[0], argv[0]);
244     printf(
245         "Usage for extracting tessdata components:\n"
246         "  %s -e traineddata_file [output_component_file...]\n"
247         "  (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
248         argv[0], argv[0]);
249     printf(
250         "Usage for overwriting tessdata components:\n"
251         "  %s -o traineddata_file [input_component_file...]\n"
252         "  (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
253         argv[0], argv[0]);
254     printf(
255         "Usage for unpacking all tessdata components:\n"
256         "  %s -u traineddata_file output_path_prefix\n"
257         "  (e.g. %s -u eng.traineddata tmp/eng.)\n\n",
258         argv[0], argv[0]);
259     printf(
260         "Usage for listing the network information\n"
261         "  %s -l traineddata_file\n"
262         "  (e.g. %s -l eng.traineddata)\n\n",
263         argv[0], argv[0]);
264     printf(
265         "Usage for listing directory of components:\n"
266         "  %s -d traineddata_file\n\n",
267         argv[0]);
268     printf(
269         "Usage for compacting LSTM component to int:\n"
270         "  %s -c traineddata_file\n",
271         argv[0]);
272     return 1;
273   }
274   tm.Directory();
275   return EXIT_SUCCESS;
276 }
277