1 ///////////////////////////////////////////////////////////////////////
2 // File:        tessdatamanager.h
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author:      Daria Antonova
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
20 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21 
22 #include <tesseract/baseapi.h> // FileReader
23 #include <string>              // std::string
24 #include <vector>              // std::vector
25 #include "serialis.h"          // FileWriter
26 
// Suffix of a combined traineddata file, i.e. the single file that bundles
// the individual components named below.
static constexpr char kTrainedDataSuffix[] = "traineddata";

// File suffix of every individual tessdata component.
//
// The declaration order mirrors the TessdataType enum and the
// kTessdataFileSuffixes table further down in this header. When a new
// component type is introduced, add its suffix here and extend both the enum
// and the table. (The original note also lists kTessdataFileIsText, which is
// not declared in this header.)

// Language configuration.
static constexpr char kLangConfigFileSuffix[] = "config";
// Character set and ambiguity data.
static constexpr char kUnicharsetFileSuffix[] = "unicharset";
static constexpr char kAmbigsFileSuffix[] = "unicharambigs";
// Built-in templates, cutoffs and normalization prototypes.
static constexpr char kBuiltInTemplatesFileSuffix[] = "inttemp";
static constexpr char kBuiltInCutoffsFileSuffix[] = "pffmtable";
static constexpr char kNormProtoFileSuffix[] = "normproto";
// Dawg (dictionary) components.
static constexpr char kPuncDawgFileSuffix[] = "punc-dawg";
static constexpr char kSystemDawgFileSuffix[] = "word-dawg";
static constexpr char kNumberDawgFileSuffix[] = "number-dawg";
static constexpr char kFreqDawgFileSuffix[] = "freq-dawg";
// Marked deprecated in the TessdataType enum.
static constexpr char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
static constexpr char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
static constexpr char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
// Shape table, further dawgs and the params model.
static constexpr char kShapeTableFileSuffix[] = "shapetable";
static constexpr char kBigramDawgFileSuffix[] = "bigram-dawg";
static constexpr char kUnambigDawgFileSuffix[] = "unambig-dawg";
static constexpr char kParamsModelFileSuffix[] = "params-model";
// LSTM components.
static constexpr char kLSTMModelFileSuffix[] = "lstm";
static constexpr char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
static constexpr char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
static constexpr char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
static constexpr char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
static constexpr char kLSTMRecoderFileSuffix[] = "lstm-recoder";
// Version string component.
static constexpr char kVersionFileSuffix[] = "version";
55 
56 namespace tesseract {
57 
58 enum TessdataType {
59   TESSDATA_LANG_CONFIG,        // 0
60   TESSDATA_UNICHARSET,         // 1
61   TESSDATA_AMBIGS,             // 2
62   TESSDATA_INTTEMP,            // 3
63   TESSDATA_PFFMTABLE,          // 4
64   TESSDATA_NORMPROTO,          // 5
65   TESSDATA_PUNC_DAWG,          // 6
66   TESSDATA_SYSTEM_DAWG,        // 7
67   TESSDATA_NUMBER_DAWG,        // 8
68   TESSDATA_FREQ_DAWG,          // 9
69   TESSDATA_FIXED_LENGTH_DAWGS, // 10  // deprecated
70   TESSDATA_CUBE_UNICHARSET,    // 11  // deprecated
71   TESSDATA_CUBE_SYSTEM_DAWG,   // 12  // deprecated
72   TESSDATA_SHAPE_TABLE,        // 13
73   TESSDATA_BIGRAM_DAWG,        // 14
74   TESSDATA_UNAMBIG_DAWG,       // 15
75   TESSDATA_PARAMS_MODEL,       // 16
76   TESSDATA_LSTM,               // 17
77   TESSDATA_LSTM_PUNC_DAWG,     // 18
78   TESSDATA_LSTM_SYSTEM_DAWG,   // 19
79   TESSDATA_LSTM_NUMBER_DAWG,   // 20
80   TESSDATA_LSTM_UNICHARSET,    // 21
81   TESSDATA_LSTM_RECODER,       // 22
82   TESSDATA_VERSION,            // 23
83 
84   TESSDATA_NUM_ENTRIES
85 };
86 
87 /**
88  * kTessdataFileSuffixes[i] indicates the file suffix for
89  * tessdata of type i (from TessdataType enum).
90  */
91 static const char *const kTessdataFileSuffixes[] = {
92     kLangConfigFileSuffix,       // 0
93     kUnicharsetFileSuffix,       // 1
94     kAmbigsFileSuffix,           // 2
95     kBuiltInTemplatesFileSuffix, // 3
96     kBuiltInCutoffsFileSuffix,   // 4
97     kNormProtoFileSuffix,        // 5
98     kPuncDawgFileSuffix,         // 6
99     kSystemDawgFileSuffix,       // 7
100     kNumberDawgFileSuffix,       // 8
101     kFreqDawgFileSuffix,         // 9
102     kFixedLengthDawgsFileSuffix, // 10  // deprecated
103     kCubeUnicharsetFileSuffix,   // 11  // deprecated
104     kCubeSystemDawgFileSuffix,   // 12  // deprecated
105     kShapeTableFileSuffix,       // 13
106     kBigramDawgFileSuffix,       // 14
107     kUnambigDawgFileSuffix,      // 15
108     kParamsModelFileSuffix,      // 16
109     kLSTMModelFileSuffix,        // 17
110     kLSTMPuncDawgFileSuffix,     // 18
111     kLSTMSystemDawgFileSuffix,   // 19
112     kLSTMNumberDawgFileSuffix,   // 20
113     kLSTMUnicharsetFileSuffix,   // 21
114     kLSTMRecoderFileSuffix,      // 22
115     kVersionFileSuffix,          // 23
116 };
117 
/**
 * Upper bound on a plausible number of tessdata entries.
 *
 * TessdataType may gain entries over time, but the count is never expected to
 * approach this value. According to the original note, TessdataManager uses
 * the bound to detect endianness automatically: when the entry count it reads
 * exceeds kMaxNumTessdataEntries, the data is treated as being in the other
 * byte order and is swapped. (That logic is not part of this header.)
 */
static constexpr int kMaxNumTessdataEntries = 1000;
126 
127 class TESS_API TessdataManager {
128 public:
129   TessdataManager();
130   explicit TessdataManager(FileReader reader);
131 
132   ~TessdataManager() = default;
133 
swap()134   bool swap() const {
135     return swap_;
136   }
is_loaded()137   bool is_loaded() const {
138     return is_loaded_;
139   }
140 
141   // Lazily loads from the the given filename. Won't actually read the file
142   // until it needs it.
143   void LoadFileLater(const char *data_file_name);
144   /**
145    * Opens and reads the given data file right now.
146    * @return true on success.
147    */
148   bool Init(const char *data_file_name);
149   // Loads from the given memory buffer as if a file, remembering name as some
150   // arbitrary source id for caching.
151   bool LoadMemBuffer(const char *name, const char *data, int size);
152   // Overwrites a single entry of the given type.
153   void OverwriteEntry(TessdataType type, const char *data, int size);
154 
155   // Saves to the given filename.
156   bool SaveFile(const char *filename, FileWriter writer) const;
157   // Serializes to the given vector.
158   void Serialize(std::vector<char> *data) const;
159   // Resets to the initial state, keeping the reader.
160   void Clear();
161 
162   // Prints a directory of contents.
163   void Directory() const;
164 
165   // Returns true if the component requested is present.
IsComponentAvailable(TessdataType type)166   bool IsComponentAvailable(TessdataType type) const {
167     return !entries_[type].empty();
168   }
169   // Opens the given TFile pointer to the given component type.
170   // Returns false in case of failure.
171   bool GetComponent(TessdataType type, TFile *fp);
172   // As non-const version except it can't load the component if not already
173   // loaded.
174   bool GetComponent(TessdataType type, TFile *fp) const;
175 
176   // Returns the current version string.
177   std::string VersionString() const;
178   // Sets the version string to the given v_str.
179   void SetVersionString(const std::string &v_str);
180 
181   // Returns true if the base Tesseract components are present.
IsBaseAvailable()182   bool IsBaseAvailable() const {
183     return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();
184   }
185 
186   // Returns true if the LSTM components are present.
IsLSTMAvailable()187   bool IsLSTMAvailable() const {
188     return !entries_[TESSDATA_LSTM].empty();
189   }
190 
191   // Return the name of the underlying data file.
GetDataFileName()192   const std::string &GetDataFileName() const {
193     return data_file_name_;
194   }
195 
196   /**
197    * Reads all the standard tesseract config and data files for a language
198    * at the given path and bundles them up into one binary data file.
199    * Returns true if the combined traineddata file was successfully written.
200    */
201   bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename);
202 
203   /**
204    * Gets the individual components from the data_file_ with which the class was
205    * initialized. Overwrites the components specified by component_filenames.
206    * Writes the updated traineddata file to new_traineddata_filename.
207    */
208   bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames,
209                            int num_new_components);
210 
211   /**
212    * Extracts tessdata component implied by the name of the input file from
213    * the combined traineddata loaded into TessdataManager.
214    * Writes the extracted component to the file indicated by the file name.
215    * E.g. if the filename given is somepath/somelang.unicharset, unicharset
216    * will be extracted from the data loaded into the TessdataManager and will
217    * be written to somepath/somelang.unicharset.
218    * @return true if the component was successfully extracted, false if the
219    * component was not present in the traineddata loaded into TessdataManager.
220    */
221   bool ExtractToFile(const char *filename);
222 
223 private:
224   // Use libarchive.
225   bool LoadArchiveFile(const char *filename);
226 
227   /**
228    * Fills type with TessdataType of the tessdata component represented by the
229    * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
230    * @return true if the tessdata component type could be determined
231    * from the given file name.
232    */
233   static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type);
234 
235   /**
236    * Tries to determine tessdata component file suffix from filename,
237    * returns true on success.
238    */
239   static bool TessdataTypeFromFileName(const char *filename, TessdataType *type);
240 
241   // Name of file it came from.
242   std::string data_file_name_;
243   // Function to load the file when we need it.
244   FileReader reader_;
245   // True if the file has been loaded.
246   bool is_loaded_;
247   // True if the bytes need swapping.
248   bool swap_;
249   // Contents of each element of the traineddata file.
250   std::vector<char> entries_[TESSDATA_NUM_ENTRIES];
251 };
252 
253 } // namespace tesseract
254 
255 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
256