1 /////////////////////////////////////////////////////////////////////// 2 // File: tessdatamanager.h 3 // Description: Functions to handle loading/combining tesseract data files. 4 // Author: Daria Antonova 5 // 6 // (C) Copyright 2009, Google Inc. 7 // Licensed under the Apache License, Version 2.0 (the "License"); 8 // you may not use this file except in compliance with the License. 9 // You may obtain a copy of the License at 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // 17 /////////////////////////////////////////////////////////////////////// 18 19 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 20 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 21 22 #include <tesseract/baseapi.h> // FileReader 23 #include <string> // std::string 24 #include <vector> // std::vector 25 #include "serialis.h" // FileWriter 26 27 static const char kTrainedDataSuffix[] = "traineddata"; 28 29 // When adding new tessdata types and file suffixes, please make sure to 30 // update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText. 31 static const char kLangConfigFileSuffix[] = "config"; 32 static const char kUnicharsetFileSuffix[] = "unicharset"; 33 static const char kAmbigsFileSuffix[] = "unicharambigs"; 34 static const char kBuiltInTemplatesFileSuffix[] = "inttemp"; 35 static const char kBuiltInCutoffsFileSuffix[] = "pffmtable"; 36 static const char kNormProtoFileSuffix[] = "normproto"; 37 static const char kPuncDawgFileSuffix[] = "punc-dawg"; 38 static const char kSystemDawgFileSuffix[] = "word-dawg"; 39 static const char kNumberDawgFileSuffix[] = "number-dawg"; 40 static const char kFreqDawgFileSuffix[] = "freq-dawg"; 41 static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs"; 42 static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset"; 43 static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg"; 44 static const char kShapeTableFileSuffix[] = "shapetable"; 45 static const char kBigramDawgFileSuffix[] = "bigram-dawg"; 46 static const char kUnambigDawgFileSuffix[] = "unambig-dawg"; 47 static const char kParamsModelFileSuffix[] = "params-model"; 48 static const char kLSTMModelFileSuffix[] = "lstm"; 49 static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg"; 50 static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg"; 51 static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg"; 52 static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset"; 53 static const char kLSTMRecoderFileSuffix[] = "lstm-recoder"; 54 static const char kVersionFileSuffix[] = "version"; 55 56 namespace tesseract { 57 58 enum TessdataType { 59 TESSDATA_LANG_CONFIG, // 0 60 TESSDATA_UNICHARSET, // 1 61 TESSDATA_AMBIGS, // 2 62 TESSDATA_INTTEMP, // 3 63 TESSDATA_PFFMTABLE, // 4 64 TESSDATA_NORMPROTO, // 5 65 TESSDATA_PUNC_DAWG, // 6 66 TESSDATA_SYSTEM_DAWG, // 7 67 TESSDATA_NUMBER_DAWG, // 8 68 TESSDATA_FREQ_DAWG, // 9 69 TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated 70 TESSDATA_CUBE_UNICHARSET, // 11 // deprecated 71 TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated 72 TESSDATA_SHAPE_TABLE, // 13 73 TESSDATA_BIGRAM_DAWG, // 14 74 TESSDATA_UNAMBIG_DAWG, // 15 75 TESSDATA_PARAMS_MODEL, // 16 76 TESSDATA_LSTM, // 17 77 TESSDATA_LSTM_PUNC_DAWG, // 18 78 TESSDATA_LSTM_SYSTEM_DAWG, // 19 79 TESSDATA_LSTM_NUMBER_DAWG, // 20 80 TESSDATA_LSTM_UNICHARSET, // 21 81 TESSDATA_LSTM_RECODER, // 22 82 TESSDATA_VERSION, // 23 83 84 TESSDATA_NUM_ENTRIES 85 }; 86 87 /** 88 * kTessdataFileSuffixes[i] indicates the file suffix for 89 * tessdata of type i (from TessdataType enum). 90 */ 91 static const char *const kTessdataFileSuffixes[] = { 92 kLangConfigFileSuffix, // 0 93 kUnicharsetFileSuffix, // 1 94 kAmbigsFileSuffix, // 2 95 kBuiltInTemplatesFileSuffix, // 3 96 kBuiltInCutoffsFileSuffix, // 4 97 kNormProtoFileSuffix, // 5 98 kPuncDawgFileSuffix, // 6 99 kSystemDawgFileSuffix, // 7 100 kNumberDawgFileSuffix, // 8 101 kFreqDawgFileSuffix, // 9 102 kFixedLengthDawgsFileSuffix, // 10 // deprecated 103 kCubeUnicharsetFileSuffix, // 11 // deprecated 104 kCubeSystemDawgFileSuffix, // 12 // deprecated 105 kShapeTableFileSuffix, // 13 106 kBigramDawgFileSuffix, // 14 107 kUnambigDawgFileSuffix, // 15 108 kParamsModelFileSuffix, // 16 109 kLSTMModelFileSuffix, // 17 110 kLSTMPuncDawgFileSuffix, // 18 111 kLSTMSystemDawgFileSuffix, // 19 112 kLSTMNumberDawgFileSuffix, // 20 113 kLSTMUnicharsetFileSuffix, // 21 114 kLSTMRecoderFileSuffix, // 22 115 kVersionFileSuffix, // 23 116 }; 117 118 /** 119 * TessdataType could be updated to contain more entries, however 120 * we do not expect that number to be astronomically high. 121 * In order to automatically detect endianness TessdataManager will 122 * flip the bits if actual_tessdata_num_entries_ is larger than 123 * kMaxNumTessdataEntries. 124 */ 125 static const int kMaxNumTessdataEntries = 1000; 126 127 class TESS_API TessdataManager { 128 public: 129 TessdataManager(); 130 explicit TessdataManager(FileReader reader); 131 132 ~TessdataManager() = default; 133 swap()134 bool swap() const { 135 return swap_; 136 } is_loaded()137 bool is_loaded() const { 138 return is_loaded_; 139 } 140 141 // Lazily loads from the the given filename. Won't actually read the file 142 // until it needs it. 143 void LoadFileLater(const char *data_file_name); 144 /** 145 * Opens and reads the given data file right now. 146 * @return true on success. 147 */ 148 bool Init(const char *data_file_name); 149 // Loads from the given memory buffer as if a file, remembering name as some 150 // arbitrary source id for caching. 151 bool LoadMemBuffer(const char *name, const char *data, int size); 152 // Overwrites a single entry of the given type. 153 void OverwriteEntry(TessdataType type, const char *data, int size); 154 155 // Saves to the given filename. 156 bool SaveFile(const char *filename, FileWriter writer) const; 157 // Serializes to the given vector. 158 void Serialize(std::vector<char> *data) const; 159 // Resets to the initial state, keeping the reader. 160 void Clear(); 161 162 // Prints a directory of contents. 163 void Directory() const; 164 165 // Returns true if the component requested is present. IsComponentAvailable(TessdataType type)166 bool IsComponentAvailable(TessdataType type) const { 167 return !entries_[type].empty(); 168 } 169 // Opens the given TFile pointer to the given component type. 170 // Returns false in case of failure. 171 bool GetComponent(TessdataType type, TFile *fp); 172 // As non-const version except it can't load the component if not already 173 // loaded. 174 bool GetComponent(TessdataType type, TFile *fp) const; 175 176 // Returns the current version string. 177 std::string VersionString() const; 178 // Sets the version string to the given v_str. 179 void SetVersionString(const std::string &v_str); 180 181 // Returns true if the base Tesseract components are present. IsBaseAvailable()182 bool IsBaseAvailable() const { 183 return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty(); 184 } 185 186 // Returns true if the LSTM components are present. IsLSTMAvailable()187 bool IsLSTMAvailable() const { 188 return !entries_[TESSDATA_LSTM].empty(); 189 } 190 191 // Return the name of the underlying data file. GetDataFileName()192 const std::string &GetDataFileName() const { 193 return data_file_name_; 194 } 195 196 /** 197 * Reads all the standard tesseract config and data files for a language 198 * at the given path and bundles them up into one binary data file. 199 * Returns true if the combined traineddata file was successfully written. 200 */ 201 bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename); 202 203 /** 204 * Gets the individual components from the data_file_ with which the class was 205 * initialized. Overwrites the components specified by component_filenames. 206 * Writes the updated traineddata file to new_traineddata_filename. 207 */ 208 bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, 209 int num_new_components); 210 211 /** 212 * Extracts tessdata component implied by the name of the input file from 213 * the combined traineddata loaded into TessdataManager. 214 * Writes the extracted component to the file indicated by the file name. 215 * E.g. if the filename given is somepath/somelang.unicharset, unicharset 216 * will be extracted from the data loaded into the TessdataManager and will 217 * be written to somepath/somelang.unicharset. 218 * @return true if the component was successfully extracted, false if the 219 * component was not present in the traineddata loaded into TessdataManager. 220 */ 221 bool ExtractToFile(const char *filename); 222 223 private: 224 // Use libarchive. 225 bool LoadArchiveFile(const char *filename); 226 227 /** 228 * Fills type with TessdataType of the tessdata component represented by the 229 * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. 230 * @return true if the tessdata component type could be determined 231 * from the given file name. 232 */ 233 static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type); 234 235 /** 236 * Tries to determine tessdata component file suffix from filename, 237 * returns true on success. 238 */ 239 static bool TessdataTypeFromFileName(const char *filename, TessdataType *type); 240 241 // Name of file it came from. 242 std::string data_file_name_; 243 // Function to load the file when we need it. 244 FileReader reader_; 245 // True if the file has been loaded. 246 bool is_loaded_; 247 // True if the bytes need swapping. 248 bool swap_; 249 // Contents of each element of the traineddata file. 250 std::vector<char> entries_[TESSDATA_NUM_ENTRIES]; 251 }; 252 253 } // namespace tesseract 254 255 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 256