1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #ifndef TESSERACT_TRAINING_COMMONTRAINING_H_
15 #define TESSERACT_TRAINING_COMMONTRAINING_H_
16 
17 #ifdef HAVE_CONFIG_H
18 #  include "config_auto.h"
19 #endif
20 
#include "commandlineflags.h"
#include "export.h"
#include "tprintf.h"

#include <tesseract/baseapi.h>

#include <cstdlib> // std::exit
#include <cstring> // std::strcmp
#include <memory>
#include <string>
28 
29 namespace tesseract {
30 
// Declared here, defined elsewhere. Receives pointers to the caller's argc
// and argv, so the implementation is able to modify both.
// NOTE(review): presumably parses the flags declared via commandlineflags.h
// and removes them from argv - confirm against the definition.
TESS_COMMON_TRAINING_API
void ParseArguments(int *argc, char ***argv);
33 
// Check whether the shared tesseract library is the right one.
// This function must be inline because otherwise it would be part of
// the shared library, so it could not compare the versions.
//
// Compares TESSERACT_VERSION_STR, as seen when this header is compiled, with
// the string returned by TessBaseAPI::Version(). If the two differ, an error
// is printed with tprintf and the process terminates with exit status 1.
// The check is only compiled in when HAVE_CONFIG_H is defined; otherwise the
// function does nothing.
// NOTE(review): TESSERACT_VERSION_STR presumably comes from config_auto.h,
// which is included under the same HAVE_CONFIG_H guard - confirm.
static inline void CheckSharedLibraryVersion() {
#ifdef HAVE_CONFIG_H
  // Query the library once so the compared and the reported value are the same.
  const char *const library_version = TessBaseAPI::Version();
  if (std::strcmp(TESSERACT_VERSION_STR, library_version) != 0) {
    tprintf(
        "ERROR: shared library version mismatch (was %s, expected %s)\n"
        "Did you use a wrong shared tesseract library?\n",
        library_version, TESSERACT_VERSION_STR);
    std::exit(1);
  }
#endif
}
48 
49 } // namespace tesseract
50 
51 #ifndef DISABLED_LEGACY_ENGINE
52 
53 #  include "cluster.h"
54 #  include "featdefs.h"
55 #  include "intproto.h"
56 #  include "oldlist.h"
57 
58 namespace tesseract {
59 
// Forward declarations. In this header MasterTrainer is only used as the
// element type of a std::unique_ptr return value and ShapeTable only through
// pointers and references, so their full definitions are not needed here.
// Classify is not referenced by any other declaration in this header.
class Classify;
class MasterTrainer;
class ShapeTable;
63 
64 //////////////////////////////////////////////////////////////////////////////
65 // Globals ///////////////////////////////////////////////////////////////////
66 //////////////////////////////////////////////////////////////////////////////
67 
// Global feature definitions object. Only declared here (extern); the
// definition is in another translation unit.
// Note: ReadTrainingSamples below has a parameter with the same name, which
// hides this global inside that function.
TESS_COMMON_TRAINING_API
extern FEATURE_DEFS_STRUCT feature_defs;

// Must be defined in the file that "implements" commonTraining facilities.
// Note: MergeInsignificantProtos below has a parameter that is also called
// Config, which hides this global inside that function.
TESS_COMMON_TRAINING_API
extern CLUSTERCONFIG Config;
74 
75 //////////////////////////////////////////////////////////////////////////////
76 // Structs ///////////////////////////////////////////////////////////////////
77 //////////////////////////////////////////////////////////////////////////////
78 struct LABELEDLISTNODE {
79   /// This constructor allocates a new, empty labeled list and gives
80   /// it the specified label.
81   /// @param Label label for new list
LABELEDLISTNODELABELEDLISTNODE82   LABELEDLISTNODE(const char *label) : Label(label) {
83   }
84   std::string Label;
85   int SampleCount = 0;
86   int font_sample_count = 0;
87   LIST List = nullptr;
88 };
89 using LABELEDLIST = LABELEDLISTNODE *;
90 
91 struct MERGE_CLASS_NODE {
MERGE_CLASS_NODEMERGE_CLASS_NODE92   MERGE_CLASS_NODE(const char * label) : Label(label), Class(NewClass(MAX_NUM_PROTOS, MAX_NUM_CONFIGS)) {
93   }
94   std::string Label;
95   int NumMerged[MAX_NUM_PROTOS];
96   tesseract::CLASS_TYPE Class;
97 };
98 using MERGE_CLASS = MERGE_CLASS_NODE *;
99 
100 //////////////////////////////////////////////////////////////////////////////
101 // Functions /////////////////////////////////////////////////////////////////
102 //////////////////////////////////////////////////////////////////////////////
103 
// Helper loads shape table from the given file.
// Returns a raw ShapeTable pointer. Unlike most functions in this header the
// declaration is not marked TESS_COMMON_TRAINING_API.
// NOTE(review): ownership of the returned pointer and the result on failure
// (nullptr?) are not visible here - confirm against the definition.
ShapeTable *LoadShapeTable(const std::string &file_prefix);
// Helper to write the shape_table.
// The table is taken by const reference and is therefore not modified.
// NOTE(review): the output file name is presumably derived from file_prefix -
// confirm against the definition.
TESS_COMMON_TRAINING_API
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table);
109 
// Creates a MasterTrainer and loads the training data into it:
// Initializes feature_defs and IntegerFX.
// Loads the shape_table if shape_table != nullptr.
// Loads initial unicharset from -U command-line option.
// If FLAGS_input_trainer is set, loads the majority of data from there, else:
//   Loads font info from -F option.
//   Loads xheights from -X option.
//   Loads samples from .tr files in remaining command-line args.
//   Deletes outliers and computes canonical samples.
//   If FLAGS_output_trainer is set, saves the trainer for future use.
// Computes canonical and cloud features.
// If shape_table is not nullptr, but failed to load, make a fake flat one,
// as shape clustering was not run.
//
// The trainer is returned as a std::unique_ptr, so the caller owns it.
// shape_table is a pointer to a ShapeTable pointer and file_prefix is a
// non-const reference, so both can be written to by the implementation.
// NOTE(review): filelist is presumably the nullptr-terminated list of the
// remaining command-line arguments - confirm against the definition.
TESS_COMMON_TRAINING_API
std::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,
                                                ShapeTable **shape_table, std::string &file_prefix);
126 
// Takes a LIST and a label and returns a LABELEDLIST (LABELEDLISTNODE *).
// Not marked TESS_COMMON_TRAINING_API.
// NOTE(review): presumably returns the node of List whose Label equals Label,
// or nullptr when there is none - confirm against the definition.
LABELEDLIST FindList(tesseract::LIST List, const std::string &Label);
128 
// Reads training samples from an already opened FILE. training_samples is a
// pointer to a LIST and unicharset a non-const pointer, so both can be
// updated by the implementation. The feature_defs parameter hides the global
// of the same name declared above.
// NOTE(review): the meaning of max_samples (upper bound per character? 0 =
// unlimited?) is not visible here - confirm against the definition.
TESS_COMMON_TRAINING_API
void ReadTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &feature_defs,
                         const char *feature_name, int max_samples,
                         tesseract::UNICHARSET *unicharset, FILE *file,
                         tesseract::LIST *training_samples);
134 
// Writes training samples; declared only, and not marked
// TESS_COMMON_TRAINING_API. Directory is a non-const char pointer.
// NOTE(review): presumably writes the samples in CharList to files below
// Directory, and Directory may be nullptr - confirm against the definition.
void WriteTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory,
                          tesseract::LIST CharList, const char *program_feature_type);
137 
// NOTE(review): presumably releases every element of CharList together with
// the list itself, leaving the caller's LIST value dangling - confirm against
// the definition.
TESS_COMMON_TRAINING_API
void FreeTrainingSamples(tesseract::LIST CharList);
140 
// Takes a LABELEDLIST, i.e. a raw LABELEDLISTNODE pointer.
// NOTE(review): presumably releases the node and the LIST it holds - confirm
// against the definition.
TESS_COMMON_TRAINING_API
void FreeLabeledList(LABELEDLIST LabeledList);
143 
// NOTE(review): presumably releases a LIST of MERGE_CLASS nodes including
// the Class member of each node - confirm against the definition.
TESS_COMMON_TRAINING_API
void FreeLabeledClassList(tesseract::LIST ClassListList);
146 
// Returns a raw CLUSTERER pointer built from the given feature definitions,
// labeled sample list and feature type name.
// NOTE(review): ownership of the returned clusterer and the result for an
// unknown program_feature_type are not visible here - confirm against the
// definition.
TESS_COMMON_TRAINING_API
tesseract::CLUSTERER *SetUpForClustering(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs,
                                         LABELEDLIST CharSample, const char *program_feature_type);
150 
// Takes a proto LIST and returns a LIST.
// NOTE(review): presumably builds a new list that keeps significant and/or
// insignificant protos as selected by the two flags; the role of N and
// whether ProtoList itself is modified are not visible here - confirm against
// the definition.
TESS_COMMON_TRAINING_API
tesseract::LIST RemoveInsignificantProtos(tesseract::LIST ProtoList, bool KeepSigProtos,
                                          bool KeepInsigProtos, int N);
154 
// NOTE(review): presumably releases per-proto data in ProtoList that is no
// longer needed while keeping the list itself - confirm against the
// definition.
TESS_COMMON_TRAINING_API
void CleanUpUnusedData(tesseract::LIST ProtoList);
157 
// The Config parameter has the same name as the global Config declared above
// and hides it inside this function; here it is a pointer, the global is an
// object.
// NOTE(review): presumably merges insignificant protos of ProtoList into
// nearby significant ones, using label only for messages - confirm against
// the definition.
TESS_COMMON_TRAINING_API
void MergeInsignificantProtos(tesseract::LIST ProtoList, const char *label,
                              tesseract::CLUSTERER *Clusterer, tesseract::CLUSTERCONFIG *Config);
161 
// Takes a LIST and a label and returns a MERGE_CLASS (MERGE_CLASS_NODE *).
// NOTE(review): presumably returns the node of List whose Label equals Label,
// or nullptr when there is none - confirm against the definition.
TESS_COMMON_TRAINING_API
MERGE_CLASS FindClass(tesseract::LIST List, const std::string &Label);
164 
// Returns a raw CLASS_STRUCT pointer; the unicharset is taken by const
// reference and is therefore not modified.
// NOTE(review): the result is presumably an array with one entry per
// unicharset id that the caller must release - confirm against the
// definition.
TESS_COMMON_TRAINING_API
tesseract::CLASS_STRUCT *SetUpForFloat2Int(const tesseract::UNICHARSET &unicharset,
                                           tesseract::LIST LabeledClassList);
168 
// Takes a non-const float pointer, so the values can be modified in place.
// Not marked TESS_COMMON_TRAINING_API.
// NOTE(review): the number of floats read through Values is not part of the
// signature - confirm the expected length against the definition.
void Normalize(float *Values);
170 
// NOTE(review): presumably releases a list that was built with
// AddToNormProtosList - confirm against the definition.
TESS_COMMON_TRAINING_API
void FreeNormProtoList(tesseract::LIST CharList);
173 
// NormProtoList is a pointer to a LIST, so the caller's list can be updated.
// NOTE(review): presumably adds an entry labeled CharName that holds the
// protos of ProtoList - confirm against the definition.
TESS_COMMON_TRAINING_API
void AddToNormProtosList(tesseract::LIST *NormProtoList, tesseract::LIST ProtoList, const std::string &CharName);
176 
// Returns an int count for ProtoList.
// NOTE(review): the two flags presumably select whether significant and/or
// insignificant protos are included in the count - confirm against the
// definition.
TESS_COMMON_TRAINING_API
int NumberOfProtos(tesseract::LIST ProtoList, bool CountSigProtos, bool CountInsigProtos);
179 
// Takes no arguments and returns nothing, so any effect is on state outside
// the signature. Not marked TESS_COMMON_TRAINING_API.
// NOTE(review): what is allocated and who releases it is not visible here -
// confirm against the definition.
void allocNormProtos();
181 
182 } // namespace tesseract
183 
184 #endif // def DISABLED_LEGACY_ENGINE
185 
186 #endif // TESSERACT_TRAINING_COMMONTRAINING_H_
187