1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13
14 #ifndef TESSERACT_TRAINING_COMMONTRAINING_H_
15 #define TESSERACT_TRAINING_COMMONTRAINING_H_
16
17 #ifdef HAVE_CONFIG_H
18 # include "config_auto.h"
19 #endif
20
#include "commandlineflags.h"
#include "export.h"
#include "tprintf.h"

#include <tesseract/baseapi.h>

#include <cstdio>  // FILE
#include <cstdlib> // std::exit
#include <cstring> // std::strcmp
#include <memory>
#include <string>
28
29 namespace tesseract {
30
// Command-line handling entry point exported from the common training library.
// NOTE(review): argc and argv are taken by pointer, so the implementation
// presumably rewrites them in place (e.g. to drop recognized flags) - confirm
// against the definition, which is not visible in this header.
TESS_COMMON_TRAINING_API
void ParseArguments(int *argc, char ***argv);
33
// Check whether the shared tesseract library is the right one.
// Compares the compile-time macro TESSERACT_VERSION_STR with the string
// returned at run time by TessBaseAPI::Version(). If they differ, an error
// is printed with tprintf and the process exits with status 1.
// The check is only compiled in when HAVE_CONFIG_H is defined; otherwise
// this function does nothing.
// This function must be inline because otherwise it would be part of
// the shared library, so it could not compare the versions.
static inline void CheckSharedLibraryVersion() {
#ifdef HAVE_CONFIG_H
  // Query the library once so the comparison and the message agree.
  const char *const runtime_version = TessBaseAPI::Version();
  if (std::strcmp(TESSERACT_VERSION_STR, runtime_version) != 0) {
    tprintf(
        "ERROR: shared library version mismatch (was %s, expected %s)\n"
        "Did you use a wrong shared tesseract library?\n",
        runtime_version, TESSERACT_VERSION_STR);
    std::exit(1);
  }
#endif
}
48
49 } // namespace tesseract
50
51 #ifndef DISABLED_LEGACY_ENGINE
52
53 # include "cluster.h"
54 # include "featdefs.h"
55 # include "intproto.h"
56 # include "oldlist.h"
57
58 namespace tesseract {
59
// Forward declarations. The declarations in this header only name these
// types (MasterTrainer inside std::unique_ptr, ShapeTable via pointer and
// reference), so their full definitions are not required here.
class Classify;
class MasterTrainer;
class ShapeTable;
63
64 //////////////////////////////////////////////////////////////////////////////
65 // Globals ///////////////////////////////////////////////////////////////////
66 //////////////////////////////////////////////////////////////////////////////
67
// Global feature definitions. This is a declaration only (extern); the
// object itself is defined in a source file.
TESS_COMMON_TRAINING_API
extern FEATURE_DEFS_STRUCT feature_defs;

// Global clustering configuration. Declaration only (extern).
// Must be defined in the file that "implements" commonTraining facilities.
TESS_COMMON_TRAINING_API
extern CLUSTERCONFIG Config;
74
75 //////////////////////////////////////////////////////////////////////////////
76 // Structs ///////////////////////////////////////////////////////////////////
77 //////////////////////////////////////////////////////////////////////////////
78 struct LABELEDLISTNODE {
79 /// This constructor allocates a new, empty labeled list and gives
80 /// it the specified label.
81 /// @param Label label for new list
LABELEDLISTNODELABELEDLISTNODE82 LABELEDLISTNODE(const char *label) : Label(label) {
83 }
84 std::string Label;
85 int SampleCount = 0;
86 int font_sample_count = 0;
87 LIST List = nullptr;
88 };
89 using LABELEDLIST = LABELEDLISTNODE *;
90
91 struct MERGE_CLASS_NODE {
MERGE_CLASS_NODEMERGE_CLASS_NODE92 MERGE_CLASS_NODE(const char * label) : Label(label), Class(NewClass(MAX_NUM_PROTOS, MAX_NUM_CONFIGS)) {
93 }
94 std::string Label;
95 int NumMerged[MAX_NUM_PROTOS];
96 tesseract::CLASS_TYPE Class;
97 };
98 using MERGE_CLASS = MERGE_CLASS_NODE *;
99
100 //////////////////////////////////////////////////////////////////////////////
101 // Functions /////////////////////////////////////////////////////////////////
102 //////////////////////////////////////////////////////////////////////////////
103
// Helper loads shape table from the given file.
// Not marked TESS_COMMON_TRAINING_API, unlike WriteShapeTable below.
// NOTE(review): returns a raw pointer; who owns it and what is returned when
// loading fails cannot be told from this header - confirm in the definition.
ShapeTable *LoadShapeTable(const std::string &file_prefix);
// Helper to write the shape_table.
// The table is taken by const reference, so it is not modified by the call.
TESS_COMMON_TRAINING_API
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table);
109
// Creates a MasterTrainer and loads the training data into it:
// Initializes feature_defs and IntegerFX.
// Loads the shape_table if shape_table != nullptr.
// Loads initial unicharset from -U command-line option.
// If FLAGS_input_trainer is set, loads the majority of data from there, else:
//   Loads font info from -F option.
//   Loads xheights from -X option.
//   Loads samples from .tr files in remaining command-line args.
//   Deletes outliers and computes canonical samples.
//   If FLAGS_output_trainer is set, saves the trainer for future use.
// Computes canonical and cloud features.
// If shape_table is not nullptr, but failed to load, make a fake flat one,
// as shape clustering was not run.
// The trainer is returned in a std::unique_ptr, so the caller owns it.
// NOTE(review): file_prefix is a non-const reference and shape_table a
// pointer to pointer, so both are presumably written by the call - confirm.
TESS_COMMON_TRAINING_API
std::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,
                                                ShapeTable **shape_table, std::string &file_prefix);
126
127 LABELEDLIST FindList(tesseract::LIST List, const std::string &Label);
128
129 TESS_COMMON_TRAINING_API
130 void ReadTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &feature_defs,
131 const char *feature_name, int max_samples,
132 tesseract::UNICHARSET *unicharset, FILE *file,
133 tesseract::LIST *training_samples);
134
135 void WriteTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory,
136 tesseract::LIST CharList, const char *program_feature_type);
137
138 TESS_COMMON_TRAINING_API
139 void FreeTrainingSamples(tesseract::LIST CharList);
140
141 TESS_COMMON_TRAINING_API
142 void FreeLabeledList(LABELEDLIST LabeledList);
143
144 TESS_COMMON_TRAINING_API
145 void FreeLabeledClassList(tesseract::LIST ClassListList);
146
147 TESS_COMMON_TRAINING_API
148 tesseract::CLUSTERER *SetUpForClustering(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs,
149 LABELEDLIST CharSample, const char *program_feature_type);
150
151 TESS_COMMON_TRAINING_API
152 tesseract::LIST RemoveInsignificantProtos(tesseract::LIST ProtoList, bool KeepSigProtos,
153 bool KeepInsigProtos, int N);
154
155 TESS_COMMON_TRAINING_API
156 void CleanUpUnusedData(tesseract::LIST ProtoList);
157
158 TESS_COMMON_TRAINING_API
159 void MergeInsignificantProtos(tesseract::LIST ProtoList, const char *label,
160 tesseract::CLUSTERER *Clusterer, tesseract::CLUSTERCONFIG *Config);
161
162 TESS_COMMON_TRAINING_API
163 MERGE_CLASS FindClass(tesseract::LIST List, const std::string &Label);
164
165 TESS_COMMON_TRAINING_API
166 tesseract::CLASS_STRUCT *SetUpForFloat2Int(const tesseract::UNICHARSET &unicharset,
167 tesseract::LIST LabeledClassList);
168
169 void Normalize(float *Values);
170
171 TESS_COMMON_TRAINING_API
172 void FreeNormProtoList(tesseract::LIST CharList);
173
174 TESS_COMMON_TRAINING_API
175 void AddToNormProtosList(tesseract::LIST *NormProtoList, tesseract::LIST ProtoList, const std::string &CharName);
176
177 TESS_COMMON_TRAINING_API
178 int NumberOfProtos(tesseract::LIST ProtoList, bool CountSigProtos, bool CountInsigProtos);
179
180 void allocNormProtos();
181
182 } // namespace tesseract
183
184 #endif // def DISABLED_LEGACY_ENGINE
185
186 #endif // TESSERACT_TRAINING_COMMONTRAINING_H_
187