1 /******************************************************************************
2 ** Filename: cntraining.cpp
3 ** Purpose: Generates a normproto and pffmtable.
4 ** Author: Dan Johnson
5 ** Revisment: Christy Russon
6 **
7 ** (c) Copyright Hewlett-Packard Company, 1988.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 ******************************************************************************/
18
19 /*----------------------------------------------------------------------------
20 Include Files and Type Defines
21 ----------------------------------------------------------------------------*/
22 #include <tesseract/unichar.h>
23 #include <cmath>
24 #include <cstdio>
25 #include <cstring>
26 #include "cluster.h"
27 #include "clusttool.h"
28 #include "commontraining.h"
29 #include "featdefs.h"
30 #include "ocrfeatures.h"
31 #include "oldlist.h"
32
33 #define PROGRAM_FEATURE_TYPE "cn"
34
35 using namespace tesseract;
36
37 /*----------------------------------------------------------------------------
38 Private Function Prototypes
39 ----------------------------------------------------------------------------*/
40
41 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
42 const FEATURE_DESC_STRUCT *feature_desc);
43
44 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
45 bool WriteInsigProtos);
46
47 /*----------------------------------------------------------------------------
48 Global Data Definitions and Declarations
49 ----------------------------------------------------------------------------*/
50 /* global variable to hold configuration parameters to control clustering */
51 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3
52 static const CLUSTERCONFIG CNConfig = {elliptical, 0.025, 0.05, 0.8, 1e-3, 0};
53
54 /*----------------------------------------------------------------------------
55 Public Code
56 ----------------------------------------------------------------------------*/
57
58 /**
59 * This program reads in a text file consisting of feature
60 * samples from a training page in the following format:
61 * @verbatim
62 FontName CharName NumberOfFeatureTypes(N)
63 FeatureTypeName1 NumberOfFeatures(M)
64 Feature1
65 ...
66 FeatureM
67 FeatureTypeName2 NumberOfFeatures(M)
68 Feature1
69 ...
70 FeatureM
71 ...
72 FeatureTypeNameN NumberOfFeatures(M)
73 Feature1
74 ...
75 FeatureM
76 FontName CharName ...
77 @endverbatim
78 * It then appends these samples into a separate file for each
79 * character. The name of the file is
80 *
81 * DirectoryName/FontName/CharName.FeatureTypeName
82 *
83 * The DirectoryName can be specified via a command
84 * line argument. If not specified, it defaults to the
85 * current directory. The format of the resulting files is:
86 * @verbatim
87 NumberOfFeatures(M)
88 Feature1
89 ...
90 FeatureM
91 NumberOfFeatures(M)
92 ...
93 @endverbatim
94 * The output files each have a header which describes the
95 * type of feature which the file contains. This header is
96 * in the format required by the clusterer. A command line
97 * argument can also be used to specify that only the first
98 * N samples of each class should be used.
99 * @param argc number of command line arguments
100 * @param argv array of command line arguments
 * @return 0 on success, 1 if a clusterer could not be set up
102 */
main(int argc,char * argv[])103 int main(int argc, char *argv[]) {
104 tesseract::CheckSharedLibraryVersion();
105
106 // Set the global Config parameters before parsing the command line.
107 Config = CNConfig;
108
109 LIST CharList = NIL_LIST;
110 CLUSTERER *Clusterer = nullptr;
111 LIST ProtoList = NIL_LIST;
112 LIST NormProtoList = NIL_LIST;
113 LIST pCharList;
114 LABELEDLIST CharSample;
115 FEATURE_DEFS_STRUCT FeatureDefs;
116 InitFeatureDefs(&FeatureDefs);
117
118 ParseArguments(&argc, &argv);
119 int num_fonts = 0;
120 for (const char *PageName = *++argv; PageName != nullptr; PageName = *++argv) {
121 printf("Reading %s ...\n", PageName);
122 FILE *TrainingPage = fopen(PageName, "rb");
123 ASSERT_HOST(TrainingPage);
124 if (TrainingPage) {
125 ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, TrainingPage, &CharList);
126 fclose(TrainingPage);
127 ++num_fonts;
128 }
129 }
130 printf("Clustering ...\n");
131 // To allow an individual font to form a separate cluster,
132 // reduce the min samples:
133 // Config.MinSamples = 0.5 / num_fonts;
134 pCharList = CharList;
135 // The norm protos will count the source protos, so we keep them here in
136 // freeable_protos, so they can be freed later.
137 std::vector<LIST> freeable_protos;
138 iterate(pCharList) {
139 // Cluster
140 CharSample = reinterpret_cast<LABELEDLIST>(pCharList->first_node());
141 Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
142 if (Clusterer == nullptr) { // To avoid a SIGSEGV
143 fprintf(stderr, "Error: nullptr clusterer!\n");
144 return 1;
145 }
146 float SavedMinSamples = Config.MinSamples;
147 // To disable the tendency to produce a single cluster for all fonts,
148 // make MagicSamples an impossible to achieve number:
149 // Config.MagicSamples = CharSample->SampleCount * 10;
150 Config.MagicSamples = CharSample->SampleCount;
151 while (Config.MinSamples > 0.001) {
152 ProtoList = ClusterSamples(Clusterer, &Config);
153 if (NumberOfProtos(ProtoList, true, false) > 0) {
154 break;
155 } else {
156 Config.MinSamples *= 0.95;
157 printf(
158 "0 significant protos for %s."
159 " Retrying clustering with MinSamples = %f%%\n",
160 CharSample->Label.c_str(), Config.MinSamples);
161 }
162 }
163 Config.MinSamples = SavedMinSamples;
164 AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
165 freeable_protos.push_back(ProtoList);
166 FreeClusterer(Clusterer);
167 }
168 FreeTrainingSamples(CharList);
169 int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
170 WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]);
171 FreeNormProtoList(NormProtoList);
172 for (auto &freeable_proto : freeable_protos) {
173 FreeProtoList(&freeable_proto);
174 }
175 printf("\n");
176 return 0;
177 } // main
178
179 /*----------------------------------------------------------------------------
180 Private Code
181 ----------------------------------------------------------------------------*/
182
183 /*----------------------------------------------------------------------------*/
184 /**
 * This routine writes the significant prototypes of each labeled
 * class to a file named "normproto" in the given directory, preceded
 * by the number and description of the feature parameters.
188 * @param Directory directory to place sample files into
189 * @param LabeledProtoList List of labeled protos
190 * @param feature_desc Description of the features
191 */
WriteNormProtos(const char * Directory,LIST LabeledProtoList,const FEATURE_DESC_STRUCT * feature_desc)192 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
193 const FEATURE_DESC_STRUCT *feature_desc) {
194 FILE *File;
195 LABELEDLIST LabeledProto;
196 int N;
197
198 std::string Filename = "";
199 if (Directory != nullptr && Directory[0] != '\0') {
200 Filename += Directory;
201 Filename += "/";
202 }
203 Filename += "normproto";
204 printf("\nWriting %s ...", Filename.c_str());
205 File = fopen(Filename.c_str(), "wb");
206 ASSERT_HOST(File);
207 fprintf(File, "%0d\n", feature_desc->NumParams);
208 WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
209 iterate(LabeledProtoList) {
210 LabeledProto = reinterpret_cast<LABELEDLIST>(LabeledProtoList->first_node());
211 N = NumberOfProtos(LabeledProto->List, true, false);
212 if (N < 1) {
213 printf(
214 "\nError! Not enough protos for %s: %d protos"
215 " (%d significant protos"
216 ", %d insignificant protos)\n",
217 LabeledProto->Label.c_str(), N, NumberOfProtos(LabeledProto->List, true, false),
218 NumberOfProtos(LabeledProto->List, false, true));
219 exit(1);
220 }
221 fprintf(File, "\n%s %d\n", LabeledProto->Label.c_str(), N);
222 WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
223 }
224 fclose(File);
225
226 } // WriteNormProtos
227
228 /*-------------------------------------------------------------------------*/
229
WriteProtos(FILE * File,uint16_t N,LIST ProtoList,bool WriteSigProtos,bool WriteInsigProtos)230 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
231 bool WriteInsigProtos) {
232 PROTOTYPE *Proto;
233
234 // write prototypes
235 iterate(ProtoList) {
236 Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
237 if ((Proto->Significant && WriteSigProtos) || (!Proto->Significant && WriteInsigProtos)) {
238 WritePrototype(File, N, Proto);
239 }
240 }
241 } // WriteProtos
242