1 /******************************************************************************
2  **  Filename:  cntraining.cpp
3  **  Purpose:  Generates a normproto and pffmtable.
4  **  Author:    Dan Johnson
5  **  Revisment:  Christy Russon
6  **
7  **  (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 
19 /*----------------------------------------------------------------------------
20           Include Files and Type Defines
21 ----------------------------------------------------------------------------*/
22 #include <tesseract/unichar.h>
23 #include <cmath>
24 #include <cstdio>
25 #include <cstring>
26 #include "cluster.h"
27 #include "clusttool.h"
28 #include "commontraining.h"
29 #include "featdefs.h"
30 #include "ocrfeatures.h"
31 #include "oldlist.h"
32 
33 #define PROGRAM_FEATURE_TYPE "cn"
34 
35 using namespace tesseract;
36 
37 /*----------------------------------------------------------------------------
38           Private Function Prototypes
39 ----------------------------------------------------------------------------*/
40 
41 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
42                             const FEATURE_DESC_STRUCT *feature_desc);
43 
44 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
45                         bool WriteInsigProtos);
46 
47 /*----------------------------------------------------------------------------
48           Global Data Definitions and Declarations
49 ----------------------------------------------------------------------------*/
50 /* global variable to hold configuration parameters to control clustering */
51 //-M 0.025   -B 0.05   -I 0.8   -C 1e-3
52 static const CLUSTERCONFIG CNConfig = {elliptical, 0.025, 0.05, 0.8, 1e-3, 0};
53 
54 /*----------------------------------------------------------------------------
55               Public Code
56 ----------------------------------------------------------------------------*/
57 
58 /**
59 * This program reads in a text file consisting of feature
60 * samples from a training page in the following format:
61 * @verbatim
62    FontName CharName NumberOfFeatureTypes(N)
63       FeatureTypeName1 NumberOfFeatures(M)
64          Feature1
65          ...
66          FeatureM
67       FeatureTypeName2 NumberOfFeatures(M)
68          Feature1
69          ...
70          FeatureM
71       ...
72       FeatureTypeNameN NumberOfFeatures(M)
73          Feature1
74          ...
75          FeatureM
76    FontName CharName ...
77 @endverbatim
78 * It then appends these samples into a separate file for each
79 * character.  The name of the file is
80 *
81 *   DirectoryName/FontName/CharName.FeatureTypeName
82 *
83 * The DirectoryName can be specified via a command
84 * line argument.  If not specified, it defaults to the
85 * current directory.  The format of the resulting files is:
86 * @verbatim
87    NumberOfFeatures(M)
88       Feature1
89       ...
90       FeatureM
91    NumberOfFeatures(M)
92    ...
93 @endverbatim
94 * The output files each have a header which describes the
95 * type of feature which the file contains.  This header is
96 * in the format required by the clusterer.  A command line
97 * argument can also be used to specify that only the first
98 * N samples of each class should be used.
99 * @param argc  number of command line arguments
100 * @param argv  array of command line arguments
101 * @return 0 on success
102 */
main(int argc,char * argv[])103 int main(int argc, char *argv[]) {
104   tesseract::CheckSharedLibraryVersion();
105 
106   // Set the global Config parameters before parsing the command line.
107   Config = CNConfig;
108 
109   LIST CharList = NIL_LIST;
110   CLUSTERER *Clusterer = nullptr;
111   LIST ProtoList = NIL_LIST;
112   LIST NormProtoList = NIL_LIST;
113   LIST pCharList;
114   LABELEDLIST CharSample;
115   FEATURE_DEFS_STRUCT FeatureDefs;
116   InitFeatureDefs(&FeatureDefs);
117 
118   ParseArguments(&argc, &argv);
119   int num_fonts = 0;
120   for (const char *PageName = *++argv; PageName != nullptr; PageName = *++argv) {
121     printf("Reading %s ...\n", PageName);
122     FILE *TrainingPage = fopen(PageName, "rb");
123     ASSERT_HOST(TrainingPage);
124     if (TrainingPage) {
125       ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, TrainingPage, &CharList);
126       fclose(TrainingPage);
127       ++num_fonts;
128     }
129   }
130   printf("Clustering ...\n");
131   // To allow an individual font to form a separate cluster,
132   // reduce the min samples:
133   // Config.MinSamples = 0.5 / num_fonts;
134   pCharList = CharList;
135   // The norm protos will count the source protos, so we keep them here in
136   // freeable_protos, so they can be freed later.
137   std::vector<LIST> freeable_protos;
138   iterate(pCharList) {
139     // Cluster
140     CharSample = reinterpret_cast<LABELEDLIST>(pCharList->first_node());
141     Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
142     if (Clusterer == nullptr) { // To avoid a SIGSEGV
143       fprintf(stderr, "Error: nullptr clusterer!\n");
144       return 1;
145     }
146     float SavedMinSamples = Config.MinSamples;
147     // To disable the tendency to produce a single cluster for all fonts,
148     // make MagicSamples an impossible to achieve number:
149     // Config.MagicSamples = CharSample->SampleCount * 10;
150     Config.MagicSamples = CharSample->SampleCount;
151     while (Config.MinSamples > 0.001) {
152       ProtoList = ClusterSamples(Clusterer, &Config);
153       if (NumberOfProtos(ProtoList, true, false) > 0) {
154         break;
155       } else {
156         Config.MinSamples *= 0.95;
157         printf(
158             "0 significant protos for %s."
159             " Retrying clustering with MinSamples = %f%%\n",
160             CharSample->Label.c_str(), Config.MinSamples);
161       }
162     }
163     Config.MinSamples = SavedMinSamples;
164     AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
165     freeable_protos.push_back(ProtoList);
166     FreeClusterer(Clusterer);
167   }
168   FreeTrainingSamples(CharList);
169   int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
170   WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]);
171   FreeNormProtoList(NormProtoList);
172   for (auto &freeable_proto : freeable_protos) {
173     FreeProtoList(&freeable_proto);
174   }
175   printf("\n");
176   return 0;
177 } // main
178 
179 /*----------------------------------------------------------------------------
180               Private Code
181 ----------------------------------------------------------------------------*/
182 
183 /*----------------------------------------------------------------------------*/
184 /**
 * This routine writes the labeled prototypes to a file named
 * "normproto": a header describing the feature parameters,
 * followed by the significant prototypes of each label.
 * @param Directory  directory to write the normproto file into; if it is
 *                   null or empty, no directory prefix is added
189  * @param LabeledProtoList List of labeled protos
190  * @param feature_desc Description of the features
191  */
WriteNormProtos(const char * Directory,LIST LabeledProtoList,const FEATURE_DESC_STRUCT * feature_desc)192 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
193                             const FEATURE_DESC_STRUCT *feature_desc) {
194   FILE *File;
195   LABELEDLIST LabeledProto;
196   int N;
197 
198   std::string Filename = "";
199   if (Directory != nullptr && Directory[0] != '\0') {
200     Filename += Directory;
201     Filename += "/";
202   }
203   Filename += "normproto";
204   printf("\nWriting %s ...", Filename.c_str());
205   File = fopen(Filename.c_str(), "wb");
206   ASSERT_HOST(File);
207   fprintf(File, "%0d\n", feature_desc->NumParams);
208   WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
209   iterate(LabeledProtoList) {
210     LabeledProto = reinterpret_cast<LABELEDLIST>(LabeledProtoList->first_node());
211     N = NumberOfProtos(LabeledProto->List, true, false);
212     if (N < 1) {
213       printf(
214           "\nError! Not enough protos for %s: %d protos"
215           " (%d significant protos"
216           ", %d insignificant protos)\n",
217           LabeledProto->Label.c_str(), N, NumberOfProtos(LabeledProto->List, true, false),
218           NumberOfProtos(LabeledProto->List, false, true));
219       exit(1);
220     }
221     fprintf(File, "\n%s %d\n", LabeledProto->Label.c_str(), N);
222     WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
223   }
224   fclose(File);
225 
226 } // WriteNormProtos
227 
228 /*-------------------------------------------------------------------------*/
229 
WriteProtos(FILE * File,uint16_t N,LIST ProtoList,bool WriteSigProtos,bool WriteInsigProtos)230 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
231                         bool WriteInsigProtos) {
232   PROTOTYPE *Proto;
233 
234   // write prototypes
235   iterate(ProtoList) {
236     Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
237     if ((Proto->Significant && WriteSigProtos) || (!Proto->Significant && WriteInsigProtos)) {
238       WritePrototype(File, N, Proto);
239     }
240   }
241 } // WriteProtos
242