// Copyright 2008 Google Inc. All Rights Reserved.
// Author: scharron@google.com (Samuel Charron)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
13 
14 #define _USE_MATH_DEFINES // for M_PI
15 
16 #include "commontraining.h"
17 
18 #ifdef DISABLED_LEGACY_ENGINE
19 
20 #  include "params.h"
21 #  include "tprintf.h"
22 
23 namespace tesseract {
24 
// Command-line flags shared by the training tools. This branch is compiled
// when the legacy (classic) engine is disabled, so only flag declarations
// and ParseArguments are provided.
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
STRING_PARAM_FLAG(D, "", "Directory to write output files to");
STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
STRING_PARAM_FLAG(X, "", "File listing font xheights");
STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
STRING_PARAM_FLAG(O, "", "File to write unicharset to");
STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
STRING_PARAM_FLAG(fonts_dir, "",
                  "If empty it uses system default. Otherwise it overrides "
                  "system default font location");
STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir");
39 
40 /**
41  * This routine parses the command line arguments that were
42  * passed to the program and uses them to set relevant
43  * training-related global parameters.
44  *
45  * Globals:
46  * - Config  current clustering parameters
47  * @param argc number of command line arguments to parse
48  * @param argv command line arguments
49  * @note Exceptions: Illegal options terminate the program.
50  */
ParseArguments(int * argc,char *** argv)51 void ParseArguments(int *argc, char ***argv) {
52   std::string usage;
53   if (*argc) {
54     usage += (*argv)[0];
55     usage += " -v | --version | ";
56     usage += (*argv)[0];
57   }
58   usage += " [.tr files ...]";
59   tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
60 }
61 
62 } // namespace tesseract.
63 
64 #else
65 
66 #  include <allheaders.h>
67 #  include "ccutil.h"
68 #  include "classify.h"
69 #  include "cluster.h"
70 #  include "clusttool.h"
71 #  include "featdefs.h"
72 #  include "fontinfo.h"
73 #  include "intfeaturespace.h"
74 #  include "mastertrainer.h"
75 #  include "mf.h"
76 #  include "oldlist.h"
77 #  include "params.h"
78 #  include "shapetable.h"
79 #  include "tessdatamanager.h"
80 #  include "tprintf.h"
81 #  include "unicity_table.h"
82 
83 namespace tesseract {
84 
// Global Variables.

// Global clustering parameters used by the training tools; the defaults
// correspond to the command-line settings
// -M 0.625   -B 0.05   -I 1.0   -C 1e-6.
// The clusterconfig_* flags below override individual fields in
// ParseArguments().
CLUSTERCONFIG Config = {elliptical, 0.625, 0.05, 1.0, 1e-6, 0};
// Feature definitions shared by the training tools; initialized by
// InitFeatureDefs() in LoadTrainingData().
FEATURE_DEFS_STRUCT feature_defs;
// Parameter table used when reading the optional -configfile.
static CCUtil ccutil;

INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
static INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
static STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
STRING_PARAM_FLAG(D, "", "Directory to write output files to");
STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
STRING_PARAM_FLAG(X, "", "File listing font xheights");
STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
STRING_PARAM_FLAG(O, "", "File to write unicharset to");
STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
STRING_PARAM_FLAG(fonts_dir, "", "");
STRING_PARAM_FLAG(fontconfig_tmpdir, "", "");
static DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
                         "Min number of samples per proto as % of total");
static DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
                         "Max percentage of samples in a cluster which have more"
                         " than 1 feature in that cluster");
static DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
                         "Desired independence between dimensions");
static DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
                         "Desired confidence in prototypes created");
115 /**
116  * This routine parses the command line arguments that were
117  * passed to the program and uses them to set relevant
118  * training-related global parameters.
119  *
120  * Globals:
121  * - Config  current clustering parameters
122  * @param argc number of command line arguments to parse
123  * @param argv command line arguments
124  */
ParseArguments(int * argc,char *** argv)125 void ParseArguments(int *argc, char ***argv) {
126   std::string usage;
127   if (*argc) {
128     usage += (*argv)[0];
129     usage += " -v | --version | ";
130     usage += (*argv)[0];
131   }
132   usage += " [.tr files ...]";
133   tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
134   // Set some global values based on the flags.
135   Config.MinSamples =
136       std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
137   Config.MaxIllegal = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_max_illegal)));
138   Config.Independence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_independence)));
139   Config.Confidence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_confidence)));
140   // Set additional parameters from config file if specified.
141   if (!FLAGS_configfile.empty()) {
142     tesseract::ParamUtils::ReadParamsFile(
143         FLAGS_configfile.c_str(), tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY, ccutil.params());
144   }
145 }
146 
147 // Helper loads shape table from the given file.
LoadShapeTable(const std::string & file_prefix)148 ShapeTable *LoadShapeTable(const std::string &file_prefix) {
149   ShapeTable *shape_table = nullptr;
150   std::string shape_table_file = file_prefix;
151   shape_table_file += kShapeTableFileSuffix;
152   TFile shape_fp;
153   if (shape_fp.Open(shape_table_file.c_str(), nullptr)) {
154     shape_table = new ShapeTable;
155     if (!shape_table->DeSerialize(&shape_fp)) {
156       delete shape_table;
157       shape_table = nullptr;
158       tprintf("Error: Failed to read shape table %s\n", shape_table_file.c_str());
159     } else {
160       int num_shapes = shape_table->NumShapes();
161       tprintf("Read shape table %s of %d shapes\n", shape_table_file.c_str(), num_shapes);
162     }
163   } else {
164     tprintf("Warning: No shape table file present: %s\n", shape_table_file.c_str());
165   }
166   return shape_table;
167 }
168 
169 // Helper to write the shape_table.
WriteShapeTable(const std::string & file_prefix,const ShapeTable & shape_table)170 void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table) {
171   std::string shape_table_file = file_prefix;
172   shape_table_file += kShapeTableFileSuffix;
173   FILE *fp = fopen(shape_table_file.c_str(), "wb");
174   if (fp != nullptr) {
175     if (!shape_table.Serialize(fp)) {
176       fprintf(stderr, "Error writing shape table: %s\n", shape_table_file.c_str());
177     }
178     fclose(fp);
179   } else {
180     fprintf(stderr, "Error creating shape table: %s\n", shape_table_file.c_str());
181   }
182 }
183 
184 /**
185  * Creates a MasterTrainer and loads the training data into it:
186  * Initializes feature_defs and IntegerFX.
187  * Loads the shape_table if shape_table != nullptr.
188  * Loads initial unicharset from -U command-line option.
189  * If FLAGS_T is set, loads the majority of data from there, else:
190  *  - Loads font info from -F option.
191  *  - Loads xheights from -X option.
192  *  - Loads samples from .tr files in remaining command-line args.
193  *  - Deletes outliers and computes canonical samples.
194  *  - If FLAGS_output_trainer is set, saves the trainer for future use.
195  *    TODO: Who uses that? There is currently no code which reads it.
196  * Computes canonical and cloud features.
197  * If shape_table is not nullptr, but failed to load, make a fake flat one,
198  * as shape clustering was not run.
199  */
LoadTrainingData(const char * const * filelist,bool replication,ShapeTable ** shape_table,std::string & file_prefix)200 std::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,
201                                                 ShapeTable **shape_table, std::string &file_prefix) {
202   InitFeatureDefs(&feature_defs);
203   InitIntegerFX();
204   file_prefix = "";
205   if (!FLAGS_D.empty()) {
206     file_prefix += FLAGS_D.c_str();
207     file_prefix += "/";
208   }
209   // If we are shape clustering (nullptr shape_table) or we successfully load
210   // a shape_table written by a previous shape clustering, then
211   // shape_analysis will be true, meaning that the MasterTrainer will replace
212   // some members of the unicharset with their fragments.
213   bool shape_analysis = false;
214   if (shape_table != nullptr) {
215     *shape_table = LoadShapeTable(file_prefix);
216     if (*shape_table != nullptr) {
217       shape_analysis = true;
218     }
219   } else {
220     shape_analysis = true;
221   }
222   auto trainer = std::make_unique<MasterTrainer>(NM_CHAR_ANISOTROPIC, shape_analysis, replication,
223                                                  FLAGS_debug_level);
224   IntFeatureSpace fs;
225   fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
226   trainer->LoadUnicharset(FLAGS_U.c_str());
227   // Get basic font information from font_properties.
228   if (!FLAGS_F.empty()) {
229     if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
230       return {};
231     }
232   }
233   if (!FLAGS_X.empty()) {
234     if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
235       return {};
236     }
237   }
238   trainer->SetFeatureSpace(fs);
239   // Load training data from .tr files in filelist (terminated by nullptr).
240   for (const char *page_name = *filelist++; page_name != nullptr; page_name = *filelist++) {
241     tprintf("Reading %s ...\n", page_name);
242     trainer->ReadTrainingSamples(page_name, feature_defs, false);
243 
244     // If there is a file with [lang].[fontname].exp[num].fontinfo present,
245     // read font spacing information in to fontinfo_table.
246     int pagename_len = strlen(page_name);
247     char *fontinfo_file_name = new char[pagename_len + 7];
248     strncpy(fontinfo_file_name, page_name, pagename_len - 2);  // remove "tr"
249     strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
250     trainer->AddSpacingInfo(fontinfo_file_name);
251     delete[] fontinfo_file_name;
252 
253     // Load the images into memory if required by the classifier.
254     if (FLAGS_load_images) {
255       std::string image_name = page_name;
256       // Chop off the tr and replace with tif. Extension must be tif!
257       image_name.resize(image_name.length() - 2);
258       image_name += "tif";
259       trainer->LoadPageImages(image_name.c_str());
260     }
261   }
262   trainer->PostLoadCleanup();
263   // Write the master trainer if required.
264   if (!FLAGS_output_trainer.empty()) {
265     FILE *fp = fopen(FLAGS_output_trainer.c_str(), "wb");
266     if (fp == nullptr) {
267       tprintf("Can't create saved trainer data!\n");
268     } else {
269       trainer->Serialize(fp);
270       fclose(fp);
271     }
272   }
273   trainer->PreTrainingSetup();
274   if (!FLAGS_O.empty() && !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
275     fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
276     return {};
277   }
278 
279   if (shape_table != nullptr) {
280     // If we previously failed to load a shapetable, then shape clustering
281     // wasn't run so make a flat one now.
282     if (*shape_table == nullptr) {
283       *shape_table = new ShapeTable;
284       trainer->SetupFlatShapeTable(*shape_table);
285       tprintf("Flat shape table summary: %s\n", (*shape_table)->SummaryStr().c_str());
286     }
287     (*shape_table)->set_unicharset(trainer->unicharset());
288   }
289   return trainer;
290 }
291 
292 /*---------------------------------------------------------------------------*/
293 /**
294  * This routine searches through a list of labeled lists to find
295  * a list with the specified label.  If a matching labeled list
296  * cannot be found, nullptr is returned.
297  * @param List list to search
298  * @param Label label to search for
299  * @return Labeled list with the specified label or nullptr.
300  * @note Globals: none
301  */
FindList(LIST List,const std::string & Label)302 LABELEDLIST FindList(LIST List, const std::string &Label) {
303   LABELEDLIST LabeledList;
304 
305   iterate(List) {
306     LabeledList = reinterpret_cast<LABELEDLIST>(List->first_node());
307     if (LabeledList->Label == Label) {
308       return (LabeledList);
309     }
310   }
311   return (nullptr);
312 
313 } /* FindList */
314 
/*---------------------------------------------------------------------------*/
// TODO(rays) This is now used only by cntraining. Convert cntraining to use
// the new method or get rid of it entirely.
/**
 * This routine reads training samples from a file and
 * places them into a data structure which organizes the
 * samples by FontName and CharName.  It then returns this
 * data structure.
 * @param feature_definitions layout of the feature sets in the .tr file
 * @param feature_name short name of the single feature type to keep
 * @param max_samples maximum samples kept per class; <= 0 means unlimited
 * @param unicharset if non-null, unseen unichars are inserted into it
 * @param file open text file to read samples from
 * @param training_samples in/out LIST of LABELEDLIST, one entry per unichar
 */
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name,
                         int max_samples, UNICHARSET *unicharset, FILE *file,
                         LIST *training_samples) {
  char buffer[2048];
  char unichar[UNICHAR_LEN + 1];
  LABELEDLIST char_sample;
  FEATURE_SET feature_samples;
  uint32_t feature_type = ShortNameToFeatureType(feature_definitions, feature_name);

  // Zero out the font_sample_count for all the classes.
  LIST it = *training_samples;
  iterate(it) {
    char_sample = reinterpret_cast<LABELEDLIST>(it->first_node());
    char_sample->font_sample_count = 0;
  }

  while (fgets(buffer, 2048, file) != nullptr) {
    if (buffer[0] == '\n') {
      continue;
    }

    // The second whitespace-separated token on the header line is the
    // unichar label.
    // NOTE(review): %s has no field-width limit; this assumes the token
    // always fits in UNICHAR_LEN chars -- confirm .tr files guarantee this.
    sscanf(buffer, "%*s %s", unichar);
    if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
      unicharset->unichar_insert(unichar);
      if (unicharset->size() > MAX_NUM_CLASSES) {
        tprintf(
            "Error: Size of unicharset in training is "
            "greater than MAX_NUM_CLASSES\n");
        exit(1);
      }
    }
    // Find (or create) the per-unichar bucket for this sample.
    char_sample = FindList(*training_samples, unichar);
    if (char_sample == nullptr) {
      char_sample = new LABELEDLISTNODE(unichar);
      *training_samples = push(*training_samples, char_sample);
    }
    auto char_desc = ReadCharDescription(feature_definitions, file);
    feature_samples = char_desc->FeatureSets[feature_type];
    if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
      // Keep the requested feature set; the bucket takes ownership of it.
      char_sample->List = push(char_sample->List, feature_samples);
      char_sample->SampleCount++;
      char_sample->font_sample_count++;
    } else {
      // Over the per-class cap: discard this sample's feature set.
      delete feature_samples;
    }
    // Free the feature sets of every type other than the one requested.
    for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
      if (feature_type != i) {
        delete char_desc->FeatureSets[i];
      }
    }
    delete char_desc;
  }
} // ReadTrainingSamples
383 
384 /*---------------------------------------------------------------------------*/
385 /**
386  * This routine deallocates all of the space allocated to
387  * the specified list of training samples.
388  * @param CharList list of all fonts in document
389  */
FreeTrainingSamples(LIST CharList)390 void FreeTrainingSamples(LIST CharList) {
391   LABELEDLIST char_sample;
392   FEATURE_SET FeatureSet;
393   LIST FeatureList;
394 
395   LIST nodes = CharList;
396   iterate(CharList) { /* iterate through all of the fonts */
397     char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());
398     FeatureList = char_sample->List;
399     iterate(FeatureList) { /* iterate through all of the classes */
400       FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
401       delete FeatureSet;
402     }
403     FreeLabeledList(char_sample);
404   }
405   destroy(nodes);
406 } /* FreeTrainingSamples */
407 
/*---------------------------------------------------------------------------*/
/**
 * This routine deallocates all of the memory consumed by
 * a labeled list.  It does not free any memory which may be
 * consumed by the items in the list.
 * @param LabeledList labeled list to be freed
 * @note Globals: none
 */
void FreeLabeledList(LABELEDLIST LabeledList) {
  // Destroy the list nodes (not the items they point to), then the
  // labeled node itself.
  destroy(LabeledList->List);
  delete LabeledList;
} /* FreeLabeledList */
420 
421 /*---------------------------------------------------------------------------*/
422 /**
423  * This routine reads samples from a LABELEDLIST and enters
424  * those samples into a clusterer data structure.  This
425  * data structure is then returned to the caller.
426  * @param char_sample: LABELEDLIST that holds all the feature information for a
427  * @param FeatureDefs
428  * @param program_feature_type
429  * given character.
430  * @return Pointer to new clusterer data structure.
431  * @note Globals: None
432  */
SetUpForClustering(const FEATURE_DEFS_STRUCT & FeatureDefs,LABELEDLIST char_sample,const char * program_feature_type)433 CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample,
434                               const char *program_feature_type) {
435   uint16_t N;
436   CLUSTERER *Clusterer;
437   LIST FeatureList = nullptr;
438   FEATURE_SET FeatureSet = nullptr;
439 
440   int32_t desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
441   N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
442   Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
443 
444   FeatureList = char_sample->List;
445   uint32_t CharID = 0;
446   std::vector<float> Sample;
447   iterate(FeatureList) {
448     FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
449     for (int i = 0; i < FeatureSet->MaxNumFeatures; i++) {
450       if (Sample.empty()) {
451         Sample.resize(N);
452       }
453       for (int j = 0; j < N; j++) {
454         Sample[j] = FeatureSet->Features[i]->Params[j];
455       }
456       MakeSample(Clusterer, &Sample[0], CharID);
457     }
458     CharID++;
459   }
460   return Clusterer;
461 
462 } /* SetUpForClustering */
463 
/*------------------------------------------------------------------------*/
/**
 * Merges insignificant ("red") prototypes into their nearest neighbor when
 * that neighbor is close enough (distance < 0.125), then promotes to
 * significant any unmerged proto that has accumulated enough samples.
 * @param ProtoList list of PROTOTYPE, modified in place
 * @param label class label; enables debug output when it equals -test_ch
 * @param Clusterer supplies SampleSize/ParamDesc for distance computation
 *        and NumChar for the significance threshold
 * @param clusterconfig supplies the MinSamples fraction
 */
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer,
                              CLUSTERCONFIG *clusterconfig) {
  PROTOTYPE *Prototype;
  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;

  LIST pProtoList = ProtoList;
  iterate(pProtoList) {
    Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
    if (Prototype->Significant || Prototype->Merged) {
      continue;
    }
    // Only prototypes closer than this threshold are merge candidates.
    float best_dist = 0.125;
    PROTOTYPE *best_match = nullptr;
    // Find the nearest alive prototype.
    LIST list_it = ProtoList;
    iterate(list_it) {
      auto *test_p = reinterpret_cast<PROTOTYPE *>(list_it->first_node());
      if (test_p != Prototype && !test_p->Merged) {
        float dist = ComputeDistance(Clusterer->SampleSize, Clusterer->ParamDesc, &Prototype->Mean[0],
                                     &test_p->Mean[0]);
        if (dist < best_dist) {
          best_match = test_p;
          best_dist = dist;
        }
      }
    }
    if (best_match != nullptr && !best_match->Significant) {
      // Merge two red protos: best_match absorbs Prototype's samples and
      // takes the sample-weighted mean of the pair.
      if (debug) {
        tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n", best_match->NumSamples,
                Prototype->NumSamples, best_match->Mean[0], best_match->Mean[1], Prototype->Mean[0],
                Prototype->Mean[1]);
      }
      best_match->NumSamples =
          MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, best_match->NumSamples,
                        Prototype->NumSamples, &best_match->Mean[0], &best_match->Mean[0], &Prototype->Mean[0]);
      Prototype->NumSamples = 0;
      Prototype->Merged = true;
    } else if (best_match != nullptr) {
      // Nearest neighbor is significant ("green"): mark this proto merged
      // but leave its samples alone.
      if (debug) {
        tprintf("Red proto at %g,%g matched a green one at %g,%g\n", Prototype->Mean[0],
                Prototype->Mean[1], best_match->Mean[0], best_match->Mean[1]);
      }
      Prototype->Merged = true;
    }
  }
  // Mark significant those that now have enough samples.
  int min_samples = static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);
  pProtoList = ProtoList;
  iterate(pProtoList) {
    Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
    // Process insignificant protos that do not match a green one
    if (!Prototype->Significant && Prototype->NumSamples >= min_samples && !Prototype->Merged) {
      if (debug) {
        tprintf("Red proto at %g,%g becoming green\n", Prototype->Mean[0], Prototype->Mean[1]);
      }
      Prototype->Significant = true;
    }
  }
} /* MergeInsignificantProtos */
524 
525 /*-----------------------------------------------------------------------------*/
CleanUpUnusedData(LIST ProtoList)526 void CleanUpUnusedData(LIST ProtoList) {
527   PROTOTYPE *Prototype;
528 
529   iterate(ProtoList) {
530     Prototype = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
531     delete[] Prototype->Variance.Elliptical;
532     Prototype->Variance.Elliptical = nullptr;
533     delete[] Prototype->Magnitude.Elliptical;
534     Prototype->Magnitude.Elliptical = nullptr;
535     delete[] Prototype->Weight.Elliptical;
536     Prototype->Weight.Elliptical = nullptr;
537   }
538 }
539 
540 /*------------------------------------------------------------------------*/
RemoveInsignificantProtos(LIST ProtoList,bool KeepSigProtos,bool KeepInsigProtos,int N)541 LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
542 
543 {
544   LIST NewProtoList = NIL_LIST;
545   auto pProtoList = ProtoList;
546   iterate(pProtoList) {
547     auto Proto = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
548     if ((Proto->Significant && KeepSigProtos) || (!Proto->Significant && KeepInsigProtos)) {
549       auto NewProto = new PROTOTYPE;
550       NewProto->Mean = Proto->Mean;
551       NewProto->Significant = Proto->Significant;
552       NewProto->Style = Proto->Style;
553       NewProto->NumSamples = Proto->NumSamples;
554       NewProto->Cluster = nullptr;
555       NewProto->Distrib.clear();
556 
557       if (Proto->Variance.Elliptical != nullptr) {
558         NewProto->Variance.Elliptical = new float[N];
559         for (int i = 0; i < N; i++) {
560           NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
561         }
562       } else {
563         NewProto->Variance.Elliptical = nullptr;
564       }
565       //---------------------------------------------
566       if (Proto->Magnitude.Elliptical != nullptr) {
567         NewProto->Magnitude.Elliptical = new float[N];
568         for (int i = 0; i < N; i++) {
569           NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
570         }
571       } else {
572         NewProto->Magnitude.Elliptical = nullptr;
573       }
574       //------------------------------------------------
575       if (Proto->Weight.Elliptical != nullptr) {
576         NewProto->Weight.Elliptical = new float[N];
577         for (int i = 0; i < N; i++) {
578           NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
579         }
580       } else {
581         NewProto->Weight.Elliptical = nullptr;
582       }
583 
584       NewProto->TotalMagnitude = Proto->TotalMagnitude;
585       NewProto->LogMagnitude = Proto->LogMagnitude;
586       NewProtoList = push_last(NewProtoList, NewProto);
587     }
588   }
589   FreeProtoList(&ProtoList);
590   return (NewProtoList);
591 } /* RemoveInsignificantProtos */
592 
593 /*----------------------------------------------------------------------------*/
FindClass(LIST List,const std::string & Label)594 MERGE_CLASS FindClass(LIST List, const std::string &Label) {
595   MERGE_CLASS MergeClass;
596 
597   iterate(List) {
598     MergeClass = reinterpret_cast<MERGE_CLASS>(List->first_node());
599     if (MergeClass->Label == Label) {
600       return (MergeClass);
601     }
602   }
603   return (nullptr);
604 
605 } /* FindClass */
606 
607 /*-----------------------------------------------------------------------------*/
608 /**
609  * This routine deallocates all of the space allocated to
610  * the specified list of training samples.
611  * @param ClassList list of all fonts in document
612  */
FreeLabeledClassList(LIST ClassList)613 void FreeLabeledClassList(LIST ClassList) {
614   MERGE_CLASS MergeClass;
615 
616   LIST nodes = ClassList;
617   iterate(ClassList) /* iterate through all of the fonts */
618   {
619     MergeClass = reinterpret_cast<MERGE_CLASS>(ClassList->first_node());
620     FreeClass(MergeClass->Class);
621     delete MergeClass;
622   }
623   destroy(nodes);
624 
625 } /* FreeLabeledClassList */
626 
/**
 * Converts a list of merged classes into an array of CLASS_STRUCT indexed by
 * unichar id: protos are copied with normalized (A, B, C) line coefficients
 * and configuration bit vectors are duplicated.  The caller owns the
 * returned array (delete[]) and the bit vectors it contains.
 * @param unicharset maps each class label to its index in the result array
 * @param LabeledClassList list of MERGE_CLASS to convert
 * @return newly allocated array of unicharset.size() CLASS_STRUCTs
 */
CLASS_STRUCT *SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList) {
  MERGE_CLASS MergeClass;
  CLASS_TYPE Class;
  int NumProtos;
  int NumConfigs;
  int NumWords;
  int i, j;
  float Values[3];
  PROTO_STRUCT *NewProto;
  PROTO_STRUCT *OldProto;
  BIT_VECTOR NewConfig;
  BIT_VECTOR OldConfig;

  //  printf("Float2Int ...\n");

  auto *float_classes = new CLASS_STRUCT[unicharset.size()];
  iterate(LabeledClassList) {
    UnicityTable<int> font_set;
    MergeClass = reinterpret_cast<MERGE_CLASS>(LabeledClassList->first_node());
    // Place the converted class at the slot for its unichar id.
    Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label.c_str())];
    NumProtos = MergeClass->Class->NumProtos;
    NumConfigs = MergeClass->Class->NumConfigs;
    // Move the font set out of the merge class (via a temp) into the
    // destination class below.
    font_set.move(&MergeClass->Class->font_set);
    Class->NumProtos = NumProtos;
    Class->MaxNumProtos = NumProtos;
    Class->Prototypes.resize(NumProtos);
    for (i = 0; i < NumProtos; i++) {
      NewProto = ProtoIn(Class, i);
      OldProto = ProtoIn(MergeClass->Class, i);
      // Normalize (X, Y, Angle) into the proto's A, B, C coefficients;
      // the raw X/Y/Length/Angle fields are copied through unchanged.
      Values[0] = OldProto->X;
      Values[1] = OldProto->Y;
      Values[2] = OldProto->Angle;
      Normalize(Values);
      NewProto->X = OldProto->X;
      NewProto->Y = OldProto->Y;
      NewProto->Length = OldProto->Length;
      NewProto->Angle = OldProto->Angle;
      NewProto->A = Values[0];
      NewProto->B = Values[1];
      NewProto->C = Values[2];
    }

    Class->NumConfigs = NumConfigs;
    Class->MaxNumConfigs = NumConfigs;
    Class->font_set.move(&font_set);
    Class->Configurations.resize(NumConfigs);
    NumWords = WordsInVectorOfSize(NumProtos);
    for (i = 0; i < NumConfigs; i++) {
      // Duplicate each configuration bit vector word-by-word.
      NewConfig = NewBitVector(NumProtos);
      OldConfig = MergeClass->Class->Configurations[i];
      for (j = 0; j < NumWords; j++) {
        NewConfig[j] = OldConfig[j];
      }
      Class->Configurations[i] = NewConfig;
    }
  }
  return float_classes;
} // SetUpForFloat2Int
686 
/*--------------------------------------------------------------------------*/
/**
 * Converts (x, y, angle) in Values into normalized line coefficients
 * (A, B, C).  The angle is given as a fraction of a full turn (hence the
 * multiplication by 2*pi); the resulting (A, B) direction vector has unit
 * length and C is the normalized intercept.
 * @param Values in: {x, y, angle-fraction}; out: {A, B, C}
 */
void Normalize(float *Values) {
  const float slope = tan(Values[2] * 2 * M_PI);
  const float intercept = Values[1] - slope * Values[0];
  const float normalizer = 1 / sqrt(slope * slope + 1.0);

  Values[0] = slope * normalizer;
  Values[1] = -normalizer;
  Values[2] = intercept * normalizer;
} // Normalize
701 
702 /*-------------------------------------------------------------------------*/
FreeNormProtoList(LIST CharList)703 void FreeNormProtoList(LIST CharList)
704 
705 {
706   LABELEDLIST char_sample;
707 
708   LIST nodes = CharList;
709   iterate(CharList) /* iterate through all of the fonts */
710   {
711     char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());
712     FreeLabeledList(char_sample);
713   }
714   destroy(nodes);
715 
716 } // FreeNormProtoList
717 
718 /*---------------------------------------------------------------------------*/
AddToNormProtosList(LIST * NormProtoList,LIST ProtoList,const std::string & CharName)719 void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName) {
720   auto LabeledProtoList = new LABELEDLISTNODE(CharName.c_str());
721   iterate(ProtoList) {
722     auto Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
723     LabeledProtoList->List = push(LabeledProtoList->List, Proto);
724   }
725   *NormProtoList = push(*NormProtoList, LabeledProtoList);
726 }
727 
728 /*---------------------------------------------------------------------------*/
NumberOfProtos(LIST ProtoList,bool CountSigProtos,bool CountInsigProtos)729 int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos) {
730   int N = 0;
731   iterate(ProtoList) {
732     auto *Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
733     if ((Proto->Significant && CountSigProtos) || (!Proto->Significant && CountInsigProtos)) {
734       N++;
735     }
736   }
737   return (N);
738 }
739 
740 } // namespace tesseract.
741 
742 #endif // def DISABLED_LEGACY_ENGINE
743