1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 //  Filename: classifier_tester.cpp
15 //  Purpose:  Tests a character classifier on data as formatted for training,
16 //            but doesn't have to be the same as the training data.
17 //  Author:   Ray Smith
18 
19 #include <tesseract/baseapi.h>
20 #include <algorithm>
21 #include <cstdio>
22 #include "commontraining.h"
23 #include "mastertrainer.h"
24 #include "params.h"
25 #include "tessclassifier.h"
26 #include "tesseractclass.h"
27 
28 using namespace tesseract;
29 
// Command-line flags specific to this tool.
// Name of the classifier to test. main() hands this string to
// InitializeClassifier(), which matches it against the entries of names[].
static STRING_PARAM_FLAG(classifier, "", "Classifier to test");
// Language argument given to TessBaseAPI::Init() when the classifier is built.
static STRING_PARAM_FLAG(lang, "eng", "Language to test");
// Data-path argument given to TessBaseAPI::Init() when the classifier is built.
static STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
33 
// Classifiers that can be selected on the command line. CN_COUNT is not a
// classifier: it is the number of real entries and doubles as the
// "not found" value when a name is decoded.
enum ClassifierName { CN_PRUNER, CN_FULL, CN_COUNT };

// Command-line spelling of each ClassifierName, indexed by the enum value.
static const char *const names[] = {"pruner", "full"};

// The name lookup indexes names[c] for every c < CN_COUNT, so the table and
// the enum must grow together; fail the build instead of reading out of bounds.
static_assert(sizeof(names) / sizeof(names[0]) == CN_COUNT,
              "names[] needs exactly one entry per ClassifierName");
37 
InitializeClassifier(const char * classifer_name,const UNICHARSET & unicharset,int argc,char ** argv,tesseract::TessBaseAPI ** api)38 static tesseract::ShapeClassifier *InitializeClassifier(const char *classifer_name,
39                                                         const UNICHARSET &unicharset, int argc,
40                                                         char **argv, tesseract::TessBaseAPI **api) {
41   // Decode the classifier string.
42   ClassifierName classifier = CN_COUNT;
43   for (int c = 0; c < CN_COUNT; ++c) {
44     if (strcmp(classifer_name, names[c]) == 0) {
45       classifier = static_cast<ClassifierName>(c);
46       break;
47     }
48   }
49   if (classifier == CN_COUNT) {
50     fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
51     return nullptr;
52   }
53 
54   // We need to initialize tesseract to test.
55   *api = new tesseract::TessBaseAPI;
56   tesseract::OcrEngineMode engine_mode = tesseract::OEM_TESSERACT_ONLY;
57   tesseract::Tesseract *tesseract = nullptr;
58   tesseract::Classify *classify = nullptr;
59   if (classifier == CN_PRUNER || classifier == CN_FULL) {
60     if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(), engine_mode) < 0) {
61       fprintf(stderr, "Tesseract initialization failed!\n");
62       return nullptr;
63     }
64     tesseract = const_cast<tesseract::Tesseract *>((*api)->tesseract());
65     classify = static_cast<tesseract::Classify *>(tesseract);
66     if (classify->shape_table() == nullptr) {
67       fprintf(stderr, "Tesseract must contain a ShapeTable!\n");
68       return nullptr;
69     }
70   }
71   tesseract::ShapeClassifier *shape_classifier = nullptr;
72 
73   if (classifier == CN_PRUNER) {
74     shape_classifier = new tesseract::TessClassifier(true, classify);
75   } else if (classifier == CN_FULL) {
76     shape_classifier = new tesseract::TessClassifier(false, classify);
77   }
78   tprintf("Testing classifier %s:\n", classifer_name);
79   return shape_classifier;
80 }
81 
82 // This program has complex setup requirements, so here is some help:
83 // Two different modes, tr files and serialized mastertrainer.
84 // From tr files:
85 //   classifier_tester -U unicharset -F font_properties -X xheights
86 //     -classifier x -lang lang [-output_trainer trainer] *.tr
87 // From a serialized trainer:
88 //  classifier_tester -input_trainer trainer [-lang lang] -classifier x
89 //
90 // In the first case, the unicharset must be the unicharset from within
91 // the classifier under test, and the font_properties and xheights files must
92 // match the files used during training.
93 // In the second case, the trainer file must have been prepared from
94 // some previous run of shapeclustering, mftraining, or classifier_tester
95 // using the same conditions as above, ie matching unicharset/font_properties.
96 //
97 // Available values of classifier (x above) are:
98 // pruner   : Tesseract class pruner only.
// full     : Tesseract full classifier.
main(int argc,char ** argv)101 int main(int argc, char **argv) {
102   tesseract::CheckSharedLibraryVersion();
103   ParseArguments(&argc, &argv);
104   std::string file_prefix;
105   auto trainer = tesseract::LoadTrainingData(argv + 1, false, nullptr, file_prefix);
106   tesseract::TessBaseAPI *api;
107   // Decode the classifier string.
108   tesseract::ShapeClassifier *shape_classifier =
109       InitializeClassifier(FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);
110   if (shape_classifier == nullptr) {
111     fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str());
112     return 1;
113   }
114 
115   // We want to test junk as well if it is available.
116   // trainer->IncludeJunk();
117   // We want to test with replicated samples too.
118   trainer->ReplicateAndRandomizeSamplesIfRequired();
119 
120   trainer->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR,
121                                    std::max(3, static_cast<int>(FLAGS_debug_level)), false,
122                                    shape_classifier, nullptr);
123   delete shape_classifier;
124   delete api;
125 
126   return 0;
127 } /* main */
128