1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14 ///////////////////////////////////////////////////////////////////////
15 
16 #ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
17 #define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
18 
19 #include "matrix.h"
20 #include "statistc.h"
21 
22 struct Pix;
23 
24 namespace tesseract {
25 
26 template <typename T>
27 class UnicityTable;
28 struct FontInfo;
29 class FontInfoTable;
30 class SampleIterator;
31 class ShapeClassifier;
32 class TrainingSample;
33 struct UnicharRating;
34 
35 // Enumeration of the different types of error count.
36 // Error counts work as follows:
37 //
38 // Ground truth is a valid unichar-id / font-id pair:
39 //        Number of classifier answers?
40 //          0                       >0
41 //     CT_REJECT          unichar-id matches top shape?
42 //     __________             yes!                      no
43 //                   CT_UNICHAR_TOP_OK           CT_UNICHAR_TOP1_ERR
44 //      Top shape-id has multiple unichars?   2nd shape unichar id matches?
45 //            yes!              no              yes!              no
46 //      CT_OK_MULTI_UNICHAR     |              _____    CT_UNICHAR_TOP2_ERR
47 //             Font attributes match?                 Any unichar-id matches?
48 //              yes!              no                  yes!        no
49 //      CT_FONT_ATTR_OK   CT_FONT_ATTR_ERR          ______  CT_UNICHAR_TOPN_ERR
50 //                |       __________________                 _________________
51 //      Top shape-id has multiple font attrs?
52 //            yes!              no
53 //      CT_OK_MULTI_FONT
54 //      _____________________________
55 //
56 // Note that multiple counts may be activated for a single sample!
57 //
58 // Ground truth is for a fragment/n-gram that is NOT in the unicharset.
59 // This is called junk and is expected to be rejected:
60 //        Number of classifier answers?
61 //          0                       >0
62 //     CT_REJECTED_JUNK     CT_ACCEPTED_JUNK
63 //
64 // Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores
65 // the mean rank of the correct result, counting from 0, and with an error
66 // receiving the number of answers as the correct rank.
67 //
68 // Keep in sync with the ReportString function.
69 enum CountTypes {
70   CT_UNICHAR_TOP_OK, // Top shape contains correct unichar id.
71   // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of
72   // kRatingEpsilon from the first result in each group. The real top choice
73   // is measured using TOPTOP.
74   CT_UNICHAR_TOP1_ERR,   // Top shape does not contain correct unichar id.
75   CT_UNICHAR_TOP2_ERR,   // Top 2 shapes don't contain correct unichar id.
76   CT_UNICHAR_TOPN_ERR,   // No output shape contains correct unichar id.
77   CT_UNICHAR_TOPTOP_ERR, // Very top choice not correct.
78   CT_OK_MULTI_UNICHAR,   // Top shape id has correct unichar id, and others.
79   CT_OK_JOINED,          // Top shape id is correct but marked joined.
80   CT_OK_BROKEN,          // Top shape id is correct but marked broken.
81   CT_REJECT,             // Classifier hates this.
82   CT_FONT_ATTR_ERR,      // Top unichar OK, but font attributes incorrect.
83   CT_OK_MULTI_FONT,      // CT_FONT_ATTR_OK but there are multiple font attrs.
84   CT_NUM_RESULTS,        // Number of answers produced.
85   CT_RANK,               // Rank of correct answer.
86   CT_REJECTED_JUNK,      // Junk that was correctly rejected.
87   CT_ACCEPTED_JUNK,      // Junk that was incorrectly classified otherwise.
88 
89   CT_SIZE // Number of types for array sizing.
90 };
91 
92 // Class to encapsulate all the functionality and sub-structures required
93 // to count errors for an isolated character classifier (ShapeClassifier).
94 class ErrorCounter {
95 public:
96   // Computes and returns the unweighted boosting_mode error rate of the given
97   // classifier. Can be used for testing, or inside an iterative training
98   // system, including one that uses boosting.
99   // report_levels:
100   // 0 = no output.
101   // 1 = bottom-line error rate.
102   // 2 = bottom-line error rate + time.
103   // 3 = font-level error rate + time.
104   // 4 = list of all errors + short classifier debug output on 16 errors.
105   // 5 = list of all errors + short classifier debug output on 25 errors.
106   // * The boosting_mode determines which error type is used for computing the
107   //   scaled_error output, and setting the is_error flag in the samples.
108   // * The fontinfo_table is used to get string font names for the debug
109   //   output, and also to count font attributes errors.
110   // * The page_images vector may contain a Pix* (which may be nullptr) for each
111   //   page index assigned to the samples.
112   // * The it provides encapsulated iteration over some sample set.
113   // * The outputs unichar_error, scaled_error and totals_report are all
114   //   optional.
115   // * If not nullptr, unichar error gets the top1 unichar error rate.
116   // * Scaled_error gets the error chosen by boosting_mode weighted by the
117   //   weights on the samples.
118   // * Fonts_report gets a string summarizing the error rates for each font in
119   //   both human-readable form and as a tab-separated list of error counts.
120   //   The human-readable form is all before the first tab.
121   // * The return value is the un-weighted version of the scaled_error.
122   static double ComputeErrorRate(ShapeClassifier *classifier, int report_level,
123                                  CountTypes boosting_mode, const FontInfoTable &fontinfo_table,
124                                  const std::vector<Image > &page_images, SampleIterator *it,
125                                  double *unichar_error, double *scaled_error, std::string *fonts_report);
126   // Tests a pair of classifiers, debugging errors of the new against the old.
127   // See errorcounter.h for description of arguments.
128   // Iterates over the samples, calling the classifiers in normal/silent mode.
129   // If the new_classifier makes a boosting_mode error that the old_classifier
130   // does not, and the appropriate, it will then call the new_classifier again
131   // with a debug flag and a keep_this argument to find out what is going on.
132   static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier,
133                              CountTypes boosting_mode, const FontInfoTable &fontinfo_table,
134                              const std::vector<Image > &page_images, SampleIterator *it);
135 
136 private:
137   // Simple struct to hold an array of counts.
138   struct Counts {
139     Counts();
140     // Adds other into this for computing totals.
141     void operator+=(const Counts &other);
142 
143     int n[CT_SIZE];
144   };
145 
146   // Constructor is private. Only anticipated use of ErrorCounter is via
147   // the static ComputeErrorRate.
148   ErrorCounter(const UNICHARSET &unicharset, int fontsize);
149   ~ErrorCounter() = default;
150 
151   // Accumulates the errors from the classifier results on a single sample.
152   // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
153   // boosting_mode selects the type of error to be used for boosting and the
154   // is_error_ member of sample is set according to whether the required type
155   // of error occurred. The font_table provides access to font properties
156   // for error counting and shape_table is used to understand the relationship
157   // between unichar_ids and shape_ids in the results
158   bool AccumulateErrors(bool debug, CountTypes boosting_mode, const FontInfoTable &font_table,
159                         const std::vector<UnicharRating> &results, TrainingSample *sample);
160 
161   // Accumulates counts for junk. Counts only whether the junk was correctly
162   // rejected or not.
163   bool AccumulateJunk(bool debug, const std::vector<UnicharRating> &results,
164                       TrainingSample *sample);
165 
166   // Creates a report of the error rate. The report_level controls the detail
167   // that is reported to stderr via tprintf:
168   // 0   -> no output.
169   // >=1 -> bottom-line error rate.
170   // >=3 -> font-level error rate.
171   // boosting_mode determines the return value. It selects which (un-weighted)
172   // error rate to return.
173   // The fontinfo_table from MasterTrainer provides the names of fonts.
174   // The it determines the current subset of the training samples.
175   // If not nullptr, the top-choice unichar error rate is saved in
176   // unichar_error. If not nullptr, the report string is saved in fonts_report.
177   // (Ignoring report_level).
178   double ReportErrors(int report_level, CountTypes boosting_mode,
179                       const FontInfoTable &fontinfo_table, const SampleIterator &it,
180                       double *unichar_error, std::string *fonts_report);
181 
182   // Sets the report string to a combined human and machine-readable report
183   // string of the error rates.
184   // Returns false if there is no data, leaving report unchanged, unless
185   // even_if_empty is true.
186   static bool ReportString(bool even_if_empty, const Counts &counts, std::string &report);
187 
188   // Computes the error rates and returns in rates which is an array of size
189   // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
190   static bool ComputeRates(const Counts &counts, double rates[CT_SIZE]);
191 
192   // Total scaled error used by boosting algorithms.
193   double scaled_error_;
194   // Difference in result rating to be thought of as an "equal" choice.
195   double rating_epsilon_;
196   // Vector indexed by font_id from the samples of error accumulators.
197   std::vector<Counts> font_counts_;
198   // Counts of the results that map each unichar_id (from samples) to an
199   // incorrect shape_id.
200   GENERIC_2D_ARRAY<int> unichar_counts_;
201   // Count of the number of times each shape_id occurs, is correct, and multi-
202   // unichar.
203   std::vector<int> multi_unichar_counts_;
204   // Histogram of scores (as percent) for correct answers.
205   STATS ok_score_hist_;
206   // Histogram of scores (as percent) for incorrect answers.
207   STATS bad_score_hist_;
208   // Unicharset for printing character ids in results.
209   const UNICHARSET &unicharset_;
210 };
211 
212 } // namespace tesseract.
213 
214 #endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */
215