1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14 ///////////////////////////////////////////////////////////////////////
15 
16 #ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
17 #define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
18 
19 namespace tesseract {
20 
// Forward declarations only: this header refers to the types below solely
// through pointers and references, so their full definitions are not
// required here.
struct UnicharAndFonts;

class IndexMapBiDi;
class IntFeatureMap;
class ShapeTable;
class TrainingSample;
class TrainingSampleSet;
27 
28 // Iterator class to encapsulate the complex iteration involved in getting
29 // all samples of all shapes needed for a classification problem.
30 //
31 // =====INPUTS TO Init FUNCTION=====
32 // The charset_map defines a subset of the sample_set classes (with a nullptr
33 // shape_table, or the shape_table classes if not nullptr.)
34 //
35 // The shape_table (if not nullptr) defines the mapping from shapes to
36 // font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
37 //
38 // The sample_set holds the samples and provides indexed access to samples
39 // of font_id/class_id pairs.
40 //
41 // If randomize is true, the samples are perturbed slightly, but the
42 // perturbation is guaranteed to be the same for multiple identical
43 // iterations.
44 //
45 // =====DIFFERENT COMBINATIONS OF INPUTS=====
46 // nullptr shape_table:
47 // Without a shape_table, everything works in UNICHAR_IDs.
48 //
49 // nullptr shape_table, nullptr charset_map:
50 // Iterations simply run over the samples in the order the samples occur in the
51 // input files.
52 // GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
53 //
54 // nullptr shape_table, non-nullptr charset_map:
55 // When shape_table is nullptr, the charset_map indexes unichar_ids directly,
56 // and an iteration returns all samples of all chars in the charset_map, which
57 // is a subset of the full unicharset.
58 // The iteration will be in groups of the same unichar_id, in the order
59 // defined by the charset_map.
60 // GetCompactClassID returns the charset_map index of a sample, and
61 // GetSparseClassID returns the sample UNICHAR_ID.
62 //
63 // Non-nullptr shape_table:
64 // With a shape_table, samples are grouped according to the shape_table, so
65 // multiple UNICHAR_IDs and fonts may be grouped together, and everything
66 // works in shape_ids.
67 //
// Non-nullptr shape_table, nullptr charset_map:
69 // Iterations simply run over the samples in the order of shape_id.
70 // GetCompactClassID and GetSparseClassID both return the shape_id.
71 // (If you want the unichar_id or font_id, the sample still has them.)
72 //
// Non-nullptr shape_table, non-nullptr charset_map:
74 // When shape_table is not nullptr, the charset_map indexes and subsets shapes
75 // in the shape_table, and iterations will be in shape_table order, not
76 // charset_map order.
77 // GetCompactClassID returns the charset_map index of a shape, and
78 // GetSparseClassID returns the shape_id.
79 //
80 // =====What is SampleIterator good for?=====
81 // Inside a classifier training module, the SampleIterator has abstracted away
82 // all the different modes above.
83 // Use the following iteration to train your classifier:
// for (it.Begin(); !it.AtEnd(); it.Next()) {
//   const TrainingSample& sample = it.GetSample();
//   int class_id = it.GetCompactClassID();
//   ...
// }
87 // Your classifier may or may not be dealing with a shape_table, and may be
88 // dealing with some subset of the character/shape set. It doesn't need to
89 // know and shouldn't care. It is just learning shapes with compact class ids
90 // in the range [0, it.CompactCharsetSize()).
91 class SampleIterator {
92 public:
93   SampleIterator();
94   ~SampleIterator();
95 
96   void Clear();
97 
98   // See class comment for arguments.
99   void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize,
100             TrainingSampleSet *sample_set);
101 
102   // Iterator functions designed for use with a simple for loop:
103   // for (it.Begin(); !it.AtEnd(); it.Next()) {
104   //   const TrainingSample& sample = it.GetSample();
105   //   int class_id = it.GetCompactClassID();
106   //   ...
107   // }
108   void Begin();
109   bool AtEnd() const;
110   const TrainingSample &GetSample() const;
111   TrainingSample *MutableSample() const;
112   // Returns the total index (from the original set of samples) of the current
113   // sample.
114   int GlobalSampleIndex() const;
115   // Returns the index of the current sample in compact charset space, so
116   // in a 2-class problem between x and y, the returned indices will all be
117   // 0 or 1, and have nothing to do with the unichar_ids.
118   // If the charset_map_ is nullptr, then this is equal to GetSparseClassID().
119   int GetCompactClassID() const;
120   // Returns the index of the current sample in sparse charset space, so
121   // in a 2-class problem between x and y, the returned indices will all be
122   // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
123   // with a shape_table_.
124   int GetSparseClassID() const;
125   // Moves on to the next indexable sample. If the end is reached, leaves
126   // the state such that AtEnd() is true.
127   void Next();
128 
129   // Returns the size of the compact charset space.
130   int CompactCharsetSize() const;
131   // Returns the size of the sparse charset space.
132   int SparseCharsetSize() const;
133 
charset_map()134   const IndexMapBiDi &charset_map() const {
135     return *charset_map_;
136   }
shape_table()137   const ShapeTable *shape_table() const {
138     return shape_table_;
139   }
140   // Sample set operations.
sample_set()141   const TrainingSampleSet *sample_set() const {
142     return sample_set_;
143   }
144 
145   // A set of functions that do something to all the samples accessed by the
146   // iterator, as it is currently setup.
147 
148   // Apply the supplied feature_space/feature_map transform to all samples
149   // accessed by this iterator.
150   void MapSampleFeatures(const IntFeatureMap &feature_map);
151 
152   // Adjust the weights of all the samples to be uniform in the given charset.
153   // Returns the number of samples in the iterator.
154   int UniformSamples();
155 
156   // Normalize the weights of all the samples defined by the iterator so they
157   // sum to 1. Returns the minimum assigned sample weight.
158   double NormalizeSamples();
159 
160 private:
161   // Helper returns the current UnicharAndFont shape_entry.
162   const UnicharAndFonts *GetShapeEntry() const;
163 
164   // Map to subset the actual charset space.
165   const IndexMapBiDi *charset_map_;
166   // Shape table to recombine character classes into shapes
167   const ShapeTable *shape_table_;
168   // The samples to iterate over.
169   TrainingSampleSet *sample_set_;
170   // Flag to control randomizing the sample features.
171   bool randomize_;
172   // Shape table owned by this used to iterate character classes.
173   ShapeTable *owned_shape_table_;
174 
175   // Top-level iteration. Shape index in sparse charset_map space.
176   int shape_index_;
177   int num_shapes_;
178   // Index to the character class within a shape.
179   int shape_char_index_;
180   int num_shape_chars_;
181   // Index to the font within a shape/class pair.
182   int shape_font_index_;
183   int num_shape_fonts_;
184   // The lowest level iteration. sample_index_/num_samples_ counts samples
185   // in the current shape/class/font combination.
186   int sample_index_;
187   int num_samples_;
188 };
189 
190 } // namespace tesseract.
191 
192 #endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
193