// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
#define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_

namespace tesseract {

class IndexMapBiDi;
class IntFeatureMap;
class ShapeTable;
class TrainingSample;
class TrainingSampleSet;
struct UnicharAndFonts;

// Iterator class to encapsulate the complex iteration involved in getting
// all samples of all shapes needed for a classification problem.
//
// =====INPUTS TO Init FUNCTION=====
// The charset_map defines a subset of the sample_set classes (with a nullptr
// shape_table, or the shape_table classes if not nullptr.)
//
// The shape_table (if not nullptr) defines the mapping from shapes to
// font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
//
// The sample_set holds the samples and provides indexed access to samples
// of font_id/class_id pairs.
//
// If randomize is true, the samples are perturbed slightly, but the
// perturbation is guaranteed to be the same for multiple identical
// iterations.
//
// =====DIFFERENT COMBINATIONS OF INPUTS=====
// nullptr shape_table:
// Without a shape_table, everything works in UNICHAR_IDs.
//
// nullptr shape_table, nullptr charset_map:
// Iterations simply run over the samples in the order the samples occur in the
// input files.
// GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
//
// nullptr shape_table, non-nullptr charset_map:
// When shape_table is nullptr, the charset_map indexes unichar_ids directly,
// and an iteration returns all samples of all chars in the charset_map, which
// is a subset of the full unicharset.
// The iteration will be in groups of the same unichar_id, in the order
// defined by the charset_map.
// GetCompactClassID returns the charset_map index of a sample, and
// GetSparseClassID returns the sample UNICHAR_ID.
//
// Non-nullptr shape_table:
// With a shape_table, samples are grouped according to the shape_table, so
// multiple UNICHAR_IDs and fonts may be grouped together, and everything
// works in shape_ids.
//
// Non-nullptr shape_table, nullptr charset_map.
// Iterations simply run over the samples in the order of shape_id.
// GetCompactClassID and GetSparseClassID both return the shape_id.
// (If you want the unichar_id or font_id, the sample still has them.)
//
// Non-nullptr shape_table, non-nullptr charset_map.
// When shape_table is not nullptr, the charset_map indexes and subsets shapes
// in the shape_table, and iterations will be in shape_table order, not
// charset_map order.
// GetCompactClassID returns the charset_map index of a shape, and
// GetSparseClassID returns the shape_id.
//
// =====What is SampleIterator good for?=====
// Inside a classifier training module, the SampleIterator has abstracted away
// all the different modes above.
// Use the following iteration to train your classifier:
//   for (it.Begin(); !it.AtEnd(); it.Next()) {
//     const TrainingSample& sample = it.GetSample();
//     int class_id = it.GetCompactClassID();
//     ...
//   }
// Your classifier may or may not be dealing with a shape_table, and may be
// dealing with some subset of the character/shape set. It doesn't need to
// know and shouldn't care. It is just learning shapes with compact class ids
// in the range [0, it.CompactCharsetSize()).
//
// NOTE(review): the class declares a destructor and holds owned_shape_table_,
// which is documented below as owned, yet copy construction/assignment are
// left implicit. If the destructor frees that table, copying an iterator would
// leave two objects sharing one owned pointer - confirm against the .cpp
// before copying instances.
class SampleIterator {
public:
  SampleIterator();
  ~SampleIterator();

  // NOTE(review): presumably resets the iterator and releases
  // owned_shape_table_; the definition is not in this header - confirm.
  void Clear();

  // See class comment for arguments.
  void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize,
            TrainingSampleSet *sample_set);

  // Iterator functions designed for use with a simple for loop:
  // for (it.Begin(); !it.AtEnd(); it.Next()) {
  //   const TrainingSample& sample = it.GetSample();
  //   int class_id = it.GetCompactClassID();
  //   ...
  // }
  void Begin();
  bool AtEnd() const;
  const TrainingSample &GetSample() const;
  // Non-const access to a sample.
  // NOTE(review): presumably the same current sample that GetSample()
  // returns - confirm in the implementation.
  TrainingSample *MutableSample() const;
  // Returns the total index (from the original set of samples) of the current
  // sample.
  int GlobalSampleIndex() const;
  // Returns the index of the current sample in compact charset space, so
  // in a 2-class problem between x and y, the returned indices will all be
  // 0 or 1, and have nothing to do with the unichar_ids.
  // If the charset_map_ is nullptr, then this is equal to GetSparseClassID().
  int GetCompactClassID() const;
  // Returns the index of the current sample in sparse charset space, so
  // in a 2-class problem between x and y, the returned indices will all be
  // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
  // with a shape_table_.
  int GetSparseClassID() const;
  // Moves on to the next indexable sample. If the end is reached, leaves
  // the state such that AtEnd() is true.
  void Next();

  // Returns the size of the compact charset space.
  int CompactCharsetSize() const;
  // Returns the size of the sparse charset space.
  int SparseCharsetSize() const;

  // Returns the charset map by reference. charset_map_ is dereferenced
  // unconditionally, so this must only be called when charset_map_ is not
  // nullptr (the comments above allow it to be nullptr).
  const IndexMapBiDi &charset_map() const {
    return *charset_map_;
  }
  // Returns the shape_table_ pointer, which may be nullptr.
  const ShapeTable *shape_table() const {
    return shape_table_;
  }
  // Sample set operations.
  const TrainingSampleSet *sample_set() const {
    return sample_set_;
  }

  // A set of functions that do something to all the samples accessed by the
  // iterator, as it is currently setup.

  // Apply the supplied feature_space/feature_map transform to all samples
  // accessed by this iterator.
  void MapSampleFeatures(const IntFeatureMap &feature_map);

  // Adjust the weights of all the samples to be uniform in the given charset.
  // Returns the number of samples in the iterator.
  int UniformSamples();

  // Normalize the weights of all the samples defined by the iterator so they
  // sum to 1. Returns the minimum assigned sample weight.
  double NormalizeSamples();

private:
  // Helper returns the current UnicharAndFont shape_entry.
  const UnicharAndFonts *GetShapeEntry() const;

  // Map to subset the actual charset space.
  const IndexMapBiDi *charset_map_;
  // Shape table to recombine character classes into shapes
  const ShapeTable *shape_table_;
  // The samples to iterate over.
  TrainingSampleSet *sample_set_;
  // Flag to control randomizing the sample features.
  bool randomize_;
  // Shape table owned by this used to iterate character classes.
  ShapeTable *owned_shape_table_;

  // Top-level iteration. Shape index in sparse charset_map space.
  int shape_index_;
  int num_shapes_;
  // Index to the character class within a shape.
  int shape_char_index_;
  int num_shape_chars_;
  // Index to the font within a shape/class pair.
  int shape_font_index_;
  int num_shape_fonts_;
  // The lowest level iteration. sample_index_/num_samples_ counts samples
  // in the current shape/class/font combination.
  int sample_index_;
  int num_samples_;
};

} // namespace tesseract.

#endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_