1 // Copyright 2010 Google Inc. All Rights Reserved. 2 // Author: rays@google.com (Ray Smith) 3 /////////////////////////////////////////////////////////////////////// 4 // File: intfeaturemap.h 5 // Description: Encapsulation of IntFeatureSpace with IndexMapBiDi 6 // to provide a subspace mapping and fast feature lookup. 7 // 8 // Licensed under the Apache License, Version 2.0 (the "License"); 9 // you may not use this file except in compliance with the License. 10 // You may obtain a copy of the License at 11 // http://www.apache.org/licenses/LICENSE-2.0 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 // 18 /////////////////////////////////////////////////////////////////////// 19 20 #ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H_ 21 #define TESSERACT_CLASSIFY_INTFEATUREMAP_H_ 22 23 #include "export.h" 24 #include "indexmapbidi.h" 25 #include "intfeaturespace.h" 26 #include "intproto.h" 27 28 namespace tesseract { 29 30 class SampleIterator; 31 32 // Number of positive and negative offset maps. 33 static const int kNumOffsetMaps = 2; 34 35 // Class to map a feature space defined by INT_FEATURE_STRUCT to a compact 36 // down-sampled subspace of actually used features. 37 // The IntFeatureMap copes with 2 stages of transformation: 38 // The first step is down-sampling (re-quantization) and converting to a 39 // single index value from the 3-D input: 40 // INT_FEATURE_STRUCT <-> index feature (via IntFeatureSpace) and 41 // the second is a feature-space compaction to map only the feature indices 42 // that are actually used. This saves space in classifiers that are built 43 // using the mapped feature space. 44 // index (sparse) feature <-> map (compact) feature via IndexMapBiDi. 45 // Although the transformations are reversible, the inverses are lossy and do 46 // not return the exact input INT_FEATURE_STRUCT, due to the many->one nature 47 // of both transformations. 48 class TESS_COMMON_TRAINING_API IntFeatureMap { 49 public: 50 IntFeatureMap(); 51 ~IntFeatureMap(); 52 53 // Accessors. sparse_size()54 int sparse_size() const { 55 return feature_space_.Size(); 56 } compact_size()57 int compact_size() const { 58 return compact_size_; 59 } feature_space()60 const IntFeatureSpace &feature_space() const { 61 return feature_space_; 62 } feature_map()63 const IndexMapBiDi &feature_map() const { 64 return feature_map_; 65 } 66 67 // Pseudo-accessors. 68 int IndexFeature(const INT_FEATURE_STRUCT &f) const; 69 int MapFeature(const INT_FEATURE_STRUCT &f) const; 70 int MapIndexFeature(int index_feature) const; 71 INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const; 72 INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const; 73 void DeleteMapFeature(int map_feature); 74 bool IsMapFeatureDeleted(int map_feature) const; 75 76 // Copies the given feature_space and uses it as the index feature map 77 // from INT_FEATURE_STRUCT. 78 void Init(const IntFeatureSpace &feature_space); 79 80 // Helper to return an offset index feature. In this context an offset 81 // feature with a dir of +/-1 is a feature of a similar direction, 82 // but shifted perpendicular to the direction of the feature. An offset 83 // feature with a dir of +/-2 is feature at the same position, but rotated 84 // by +/- one [compact] quantum. Returns the index of the generated offset 85 // feature, or -1 if it doesn't exist. Dir should be in 86 // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction. 87 // A dir of 0 is an identity transformation. 88 // Both input and output are from the index(sparse) feature space, not 89 // the mapped/compact feature space, but the offset feature is the minimum 90 // distance moved from the input to guarantee that it maps to the next 91 // available quantum in the mapped/compact space. 92 int OffsetFeature(int index_feature, int dir) const; 93 94 // Computes the features used by the subset of samples defined by 95 // the iterator and sets up the feature mapping. 96 // Returns the size of the compacted feature space. 97 int FindNZFeatureMapping(SampleIterator *it); 98 99 // After deleting some features, finish setting up the mapping, and map 100 // all the samples. Returns the size of the compacted feature space. 101 int FinalizeMapping(SampleIterator *it); 102 103 // Indexes the given array of features to a vector of sorted indices. IndexAndSortFeatures(const INT_FEATURE_STRUCT * features,int num_features,std::vector<int> * sorted_features)104 void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features, 105 std::vector<int> *sorted_features) const { 106 feature_space_.IndexAndSortFeatures(features, num_features, sorted_features); 107 } 108 // Maps the given array of index/sparse features to an array of map/compact 109 // features. 110 // Assumes the input is sorted. The output indices are sorted and uniqued. 111 // Returns the number of "missed" features, being features that 112 // don't map to the compact feature space. MapIndexedFeatures(const std::vector<int> & index_features,std::vector<int> * map_features)113 int MapIndexedFeatures(const std::vector<int> &index_features, 114 std::vector<int> *map_features) const { 115 return feature_map_.MapFeatures(index_features, map_features); 116 } 117 118 // Prints the map features from the set in human-readable form. 119 void DebugMapFeatures(const std::vector<int> &map_features) const; 120 121 private: 122 void Clear(); 123 124 // Helper to compute an offset index feature. In this context an offset 125 // feature with a dir of +/-1 is a feature of a similar direction, 126 // but shifted perpendicular to the direction of the feature. An offset 127 // feature with a dir of +/-2 is feature at the same position, but rotated 128 // by +/- one [compact] quantum. Returns the index of the generated offset 129 // feature, or -1 if it doesn't exist. Dir should be in 130 // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction. 131 // A dir of 0 is an identity transformation. 132 // Both input and output are from the index(sparse) feature space, not 133 // the mapped/compact feature space, but the offset feature is the minimum 134 // distance moved from the input to guarantee that it maps to the next 135 // available quantum in the mapped/compact space. 136 int ComputeOffsetFeature(int index_feature, int dir) const; 137 138 // True if the mapping has changed since it was last finalized. 139 bool mapping_changed_; 140 // Size of the compacted feature space, after unused features are removed. 141 int compact_size_; 142 // Feature space quantization definition and indexing from INT_FEATURE_STRUCT. 143 IntFeatureSpace feature_space_; 144 // Mapping from indexed feature space to the compacted space with unused 145 // features mapping to -1. 146 IndexMapBiDi feature_map_; 147 // Index tables to map a feature index to the corresponding feature after a 148 // shift perpendicular to the feature direction, or a rotation in place. 149 // An entry of -1 indicates that there is no corresponding feature. 150 // Array of arrays of size feature_space_.Size() owned by this class. 151 int *offset_plus_[kNumOffsetMaps]; 152 int *offset_minus_[kNumOffsetMaps]; 153 154 // Don't use default copy and assign! 155 IntFeatureMap(const IntFeatureMap &); 156 void operator=(const IntFeatureMap &); 157 }; 158 159 } // namespace tesseract. 160 161 #endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H_ 162