1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 ///////////////////////////////////////////////////////////////////////
4 // File:        intfeaturemap.h
5 // Description: Encapsulation of IntFeatureSpace with IndexMapBiDi
6 //              to provide a subspace mapping and fast feature lookup.
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H_
21 #define TESSERACT_CLASSIFY_INTFEATUREMAP_H_
22 
23 #include "export.h"
24 #include "indexmapbidi.h"
25 #include "intfeaturespace.h"
26 #include "intproto.h"
27 
28 namespace tesseract {
29 
30 class SampleIterator;
31 
32 // Number of positive and negative offset maps.
33 static const int kNumOffsetMaps = 2;
34 
35 // Class to map a feature space defined by INT_FEATURE_STRUCT to a compact
36 // down-sampled subspace of actually used features.
37 // The IntFeatureMap copes with 2 stages of transformation:
38 // The first step is down-sampling (re-quantization) and converting to a
39 // single index value from the 3-D input:
40 //   INT_FEATURE_STRUCT <-> index feature (via IntFeatureSpace) and
41 // the second is a feature-space compaction to map only the feature indices
42 // that are actually used. This saves space in classifiers that are built
43 // using the mapped feature space.
44 //   index (sparse) feature <-> map (compact) feature via IndexMapBiDi.
45 // Although the transformations are reversible, the inverses are lossy and do
46 // not return the exact input INT_FEATURE_STRUCT, due to the many->one nature
47 // of both transformations.
48 class TESS_COMMON_TRAINING_API IntFeatureMap {
49 public:
50   IntFeatureMap();
51   ~IntFeatureMap();
52 
53   // Accessors.
sparse_size()54   int sparse_size() const {
55     return feature_space_.Size();
56   }
compact_size()57   int compact_size() const {
58     return compact_size_;
59   }
feature_space()60   const IntFeatureSpace &feature_space() const {
61     return feature_space_;
62   }
feature_map()63   const IndexMapBiDi &feature_map() const {
64     return feature_map_;
65   }
66 
67   // Pseudo-accessors.
68   int IndexFeature(const INT_FEATURE_STRUCT &f) const;
69   int MapFeature(const INT_FEATURE_STRUCT &f) const;
70   int MapIndexFeature(int index_feature) const;
71   INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const;
72   INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const;
73   void DeleteMapFeature(int map_feature);
74   bool IsMapFeatureDeleted(int map_feature) const;
75 
76   // Copies the given feature_space and uses it as the index feature map
77   // from INT_FEATURE_STRUCT.
78   void Init(const IntFeatureSpace &feature_space);
79 
80   // Helper to return an offset index feature. In this context an offset
81   // feature with a dir of +/-1 is a feature of a similar direction,
82   // but shifted perpendicular to the direction of the feature. An offset
83   // feature with a dir of +/-2 is feature at the same position, but rotated
84   // by +/- one [compact] quantum. Returns the index of the generated offset
85   // feature, or -1 if it doesn't exist. Dir should be in
86   // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
87   // A dir of 0 is an identity transformation.
88   // Both input and output are from the index(sparse) feature space, not
89   // the mapped/compact feature space, but the offset feature is the minimum
90   // distance moved from the input to guarantee that it maps to the next
91   // available quantum in the mapped/compact space.
92   int OffsetFeature(int index_feature, int dir) const;
93 
94   // Computes the features used by the subset of samples defined by
95   // the iterator and sets up the feature mapping.
96   // Returns the size of the compacted feature space.
97   int FindNZFeatureMapping(SampleIterator *it);
98 
99   // After deleting some features, finish setting up the mapping, and map
100   // all the samples. Returns the size of the compacted feature space.
101   int FinalizeMapping(SampleIterator *it);
102 
103   // Indexes the given array of features to a vector of sorted indices.
IndexAndSortFeatures(const INT_FEATURE_STRUCT * features,int num_features,std::vector<int> * sorted_features)104   void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features,
105                             std::vector<int> *sorted_features) const {
106     feature_space_.IndexAndSortFeatures(features, num_features, sorted_features);
107   }
108   // Maps the given array of index/sparse features to an array of map/compact
109   // features.
110   // Assumes the input is sorted. The output indices are sorted and uniqued.
111   // Returns the number of "missed" features, being features that
112   // don't map to the compact feature space.
MapIndexedFeatures(const std::vector<int> & index_features,std::vector<int> * map_features)113   int MapIndexedFeatures(const std::vector<int> &index_features,
114                          std::vector<int> *map_features) const {
115     return feature_map_.MapFeatures(index_features, map_features);
116   }
117 
118   // Prints the map features from the set in human-readable form.
119   void DebugMapFeatures(const std::vector<int> &map_features) const;
120 
121 private:
122   void Clear();
123 
124   // Helper to compute an offset index feature. In this context an offset
125   // feature with a dir of +/-1 is a feature of a similar direction,
126   // but shifted perpendicular to the direction of the feature. An offset
127   // feature with a dir of +/-2 is feature at the same position, but rotated
128   // by +/- one [compact] quantum. Returns the index of the generated offset
129   // feature, or -1 if it doesn't exist. Dir should be in
130   // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
131   // A dir of 0 is an identity transformation.
132   // Both input and output are from the index(sparse) feature space, not
133   // the mapped/compact feature space, but the offset feature is the minimum
134   // distance moved from the input to guarantee that it maps to the next
135   // available quantum in the mapped/compact space.
136   int ComputeOffsetFeature(int index_feature, int dir) const;
137 
138   // True if the mapping has changed since it was last finalized.
139   bool mapping_changed_;
140   // Size of the compacted feature space, after unused features are removed.
141   int compact_size_;
142   // Feature space quantization definition and indexing from INT_FEATURE_STRUCT.
143   IntFeatureSpace feature_space_;
144   // Mapping from indexed feature space to the compacted space with unused
145   // features mapping to -1.
146   IndexMapBiDi feature_map_;
147   // Index tables to map a feature index to the corresponding feature after a
148   // shift perpendicular to the feature direction, or a rotation in place.
149   // An entry of -1 indicates that there is no corresponding feature.
150   // Array of arrays of size feature_space_.Size() owned by this class.
151   int *offset_plus_[kNumOffsetMaps];
152   int *offset_minus_[kNumOffsetMaps];
153 
154   // Don't use default copy and assign!
155   IntFeatureMap(const IntFeatureMap &);
156   void operator=(const IntFeatureMap &);
157 };
158 
159 } // namespace tesseract.
160 
161 #endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H_
162