1 
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 //
6 //     http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14 // Copyright 2005-2010 Google, Inc.
15 // Author: sorenj@google.com (Jeffrey Sorensen)
16 
17 #ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
18 #define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
19 
20 #include <utility>
21 using std::pair; using std::make_pair;
22 #include <vector>
23 using std::vector;
24 
25 #include <fst/compat.h>
26 
27 // This class is a bitstring storage class with an index that allows
28 // seeking to the Nth set or clear bit in time O(Log(N)) where N is
29 // the length of the bit vector.  In addition, it allows counting set or
30 // clear bits over ranges in constant time.
31 //
32 // This is accomplished by maintaining an "secondary" index of limited
33 // size in bits that maintains a running count of the number of bits set
34 // in each block of bitmap data.  A block is defined as the number of
35 // uint64 values that can fit in the secondary index before an overflow
36 // occurs.
37 //
38 // To handle overflows, a "primary" index containing a running count of
39 // bits set in each block is created using the type uint64.
40 
41 namespace fst {
42 
43 class BitmapIndex {
44  public:
StorageSize(size_t size)45   static size_t StorageSize(size_t size) {
46     return ((size + kStorageBlockMask) >> kStorageLogBitSize);
47   }
48 
BitmapIndex()49   BitmapIndex() : bits_(NULL), size_(0) { }
50 
Get(size_t index)51   bool Get(size_t index) const {
52     return (bits_[index >> kStorageLogBitSize] &
53             (kOne << (index & kStorageBlockMask))) != 0;
54   }
55 
Set(uint64 * bits,size_t index)56   static void Set(uint64* bits, size_t index) {
57     bits[index >> kStorageLogBitSize] |= (kOne << (index & kStorageBlockMask));
58   }
59 
Clear(uint64 * bits,size_t index)60   static void Clear(uint64* bits, size_t index) {
61     bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask));
62   }
63 
Bits()64   size_t Bits() const {
65     return size_;
66   }
67 
ArraySize()68   size_t ArraySize() const {
69     return StorageSize(size_);
70   }
71 
72   // Returns the number of one bits in the bitmap
GetOnesCount()73   size_t GetOnesCount() const {
74     return primary_index_[primary_index_size() - 1];
75   }
76 
77   // Returns the number of one bits in positions 0 to limit - 1.
78   // REQUIRES: limit <= Bits()
79   size_t Rank1(size_t end) const;
80 
81   // Returns the number of one bits in the range start to end - 1.
82   // REQUIRES: limit <= Bits()
GetOnesCountInRange(size_t start,size_t end)83   size_t GetOnesCountInRange(size_t start, size_t end) const {
84     return Rank1(end) - Rank1(start);
85   }
86 
87   // Returns the number of zero bits in positions 0 to limit - 1.
88   // REQUIRES: limit <= Bits()
Rank0(size_t end)89   size_t Rank0(size_t end) const {
90     return end - Rank1(end);
91   }
92 
93   // Returns the number of zero bits in the range start to end - 1.
94   // REQUIRES: limit <= Bits()
GetZeroesCountInRange(size_t start,size_t end)95   size_t GetZeroesCountInRange(size_t start, size_t end) const {
96     return end - start - GetOnesCountInRange(start, end);
97   }
98 
99   // Return true if any bit between begin inclusive and end exclusive
100   // is set.  0 <= begin <= end <= Bits() is required.
101   //
TestRange(size_t start,size_t end)102   bool TestRange(size_t start, size_t end) const {
103     return Rank1(end) > Rank1(start);
104   }
105 
106   // Returns the offset to the nth set bit (zero based)
107   // or Bits() if index >= number of ones
108   size_t Select1(size_t bit_index) const;
109 
110   // Returns the offset to the nth clear bit (zero based)
111   // or Bits() if index > number of
112   size_t Select0(size_t bit_index) const;
113 
114   // Returns the offset of the nth and nth+1 clear bit (zero based),
115   // equivalent to two calls to Select0, but more efficient.
116   pair<size_t, size_t> Select0s(size_t bit_index) const;
117 
118   // Rebuilds from index for the associated Bitmap, should be called
119   // whenever changes have been made to the Bitmap or else behavior
120   // of the indexed bitmap methods will be undefined.
121   void BuildIndex(const uint64 *bits, size_t size);
122 
123   // the secondary index accumulates counts until it can possibly overflow
124   // this constant computes the number of uint64 units that can fit into
125   // units the size of uint16.
126   static const uint64 kOne = 1;
127   static const uint32 kStorageBitSize = 64;
128   static const uint32 kStorageLogBitSize = 6;
129   static const uint32 kSecondaryBlockSize = ((1 << 16) - 1)
130       >> kStorageLogBitSize;
131 
132  private:
133   static const uint32 kStorageBlockMask = kStorageBitSize - 1;
134 
135   // returns, from the index, the count of ones up to array_index
136   size_t get_index_ones_count(size_t array_index) const;
137 
138   // because the indexes, both primary and secondary, contain a running
139   // count of the population of one bits contained in [0,i), there is
140   // no reason to have an element in the zeroth position as this value would
141   // necessarily be zero.  (The bits are indexed in a zero based way.)  Thus
142   // we don't store the 0th element in either index.  Both of the following
143   // functions, if greater than 0, must be decremented by one before retreiving
144   // the value from the corresponding array.
145   // returns the 1 + the block that contains the bitindex in question
146   // the inverted version works the same but looks for zeros using an inverted
147   // view of the index
148   size_t find_primary_block(size_t bit_index) const;
149 
150   size_t find_inverted_primary_block(size_t bit_index) const;
151 
152   // similarly, the secondary index (which resets its count to zero at
153   // the end of every kSecondaryBlockSize entries) does not store the element
154   // at 0.  Note that the rem_bit_index parameter is the number of bits
155   // within the secondary block, after the bits accounted for by the primary
156   // block have been removed (i.e. the remaining bits)  And, because we
157   // reset to zero with each new block, there is no need to store those
158   // actual zeros.
159   // returns 1 + the secondary block that contains the bitindex in question
160   size_t find_secondary_block(size_t block, size_t rem_bit_index) const;
161 
162   size_t find_inverted_secondary_block(size_t block, size_t rem_bit_index)
163       const;
164 
165   // We create a primary index based upon the number of secondary index
166   // blocks.  The primary index uses fields wide enough to accomodate any
167   // index of the bitarray so cannot overflow
168   // The primary index is the actual running
169   // count of one bits set for all blocks (and, thus, all uint64s).
primary_index_size()170   size_t primary_index_size() const {
171     return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize;
172   }
173 
174   const uint64* bits_;
175   size_t size_;
176 
177   // The primary index contains the running popcount of all blocks
178   // which means the nth value contains the popcounts of
179   // [0,n*kSecondaryBlockSize], however, the 0th element is omitted.
180   vector<uint32> primary_index_;
181   // The secondary index contains the running popcount of the associated
182   // bitmap.  It is the same length (in units of uint16) as the
183   // bitmap's map is in units of uint64s.
184   vector<uint16> secondary_index_;
185 };
186 
187 }  // end namespace fst
188 
189 #endif  // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
190