1 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 // 14 // Copyright 2005-2010 Google, Inc. 15 // Author: sorenj@google.com (Jeffrey Sorensen) 16 17 #ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_ 18 #define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_ 19 20 #include <utility> 21 using std::pair; using std::make_pair; 22 #include <vector> 23 using std::vector; 24 25 #include <fst/compat.h> 26 27 // This class is a bitstring storage class with an index that allows 28 // seeking to the Nth set or clear bit in time O(Log(N)) where N is 29 // the length of the bit vector. In addition, it allows counting set or 30 // clear bits over ranges in constant time. 31 // 32 // This is accomplished by maintaining an "secondary" index of limited 33 // size in bits that maintains a running count of the number of bits set 34 // in each block of bitmap data. A block is defined as the number of 35 // uint64 values that can fit in the secondary index before an overflow 36 // occurs. 37 // 38 // To handle overflows, a "primary" index containing a running count of 39 // bits set in each block is created using the type uint64. 40 41 namespace fst { 42 43 class BitmapIndex { 44 public: StorageSize(size_t size)45 static size_t StorageSize(size_t size) { 46 return ((size + kStorageBlockMask) >> kStorageLogBitSize); 47 } 48 BitmapIndex()49 BitmapIndex() : bits_(NULL), size_(0) { } 50 Get(size_t index)51 bool Get(size_t index) const { 52 return (bits_[index >> kStorageLogBitSize] & 53 (kOne << (index & kStorageBlockMask))) != 0; 54 } 55 Set(uint64 * bits,size_t index)56 static void Set(uint64* bits, size_t index) { 57 bits[index >> kStorageLogBitSize] |= (kOne << (index & kStorageBlockMask)); 58 } 59 Clear(uint64 * bits,size_t index)60 static void Clear(uint64* bits, size_t index) { 61 bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask)); 62 } 63 Bits()64 size_t Bits() const { 65 return size_; 66 } 67 ArraySize()68 size_t ArraySize() const { 69 return StorageSize(size_); 70 } 71 72 // Returns the number of one bits in the bitmap GetOnesCount()73 size_t GetOnesCount() const { 74 return primary_index_[primary_index_size() - 1]; 75 } 76 77 // Returns the number of one bits in positions 0 to limit - 1. 78 // REQUIRES: limit <= Bits() 79 size_t Rank1(size_t end) const; 80 81 // Returns the number of one bits in the range start to end - 1. 82 // REQUIRES: limit <= Bits() GetOnesCountInRange(size_t start,size_t end)83 size_t GetOnesCountInRange(size_t start, size_t end) const { 84 return Rank1(end) - Rank1(start); 85 } 86 87 // Returns the number of zero bits in positions 0 to limit - 1. 88 // REQUIRES: limit <= Bits() Rank0(size_t end)89 size_t Rank0(size_t end) const { 90 return end - Rank1(end); 91 } 92 93 // Returns the number of zero bits in the range start to end - 1. 94 // REQUIRES: limit <= Bits() GetZeroesCountInRange(size_t start,size_t end)95 size_t GetZeroesCountInRange(size_t start, size_t end) const { 96 return end - start - GetOnesCountInRange(start, end); 97 } 98 99 // Return true if any bit between begin inclusive and end exclusive 100 // is set. 0 <= begin <= end <= Bits() is required. 101 // TestRange(size_t start,size_t end)102 bool TestRange(size_t start, size_t end) const { 103 return Rank1(end) > Rank1(start); 104 } 105 106 // Returns the offset to the nth set bit (zero based) 107 // or Bits() if index >= number of ones 108 size_t Select1(size_t bit_index) const; 109 110 // Returns the offset to the nth clear bit (zero based) 111 // or Bits() if index > number of 112 size_t Select0(size_t bit_index) const; 113 114 // Returns the offset of the nth and nth+1 clear bit (zero based), 115 // equivalent to two calls to Select0, but more efficient. 116 pair<size_t, size_t> Select0s(size_t bit_index) const; 117 118 // Rebuilds from index for the associated Bitmap, should be called 119 // whenever changes have been made to the Bitmap or else behavior 120 // of the indexed bitmap methods will be undefined. 121 void BuildIndex(const uint64 *bits, size_t size); 122 123 // the secondary index accumulates counts until it can possibly overflow 124 // this constant computes the number of uint64 units that can fit into 125 // units the size of uint16. 126 static const uint64 kOne = 1; 127 static const uint32 kStorageBitSize = 64; 128 static const uint32 kStorageLogBitSize = 6; 129 static const uint32 kSecondaryBlockSize = ((1 << 16) - 1) 130 >> kStorageLogBitSize; 131 132 private: 133 static const uint32 kStorageBlockMask = kStorageBitSize - 1; 134 135 // returns, from the index, the count of ones up to array_index 136 size_t get_index_ones_count(size_t array_index) const; 137 138 // because the indexes, both primary and secondary, contain a running 139 // count of the population of one bits contained in [0,i), there is 140 // no reason to have an element in the zeroth position as this value would 141 // necessarily be zero. (The bits are indexed in a zero based way.) Thus 142 // we don't store the 0th element in either index. Both of the following 143 // functions, if greater than 0, must be decremented by one before retreiving 144 // the value from the corresponding array. 145 // returns the 1 + the block that contains the bitindex in question 146 // the inverted version works the same but looks for zeros using an inverted 147 // view of the index 148 size_t find_primary_block(size_t bit_index) const; 149 150 size_t find_inverted_primary_block(size_t bit_index) const; 151 152 // similarly, the secondary index (which resets its count to zero at 153 // the end of every kSecondaryBlockSize entries) does not store the element 154 // at 0. Note that the rem_bit_index parameter is the number of bits 155 // within the secondary block, after the bits accounted for by the primary 156 // block have been removed (i.e. the remaining bits) And, because we 157 // reset to zero with each new block, there is no need to store those 158 // actual zeros. 159 // returns 1 + the secondary block that contains the bitindex in question 160 size_t find_secondary_block(size_t block, size_t rem_bit_index) const; 161 162 size_t find_inverted_secondary_block(size_t block, size_t rem_bit_index) 163 const; 164 165 // We create a primary index based upon the number of secondary index 166 // blocks. The primary index uses fields wide enough to accomodate any 167 // index of the bitarray so cannot overflow 168 // The primary index is the actual running 169 // count of one bits set for all blocks (and, thus, all uint64s). primary_index_size()170 size_t primary_index_size() const { 171 return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize; 172 } 173 174 const uint64* bits_; 175 size_t size_; 176 177 // The primary index contains the running popcount of all blocks 178 // which means the nth value contains the popcounts of 179 // [0,n*kSecondaryBlockSize], however, the 0th element is omitted. 180 vector<uint32> primary_index_; 181 // The secondary index contains the running popcount of the associated 182 // bitmap. It is the same length (in units of uint16) as the 183 // bitmap's map is in units of uint64s. 184 vector<uint16> secondary_index_; 185 }; 186 187 } // end namespace fst 188 189 #endif // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_ 190