1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <algorithm> 21 #include <array> 22 #include <bitset> 23 #include <cassert> 24 #include <cstdint> 25 #include <cstring> 26 #include <memory> 27 #include <string> 28 #include <utility> 29 30 #include "arrow/buffer.h" 31 #include "arrow/util/bit_util.h" 32 #include "arrow/util/compare.h" 33 #include "arrow/util/endian.h" 34 #include "arrow/util/functional.h" 35 #include "arrow/util/string_builder.h" 36 #include "arrow/util/string_view.h" 37 #include "arrow/util/visibility.h" 38 39 namespace arrow { 40 41 class BooleanArray; 42 43 namespace internal { 44 45 class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>, 46 public util::EqualityComparable<Bitmap> { 47 public: 48 template <typename Word> 49 using View = util::basic_string_view<Word>; 50 51 Bitmap() = default; 52 Bitmap(std::shared_ptr<Buffer> buffer,int64_t offset,int64_t length)53 Bitmap(std::shared_ptr<Buffer> buffer, int64_t offset, int64_t length) 54 : buffer_(std::move(buffer)), offset_(offset), length_(length) {} 55 Bitmap(const void * data,int64_t offset,int64_t length)56 Bitmap(const void* data, int64_t offset, int64_t length) 57 : buffer_(std::make_shared<Buffer>(static_cast<const uint8_t*>(data), 58 BitUtil::BytesForBits(offset + length))), 59 offset_(offset), 60 length_(length) {} 61 Bitmap(void * data,int64_t offset,int64_t length)62 Bitmap(void* data, int64_t offset, int64_t length) 63 : buffer_(std::make_shared<MutableBuffer>(static_cast<uint8_t*>(data), 64 BitUtil::BytesForBits(offset + length))), 65 offset_(offset), 66 length_(length) {} 67 Slice(int64_t offset)68 Bitmap Slice(int64_t offset) const { 69 return Bitmap(buffer_, offset_ + offset, length_ - offset); 70 } 71 Slice(int64_t offset,int64_t length)72 Bitmap Slice(int64_t offset, int64_t length) const { 73 return Bitmap(buffer_, offset_ + offset, length); 74 } 75 76 std::string ToString() const; 77 78 bool Equals(const Bitmap& other) const; 79 80 std::string Diff(const Bitmap& other) const; 81 GetBit(int64_t i)82 bool GetBit(int64_t i) const { return BitUtil::GetBit(buffer_->data(), i + offset_); } 83 84 bool operator[](int64_t i) const { return GetBit(i); } 85 SetBitTo(int64_t i,bool v)86 void SetBitTo(int64_t i, bool v) const { 87 BitUtil::SetBitTo(buffer_->mutable_data(), i + offset_, v); 88 } 89 SetBitsTo(bool v)90 void SetBitsTo(bool v) { 91 BitUtil::SetBitsTo(buffer_->mutable_data(), offset_, length_, v); 92 } 93 94 void CopyFrom(const Bitmap& other); 95 void CopyFromInverted(const Bitmap& other); 96 97 /// \brief Visit bits from each bitmap as bitset<N> 98 /// 99 /// All bitmaps must have identical length. 100 template <size_t N, typename Visitor> VisitBits(const Bitmap (& bitmaps)[N],Visitor && visitor)101 static void VisitBits(const Bitmap (&bitmaps)[N], Visitor&& visitor) { 102 int64_t bit_length = BitLength(bitmaps, N); 103 std::bitset<N> bits; 104 for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) { 105 for (size_t i = 0; i < N; ++i) { 106 bits[i] = bitmaps[i].GetBit(bit_i); 107 } 108 visitor(bits); 109 } 110 } 111 112 /// \brief Visit words of bits from each bitmap as array<Word, N> 113 /// 114 /// All bitmaps must have identical length. The first bit in a visited bitmap 115 /// may be offset within the first visited word, but words will otherwise contain 116 /// densely packed bits loaded from the bitmap. That offset within the first word is 117 /// returned. 118 /// 119 /// TODO(bkietz) allow for early termination 120 // NOTE: this function is efficient on 3+ sufficiently large bitmaps. 121 // It also has a large prolog / epilog overhead and should be used 122 // carefully in other cases. 123 // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid 124 // and BitmapUInt64Reader. 125 template <size_t N, typename Visitor, 126 typename Word = typename std::decay< 127 internal::call_traits::argument_type<0, Visitor&&>>::type::value_type> VisitWords(const Bitmap (& bitmaps_arg)[N],Visitor && visitor)128 static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) { 129 constexpr int64_t kBitWidth = sizeof(Word) * 8; 130 131 // local, mutable variables which will be sliced/decremented to represent consumption: 132 Bitmap bitmaps[N]; 133 int64_t offsets[N]; 134 int64_t bit_length = BitLength(bitmaps_arg, N); 135 View<Word> words[N]; 136 for (size_t i = 0; i < N; ++i) { 137 bitmaps[i] = bitmaps_arg[i]; 138 offsets[i] = bitmaps[i].template word_offset<Word>(); 139 assert(offsets[i] >= 0 && offsets[i] < kBitWidth); 140 words[i] = bitmaps[i].template words<Word>(); 141 } 142 143 auto consume = [&](int64_t consumed_bits) { 144 for (size_t i = 0; i < N; ++i) { 145 bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits); 146 offsets[i] = bitmaps[i].template word_offset<Word>(); 147 assert(offsets[i] >= 0 && offsets[i] < kBitWidth); 148 words[i] = bitmaps[i].template words<Word>(); 149 } 150 bit_length -= consumed_bits; 151 }; 152 153 std::array<Word, N> visited_words; 154 visited_words.fill(0); 155 156 if (bit_length <= kBitWidth * 2) { 157 // bitmaps fit into one or two words so don't bother with optimization 158 while (bit_length > 0) { 159 auto leading_bits = std::min(bit_length, kBitWidth); 160 SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words); 161 visitor(visited_words); 162 consume(leading_bits); 163 } 164 return 0; 165 } 166 167 int64_t max_offset = *std::max_element(offsets, offsets + N); 168 int64_t min_offset = *std::min_element(offsets, offsets + N); 169 if (max_offset > 0) { 170 // consume leading bits 171 auto leading_bits = kBitWidth - min_offset; 172 SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words); 173 visitor(visited_words); 174 consume(leading_bits); 175 } 176 assert(*std::min_element(offsets, offsets + N) == 0); 177 178 int64_t whole_word_count = bit_length / kBitWidth; 179 assert(whole_word_count >= 1); 180 181 if (min_offset == max_offset) { 182 // all offsets were identical, all leading bits have been consumed 183 assert( 184 std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; })); 185 186 for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) { 187 for (size_t i = 0; i < N; ++i) { 188 visited_words[i] = words[i][word_i]; 189 } 190 visitor(visited_words); 191 } 192 consume(whole_word_count * kBitWidth); 193 } else { 194 // leading bits from potentially incomplete words have been consumed 195 196 // word_i such that words[i][word_i] and words[i][word_i + 1] are lie entirely 197 // within the bitmap for all i 198 for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) { 199 for (size_t i = 0; i < N; ++i) { 200 if (offsets[i] == 0) { 201 visited_words[i] = words[i][word_i]; 202 } else { 203 auto words0 = BitUtil::ToLittleEndian(words[i][word_i]); 204 auto words1 = BitUtil::ToLittleEndian(words[i][word_i + 1]); 205 visited_words[i] = BitUtil::FromLittleEndian( 206 (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i]))); 207 } 208 } 209 visitor(visited_words); 210 } 211 consume((whole_word_count - 1) * kBitWidth); 212 213 SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words); 214 215 visitor(visited_words); 216 consume(kBitWidth); 217 } 218 219 // load remaining bits 220 if (bit_length > 0) { 221 SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words); 222 visitor(visited_words); 223 } 224 225 return min_offset; 226 } 227 buffer()228 const std::shared_ptr<Buffer>& buffer() const { return buffer_; } 229 230 /// offset of first bit relative to buffer().data() offset()231 int64_t offset() const { return offset_; } 232 233 /// number of bits in this Bitmap length()234 int64_t length() const { return length_; } 235 236 /// string_view of all bytes which contain any bit in this Bitmap bytes()237 util::bytes_view bytes() const { 238 auto byte_offset = offset_ / 8; 239 auto byte_count = BitUtil::CeilDiv(offset_ + length_, 8) - byte_offset; 240 return util::bytes_view(buffer_->data() + byte_offset, byte_count); 241 } 242 243 private: 244 /// string_view of all Words which contain any bit in this Bitmap 245 /// 246 /// For example, given Word=uint16_t and a bitmap spanning bits [20, 36) 247 /// words() would span bits [16, 48). 248 /// 249 /// 0 16 32 48 64 250 /// |-------|-------|------|------| (buffer) 251 /// [ ] (bitmap) 252 /// |-------|------| (returned words) 253 /// 254 /// \warning The words may contain bytes which lie outside the buffer or are 255 /// uninitialized. 256 template <typename Word> words()257 View<Word> words() const { 258 auto bytes_addr = reinterpret_cast<intptr_t>(bytes().data()); 259 auto words_addr = bytes_addr - bytes_addr % sizeof(Word); 260 auto word_byte_count = 261 BitUtil::RoundUpToPowerOf2(static_cast<int64_t>(bytes_addr + bytes().size()), 262 static_cast<int64_t>(sizeof(Word))) - 263 words_addr; 264 return View<Word>(reinterpret_cast<const Word*>(words_addr), 265 word_byte_count / sizeof(Word)); 266 } 267 268 /// offset of first bit relative to words<Word>().data() 269 template <typename Word> word_offset()270 int64_t word_offset() const { 271 return offset_ + 8 * (reinterpret_cast<intptr_t>(buffer_->data()) - 272 reinterpret_cast<intptr_t>(words<Word>().data())); 273 } 274 275 /// load words from bitmaps bitwise 276 template <size_t N, typename Word> SafeLoadWords(const Bitmap (& bitmaps)[N],int64_t offset,int64_t out_length,bool set_trailing_bits,std::array<Word,N> * out)277 static void SafeLoadWords(const Bitmap (&bitmaps)[N], int64_t offset, 278 int64_t out_length, bool set_trailing_bits, 279 std::array<Word, N>* out) { 280 out->fill(0); 281 282 int64_t out_offset = set_trailing_bits ? sizeof(Word) * 8 - out_length : 0; 283 284 Bitmap slices[N], out_bitmaps[N]; 285 for (size_t i = 0; i < N; ++i) { 286 slices[i] = bitmaps[i].Slice(offset, out_length); 287 out_bitmaps[i] = Bitmap(&out->at(i), out_offset, out_length); 288 } 289 290 int64_t bit_i = 0; 291 Bitmap::VisitBits(slices, [&](std::bitset<N> bits) { 292 for (size_t i = 0; i < N; ++i) { 293 out_bitmaps[i].SetBitTo(bit_i, bits[i]); 294 } 295 ++bit_i; 296 }); 297 } 298 299 std::shared_ptr<BooleanArray> ToArray() const; 300 301 /// assert bitmaps have identical length and return that length 302 static int64_t BitLength(const Bitmap* bitmaps, size_t N); 303 304 std::shared_ptr<Buffer> buffer_; 305 int64_t offset_ = 0, length_ = 0; 306 }; 307 308 } // namespace internal 309 } // namespace arrow 310