1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <algorithm>
21 #include <array>
22 #include <bitset>
23 #include <cassert>
24 #include <cstdint>
25 #include <cstring>
26 #include <memory>
27 #include <string>
28 #include <utility>
29 
30 #include "arrow/buffer.h"
31 #include "arrow/util/bit_util.h"
32 #include "arrow/util/compare.h"
33 #include "arrow/util/endian.h"
34 #include "arrow/util/functional.h"
35 #include "arrow/util/string_builder.h"
36 #include "arrow/util/string_view.h"
37 #include "arrow/util/visibility.h"
38 
39 namespace arrow {
40 
41 class BooleanArray;
42 
43 namespace internal {
44 
45 class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
46                             public util::EqualityComparable<Bitmap> {
47  public:
48   template <typename Word>
49   using View = util::basic_string_view<Word>;
50 
51   Bitmap() = default;
52 
Bitmap(std::shared_ptr<Buffer> buffer,int64_t offset,int64_t length)53   Bitmap(std::shared_ptr<Buffer> buffer, int64_t offset, int64_t length)
54       : buffer_(std::move(buffer)), offset_(offset), length_(length) {}
55 
Bitmap(const void * data,int64_t offset,int64_t length)56   Bitmap(const void* data, int64_t offset, int64_t length)
57       : buffer_(std::make_shared<Buffer>(static_cast<const uint8_t*>(data),
58                                          BitUtil::BytesForBits(offset + length))),
59         offset_(offset),
60         length_(length) {}
61 
Bitmap(void * data,int64_t offset,int64_t length)62   Bitmap(void* data, int64_t offset, int64_t length)
63       : buffer_(std::make_shared<MutableBuffer>(static_cast<uint8_t*>(data),
64                                                 BitUtil::BytesForBits(offset + length))),
65         offset_(offset),
66         length_(length) {}
67 
Slice(int64_t offset)68   Bitmap Slice(int64_t offset) const {
69     return Bitmap(buffer_, offset_ + offset, length_ - offset);
70   }
71 
Slice(int64_t offset,int64_t length)72   Bitmap Slice(int64_t offset, int64_t length) const {
73     return Bitmap(buffer_, offset_ + offset, length);
74   }
75 
76   std::string ToString() const;
77 
78   bool Equals(const Bitmap& other) const;
79 
80   std::string Diff(const Bitmap& other) const;
81 
GetBit(int64_t i)82   bool GetBit(int64_t i) const { return BitUtil::GetBit(buffer_->data(), i + offset_); }
83 
84   bool operator[](int64_t i) const { return GetBit(i); }
85 
SetBitTo(int64_t i,bool v)86   void SetBitTo(int64_t i, bool v) const {
87     BitUtil::SetBitTo(buffer_->mutable_data(), i + offset_, v);
88   }
89 
SetBitsTo(bool v)90   void SetBitsTo(bool v) {
91     BitUtil::SetBitsTo(buffer_->mutable_data(), offset_, length_, v);
92   }
93 
94   void CopyFrom(const Bitmap& other);
95   void CopyFromInverted(const Bitmap& other);
96 
97   /// \brief Visit bits from each bitmap as bitset<N>
98   ///
99   /// All bitmaps must have identical length.
100   template <size_t N, typename Visitor>
VisitBits(const Bitmap (& bitmaps)[N],Visitor && visitor)101   static void VisitBits(const Bitmap (&bitmaps)[N], Visitor&& visitor) {
102     int64_t bit_length = BitLength(bitmaps, N);
103     std::bitset<N> bits;
104     for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
105       for (size_t i = 0; i < N; ++i) {
106         bits[i] = bitmaps[i].GetBit(bit_i);
107       }
108       visitor(bits);
109     }
110   }
111 
112   /// \brief Visit words of bits from each bitmap as array<Word, N>
113   ///
114   /// All bitmaps must have identical length. The first bit in a visited bitmap
115   /// may be offset within the first visited word, but words will otherwise contain
116   /// densely packed bits loaded from the bitmap. That offset within the first word is
117   /// returned.
118   ///
119   /// TODO(bkietz) allow for early termination
120   // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
121   // It also has a large prolog / epilog overhead and should be used
122   // carefully in other cases.
123   // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
124   // and BitmapUInt64Reader.
125   template <size_t N, typename Visitor,
126             typename Word = typename std::decay<
127                 internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
VisitWords(const Bitmap (& bitmaps_arg)[N],Visitor && visitor)128   static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) {
129     constexpr int64_t kBitWidth = sizeof(Word) * 8;
130 
131     // local, mutable variables which will be sliced/decremented to represent consumption:
132     Bitmap bitmaps[N];
133     int64_t offsets[N];
134     int64_t bit_length = BitLength(bitmaps_arg, N);
135     View<Word> words[N];
136     for (size_t i = 0; i < N; ++i) {
137       bitmaps[i] = bitmaps_arg[i];
138       offsets[i] = bitmaps[i].template word_offset<Word>();
139       assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
140       words[i] = bitmaps[i].template words<Word>();
141     }
142 
143     auto consume = [&](int64_t consumed_bits) {
144       for (size_t i = 0; i < N; ++i) {
145         bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
146         offsets[i] = bitmaps[i].template word_offset<Word>();
147         assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
148         words[i] = bitmaps[i].template words<Word>();
149       }
150       bit_length -= consumed_bits;
151     };
152 
153     std::array<Word, N> visited_words;
154     visited_words.fill(0);
155 
156     if (bit_length <= kBitWidth * 2) {
157       // bitmaps fit into one or two words so don't bother with optimization
158       while (bit_length > 0) {
159         auto leading_bits = std::min(bit_length, kBitWidth);
160         SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
161         visitor(visited_words);
162         consume(leading_bits);
163       }
164       return 0;
165     }
166 
167     int64_t max_offset = *std::max_element(offsets, offsets + N);
168     int64_t min_offset = *std::min_element(offsets, offsets + N);
169     if (max_offset > 0) {
170       // consume leading bits
171       auto leading_bits = kBitWidth - min_offset;
172       SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
173       visitor(visited_words);
174       consume(leading_bits);
175     }
176     assert(*std::min_element(offsets, offsets + N) == 0);
177 
178     int64_t whole_word_count = bit_length / kBitWidth;
179     assert(whole_word_count >= 1);
180 
181     if (min_offset == max_offset) {
182       // all offsets were identical, all leading bits have been consumed
183       assert(
184           std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; }));
185 
186       for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
187         for (size_t i = 0; i < N; ++i) {
188           visited_words[i] = words[i][word_i];
189         }
190         visitor(visited_words);
191       }
192       consume(whole_word_count * kBitWidth);
193     } else {
194       // leading bits from potentially incomplete words have been consumed
195 
196       // word_i such that words[i][word_i] and words[i][word_i + 1] are lie entirely
197       // within the bitmap for all i
198       for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
199         for (size_t i = 0; i < N; ++i) {
200           if (offsets[i] == 0) {
201             visited_words[i] = words[i][word_i];
202           } else {
203             auto words0 = BitUtil::ToLittleEndian(words[i][word_i]);
204             auto words1 = BitUtil::ToLittleEndian(words[i][word_i + 1]);
205             visited_words[i] = BitUtil::FromLittleEndian(
206                 (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i])));
207           }
208         }
209         visitor(visited_words);
210       }
211       consume((whole_word_count - 1) * kBitWidth);
212 
213       SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
214 
215       visitor(visited_words);
216       consume(kBitWidth);
217     }
218 
219     // load remaining bits
220     if (bit_length > 0) {
221       SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
222       visitor(visited_words);
223     }
224 
225     return min_offset;
226   }
227 
buffer()228   const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
229 
230   /// offset of first bit relative to buffer().data()
offset()231   int64_t offset() const { return offset_; }
232 
233   /// number of bits in this Bitmap
length()234   int64_t length() const { return length_; }
235 
236   /// string_view of all bytes which contain any bit in this Bitmap
bytes()237   util::bytes_view bytes() const {
238     auto byte_offset = offset_ / 8;
239     auto byte_count = BitUtil::CeilDiv(offset_ + length_, 8) - byte_offset;
240     return util::bytes_view(buffer_->data() + byte_offset, byte_count);
241   }
242 
243  private:
244   /// string_view of all Words which contain any bit in this Bitmap
245   ///
246   /// For example, given Word=uint16_t and a bitmap spanning bits [20, 36)
247   /// words() would span bits [16, 48).
248   ///
249   /// 0       16      32     48     64
250   /// |-------|-------|------|------| (buffer)
251   ///           [       ]             (bitmap)
252   ///         |-------|------|        (returned words)
253   ///
254   /// \warning The words may contain bytes which lie outside the buffer or are
255   /// uninitialized.
256   template <typename Word>
words()257   View<Word> words() const {
258     auto bytes_addr = reinterpret_cast<intptr_t>(bytes().data());
259     auto words_addr = bytes_addr - bytes_addr % sizeof(Word);
260     auto word_byte_count =
261         BitUtil::RoundUpToPowerOf2(static_cast<int64_t>(bytes_addr + bytes().size()),
262                                    static_cast<int64_t>(sizeof(Word))) -
263         words_addr;
264     return View<Word>(reinterpret_cast<const Word*>(words_addr),
265                       word_byte_count / sizeof(Word));
266   }
267 
268   /// offset of first bit relative to words<Word>().data()
269   template <typename Word>
word_offset()270   int64_t word_offset() const {
271     return offset_ + 8 * (reinterpret_cast<intptr_t>(buffer_->data()) -
272                           reinterpret_cast<intptr_t>(words<Word>().data()));
273   }
274 
275   /// load words from bitmaps bitwise
276   template <size_t N, typename Word>
SafeLoadWords(const Bitmap (& bitmaps)[N],int64_t offset,int64_t out_length,bool set_trailing_bits,std::array<Word,N> * out)277   static void SafeLoadWords(const Bitmap (&bitmaps)[N], int64_t offset,
278                             int64_t out_length, bool set_trailing_bits,
279                             std::array<Word, N>* out) {
280     out->fill(0);
281 
282     int64_t out_offset = set_trailing_bits ? sizeof(Word) * 8 - out_length : 0;
283 
284     Bitmap slices[N], out_bitmaps[N];
285     for (size_t i = 0; i < N; ++i) {
286       slices[i] = bitmaps[i].Slice(offset, out_length);
287       out_bitmaps[i] = Bitmap(&out->at(i), out_offset, out_length);
288     }
289 
290     int64_t bit_i = 0;
291     Bitmap::VisitBits(slices, [&](std::bitset<N> bits) {
292       for (size_t i = 0; i < N; ++i) {
293         out_bitmaps[i].SetBitTo(bit_i, bits[i]);
294       }
295       ++bit_i;
296     });
297   }
298 
299   std::shared_ptr<BooleanArray> ToArray() const;
300 
301   /// assert bitmaps have identical length and return that length
302   static int64_t BitLength(const Bitmap* bitmaps, size_t N);
303 
304   std::shared_ptr<Buffer> buffer_;
305   int64_t offset_ = 0, length_ = 0;
306 };
307 
308 }  // namespace internal
309 }  // namespace arrow
310