1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <algorithm>  // IWYU pragma: keep
21 #include <cstdint>
22 #include <limits>
23 #include <memory>
24 #include <utility>
25 #include <vector>
26 
27 #include "arrow/array/array_base.h"
28 #include "arrow/array/array_primitive.h"
29 #include "arrow/buffer.h"
30 #include "arrow/buffer_builder.h"
31 #include "arrow/status.h"
32 #include "arrow/type_fwd.h"
33 #include "arrow/util/macros.h"
34 #include "arrow/util/visibility.h"
35 
36 namespace arrow {
37 
38 constexpr int64_t kMinBuilderCapacity = 1 << 5;
39 constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
40 
41 /// Base class for all data array builders.
42 ///
43 /// This class provides a facilities for incrementally building the null bitmap
44 /// (see Append methods) and as a side effect the current number of slots and
45 /// the null count.
46 ///
47 /// \note Users are expected to use builders as one of the concrete types below.
48 /// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
49 class ARROW_EXPORT ArrayBuilder {
50  public:
ArrayBuilder(MemoryPool * pool)51   explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}
52 
53   virtual ~ArrayBuilder() = default;
54 
55   /// For nested types. Since the objects are owned by this class instance, we
56   /// skip shared pointers and just return a raw pointer
child(int i)57   ArrayBuilder* child(int i) { return children_[i].get(); }
58 
child_builder(int i)59   const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }
60 
num_children()61   int num_children() const { return static_cast<int>(children_.size()); }
62 
length()63   virtual int64_t length() const { return length_; }
null_count()64   int64_t null_count() const { return null_count_; }
capacity()65   int64_t capacity() const { return capacity_; }
66 
67   /// \brief Ensure that enough memory has been allocated to fit the indicated
68   /// number of total elements in the builder, including any that have already
69   /// been appended. Does not account for reallocations that may be due to
70   /// variable size data, like binary values. To make space for incremental
71   /// appends, use Reserve instead.
72   ///
73   /// \param[in] capacity the minimum number of total array values to
74   ///            accommodate. Must be greater than the current capacity.
75   /// \return Status
76   virtual Status Resize(int64_t capacity);
77 
78   /// \brief Ensure that there is enough space allocated to append the indicated
79   /// number of elements without any further reallocation. Overallocation is
80   /// used in order to minimize the impact of incremental Reserve() calls.
81   /// Note that additional_capacity is relative to the current number of elements
82   /// rather than to the current capacity, so calls to Reserve() which are not
83   /// interspersed with addition of new elements may not increase the capacity.
84   ///
85   /// \param[in] additional_capacity the number of additional array values
86   /// \return Status
Reserve(int64_t additional_capacity)87   Status Reserve(int64_t additional_capacity) {
88     auto current_capacity = capacity();
89     auto min_capacity = length() + additional_capacity;
90     if (min_capacity <= current_capacity) return Status::OK();
91 
92     // leave growth factor up to BufferBuilder
93     auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
94     return Resize(new_capacity);
95   }
96 
97   /// Reset the builder.
98   virtual void Reset();
99 
100   /// \brief Append a null value to builder
101   virtual Status AppendNull() = 0;
102   /// \brief Append a number of null values to builder
103   virtual Status AppendNulls(int64_t length) = 0;
104 
105   /// \brief Append a non-null value to builder
106   ///
107   /// The appended value is an implementation detail, but the corresponding
108   /// memory slot is guaranteed to be initialized.
109   /// This method is useful when appending a null value to a parent nested type.
110   virtual Status AppendEmptyValue() = 0;
111 
112   /// \brief Append a number of non-null values to builder
113   ///
114   /// The appended values are an implementation detail, but the corresponding
115   /// memory slot is guaranteed to be initialized.
116   /// This method is useful when appending null values to a parent nested type.
117   virtual Status AppendEmptyValues(int64_t length) = 0;
118 
119   /// For cases where raw data was memcpy'd into the internal buffers, allows us
120   /// to advance the length of the builder. It is your responsibility to use
121   /// this function responsibly.
122   Status Advance(int64_t elements);
123 
124   /// \brief Return result of builder as an internal generic ArrayData
125   /// object. Resets builder except for dictionary builder
126   ///
127   /// \param[out] out the finalized ArrayData object
128   /// \return Status
129   virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
130 
131   /// \brief Return result of builder as an Array object.
132   ///
133   /// The builder is reset except for DictionaryBuilder.
134   ///
135   /// \param[out] out the finalized Array object
136   /// \return Status
137   Status Finish(std::shared_ptr<Array>* out);
138 
139   /// \brief Return result of builder as an Array object.
140   ///
141   /// The builder is reset except for DictionaryBuilder.
142   ///
143   /// \return The finalized Array object
144   Result<std::shared_ptr<Array>> Finish();
145 
146   /// \brief Return the type of the built Array
147   virtual std::shared_ptr<DataType> type() const = 0;
148 
149  protected:
150   /// Append to null bitmap
151   Status AppendToBitmap(bool is_valid);
152 
153   /// Vector append. Treat each zero byte as a null.   If valid_bytes is null
154   /// assume all of length bits are valid.
155   Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
156 
157   /// Uniform append.  Append N times the same validity bit.
158   Status AppendToBitmap(int64_t num_bits, bool value);
159 
160   /// Set the next length bits to not null (i.e. valid).
161   Status SetNotNull(int64_t length);
162 
163   // Unsafe operations (don't check capacity/don't resize)
164 
UnsafeAppendNull()165   void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
166 
167   // Append to null bitmap, update the length
UnsafeAppendToBitmap(bool is_valid)168   void UnsafeAppendToBitmap(bool is_valid) {
169     null_bitmap_builder_.UnsafeAppend(is_valid);
170     ++length_;
171     if (!is_valid) ++null_count_;
172   }
173 
174   // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
175   // assume all of length bits are valid.
UnsafeAppendToBitmap(const uint8_t * valid_bytes,int64_t length)176   void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
177     if (valid_bytes == NULLPTR) {
178       return UnsafeSetNotNull(length);
179     }
180     null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
181     length_ += length;
182     null_count_ = null_bitmap_builder_.false_count();
183   }
184 
185   // Append the same validity value a given number of times.
UnsafeAppendToBitmap(const int64_t num_bits,bool value)186   void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
187     if (value) {
188       UnsafeSetNotNull(num_bits);
189     } else {
190       UnsafeSetNull(num_bits);
191     }
192   }
193 
194   void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
195 
196   // Set the next validity bits to not null (i.e. valid).
197   void UnsafeSetNotNull(int64_t length);
198 
199   // Set the next validity bits to null (i.e. invalid).
200   void UnsafeSetNull(int64_t length);
201 
202   static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
203 
204   /// \brief Finish to an array of the specified ArrayType
205   template <typename ArrayType>
FinishTyped(std::shared_ptr<ArrayType> * out)206   Status FinishTyped(std::shared_ptr<ArrayType>* out) {
207     std::shared_ptr<Array> out_untyped;
208     ARROW_RETURN_NOT_OK(Finish(&out_untyped));
209     *out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
210     return Status::OK();
211   }
212 
213   // Check the requested capacity for validity
CheckCapacity(int64_t new_capacity)214   Status CheckCapacity(int64_t new_capacity) {
215     if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
216       return Status::Invalid(
217           "Resize capacity must be positive (requested: ", new_capacity, ")");
218     }
219 
220     if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
221       return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
222                              ", current length: ", length_, ")");
223     }
224 
225     return Status::OK();
226   }
227 
228   // Check for array type
229   Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
230                         const Array& array, const char* message);
231   Status CheckArrayType(Type::type expected_type, const Array& array,
232                         const char* message);
233 
234   MemoryPool* pool_;
235 
236   TypedBufferBuilder<bool> null_bitmap_builder_;
237   int64_t null_count_ = 0;
238 
239   // Array length, so far. Also, the index of the next element to be added
240   int64_t length_ = 0;
241   int64_t capacity_ = 0;
242 
243   // Child value array builders. These are owned by this class
244   std::vector<std::shared_ptr<ArrayBuilder>> children_;
245 
246  private:
247   ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
248 };
249 
250 /// \brief Construct an empty ArrayBuilder corresponding to the data
251 /// type
252 /// \param[in] pool the MemoryPool to use for allocations
253 /// \param[in] type the data type to create the builder for
254 /// \param[out] out the created ArrayBuilder
255 ARROW_EXPORT
256 Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
257                    std::unique_ptr<ArrayBuilder>* out);
258 
259 /// \brief Construct an empty DictionaryBuilder initialized optionally
260 /// with a pre-existing dictionary
261 /// \param[in] pool the MemoryPool to use for allocations
262 /// \param[in] type the dictionary type to create the builder for
263 /// \param[in] dictionary the initial dictionary, if any. May be nullptr
264 /// \param[out] out the created ArrayBuilder
265 ARROW_EXPORT
266 Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
267                              const std::shared_ptr<Array>& dictionary,
268                              std::unique_ptr<ArrayBuilder>* out);
269 
270 }  // namespace arrow
271