1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <algorithm> // IWYU pragma: keep 21 #include <cstdint> 22 #include <limits> 23 #include <memory> 24 #include <utility> 25 #include <vector> 26 27 #include "arrow/array/array_base.h" 28 #include "arrow/array/array_primitive.h" 29 #include "arrow/buffer.h" 30 #include "arrow/buffer_builder.h" 31 #include "arrow/status.h" 32 #include "arrow/type_fwd.h" 33 #include "arrow/util/macros.h" 34 #include "arrow/util/visibility.h" 35 36 namespace arrow { 37 38 constexpr int64_t kMinBuilderCapacity = 1 << 5; 39 constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1; 40 41 /// Base class for all data array builders. 42 /// 43 /// This class provides a facilities for incrementally building the null bitmap 44 /// (see Append methods) and as a side effect the current number of slots and 45 /// the null count. 46 /// 47 /// \note Users are expected to use builders as one of the concrete types below. 48 /// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. 49 class ARROW_EXPORT ArrayBuilder { 50 public: ArrayBuilder(MemoryPool * pool)51 explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {} 52 53 virtual ~ArrayBuilder() = default; 54 55 /// For nested types. Since the objects are owned by this class instance, we 56 /// skip shared pointers and just return a raw pointer child(int i)57 ArrayBuilder* child(int i) { return children_[i].get(); } 58 child_builder(int i)59 const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; } 60 num_children()61 int num_children() const { return static_cast<int>(children_.size()); } 62 length()63 virtual int64_t length() const { return length_; } null_count()64 int64_t null_count() const { return null_count_; } capacity()65 int64_t capacity() const { return capacity_; } 66 67 /// \brief Ensure that enough memory has been allocated to fit the indicated 68 /// number of total elements in the builder, including any that have already 69 /// been appended. Does not account for reallocations that may be due to 70 /// variable size data, like binary values. To make space for incremental 71 /// appends, use Reserve instead. 72 /// 73 /// \param[in] capacity the minimum number of total array values to 74 /// accommodate. Must be greater than the current capacity. 75 /// \return Status 76 virtual Status Resize(int64_t capacity); 77 78 /// \brief Ensure that there is enough space allocated to append the indicated 79 /// number of elements without any further reallocation. Overallocation is 80 /// used in order to minimize the impact of incremental Reserve() calls. 81 /// Note that additional_capacity is relative to the current number of elements 82 /// rather than to the current capacity, so calls to Reserve() which are not 83 /// interspersed with addition of new elements may not increase the capacity. 84 /// 85 /// \param[in] additional_capacity the number of additional array values 86 /// \return Status Reserve(int64_t additional_capacity)87 Status Reserve(int64_t additional_capacity) { 88 auto current_capacity = capacity(); 89 auto min_capacity = length() + additional_capacity; 90 if (min_capacity <= current_capacity) return Status::OK(); 91 92 // leave growth factor up to BufferBuilder 93 auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity); 94 return Resize(new_capacity); 95 } 96 97 /// Reset the builder. 98 virtual void Reset(); 99 100 /// \brief Append a null value to builder 101 virtual Status AppendNull() = 0; 102 /// \brief Append a number of null values to builder 103 virtual Status AppendNulls(int64_t length) = 0; 104 105 /// \brief Append a non-null value to builder 106 /// 107 /// The appended value is an implementation detail, but the corresponding 108 /// memory slot is guaranteed to be initialized. 109 /// This method is useful when appending a null value to a parent nested type. 110 virtual Status AppendEmptyValue() = 0; 111 112 /// \brief Append a number of non-null values to builder 113 /// 114 /// The appended values are an implementation detail, but the corresponding 115 /// memory slot is guaranteed to be initialized. 116 /// This method is useful when appending null values to a parent nested type. 117 virtual Status AppendEmptyValues(int64_t length) = 0; 118 119 /// For cases where raw data was memcpy'd into the internal buffers, allows us 120 /// to advance the length of the builder. It is your responsibility to use 121 /// this function responsibly. 122 Status Advance(int64_t elements); 123 124 /// \brief Return result of builder as an internal generic ArrayData 125 /// object. Resets builder except for dictionary builder 126 /// 127 /// \param[out] out the finalized ArrayData object 128 /// \return Status 129 virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0; 130 131 /// \brief Return result of builder as an Array object. 132 /// 133 /// The builder is reset except for DictionaryBuilder. 134 /// 135 /// \param[out] out the finalized Array object 136 /// \return Status 137 Status Finish(std::shared_ptr<Array>* out); 138 139 /// \brief Return result of builder as an Array object. 140 /// 141 /// The builder is reset except for DictionaryBuilder. 142 /// 143 /// \return The finalized Array object 144 Result<std::shared_ptr<Array>> Finish(); 145 146 /// \brief Return the type of the built Array 147 virtual std::shared_ptr<DataType> type() const = 0; 148 149 protected: 150 /// Append to null bitmap 151 Status AppendToBitmap(bool is_valid); 152 153 /// Vector append. Treat each zero byte as a null. If valid_bytes is null 154 /// assume all of length bits are valid. 155 Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); 156 157 /// Uniform append. Append N times the same validity bit. 158 Status AppendToBitmap(int64_t num_bits, bool value); 159 160 /// Set the next length bits to not null (i.e. valid). 161 Status SetNotNull(int64_t length); 162 163 // Unsafe operations (don't check capacity/don't resize) 164 UnsafeAppendNull()165 void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } 166 167 // Append to null bitmap, update the length UnsafeAppendToBitmap(bool is_valid)168 void UnsafeAppendToBitmap(bool is_valid) { 169 null_bitmap_builder_.UnsafeAppend(is_valid); 170 ++length_; 171 if (!is_valid) ++null_count_; 172 } 173 174 // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null 175 // assume all of length bits are valid. UnsafeAppendToBitmap(const uint8_t * valid_bytes,int64_t length)176 void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { 177 if (valid_bytes == NULLPTR) { 178 return UnsafeSetNotNull(length); 179 } 180 null_bitmap_builder_.UnsafeAppend(valid_bytes, length); 181 length_ += length; 182 null_count_ = null_bitmap_builder_.false_count(); 183 } 184 185 // Append the same validity value a given number of times. UnsafeAppendToBitmap(const int64_t num_bits,bool value)186 void UnsafeAppendToBitmap(const int64_t num_bits, bool value) { 187 if (value) { 188 UnsafeSetNotNull(num_bits); 189 } else { 190 UnsafeSetNull(num_bits); 191 } 192 } 193 194 void UnsafeAppendToBitmap(const std::vector<bool>& is_valid); 195 196 // Set the next validity bits to not null (i.e. valid). 197 void UnsafeSetNotNull(int64_t length); 198 199 // Set the next validity bits to null (i.e. invalid). 200 void UnsafeSetNull(int64_t length); 201 202 static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); 203 204 /// \brief Finish to an array of the specified ArrayType 205 template <typename ArrayType> FinishTyped(std::shared_ptr<ArrayType> * out)206 Status FinishTyped(std::shared_ptr<ArrayType>* out) { 207 std::shared_ptr<Array> out_untyped; 208 ARROW_RETURN_NOT_OK(Finish(&out_untyped)); 209 *out = std::static_pointer_cast<ArrayType>(std::move(out_untyped)); 210 return Status::OK(); 211 } 212 213 // Check the requested capacity for validity CheckCapacity(int64_t new_capacity)214 Status CheckCapacity(int64_t new_capacity) { 215 if (ARROW_PREDICT_FALSE(new_capacity < 0)) { 216 return Status::Invalid( 217 "Resize capacity must be positive (requested: ", new_capacity, ")"); 218 } 219 220 if (ARROW_PREDICT_FALSE(new_capacity < length_)) { 221 return Status::Invalid("Resize cannot downsize (requested: ", new_capacity, 222 ", current length: ", length_, ")"); 223 } 224 225 return Status::OK(); 226 } 227 228 // Check for array type 229 Status CheckArrayType(const std::shared_ptr<DataType>& expected_type, 230 const Array& array, const char* message); 231 Status CheckArrayType(Type::type expected_type, const Array& array, 232 const char* message); 233 234 MemoryPool* pool_; 235 236 TypedBufferBuilder<bool> null_bitmap_builder_; 237 int64_t null_count_ = 0; 238 239 // Array length, so far. Also, the index of the next element to be added 240 int64_t length_ = 0; 241 int64_t capacity_ = 0; 242 243 // Child value array builders. These are owned by this class 244 std::vector<std::shared_ptr<ArrayBuilder>> children_; 245 246 private: 247 ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); 248 }; 249 250 /// \brief Construct an empty ArrayBuilder corresponding to the data 251 /// type 252 /// \param[in] pool the MemoryPool to use for allocations 253 /// \param[in] type the data type to create the builder for 254 /// \param[out] out the created ArrayBuilder 255 ARROW_EXPORT 256 Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, 257 std::unique_ptr<ArrayBuilder>* out); 258 259 /// \brief Construct an empty DictionaryBuilder initialized optionally 260 /// with a pre-existing dictionary 261 /// \param[in] pool the MemoryPool to use for allocations 262 /// \param[in] type the dictionary type to create the builder for 263 /// \param[in] dictionary the initial dictionary, if any. May be nullptr 264 /// \param[out] out the created ArrayBuilder 265 ARROW_EXPORT 266 Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, 267 const std::shared_ptr<Array>& dictionary, 268 std::unique_ptr<ArrayBuilder>* out); 269 270 } // namespace arrow 271