1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <cstdint> 21 #include <limits> 22 #include <memory> 23 #include <utility> 24 #include <vector> 25 26 #include "arrow/array/array_nested.h" 27 #include "arrow/array/builder_base.h" 28 #include "arrow/array/data.h" 29 #include "arrow/buffer.h" 30 #include "arrow/buffer_builder.h" 31 #include "arrow/status.h" 32 #include "arrow/type.h" 33 #include "arrow/util/macros.h" 34 #include "arrow/util/visibility.h" 35 36 namespace arrow { 37 38 // ---------------------------------------------------------------------- 39 // List builder 40 41 template <typename TYPE> 42 class BaseListBuilder : public ArrayBuilder { 43 public: 44 using TypeClass = TYPE; 45 using offset_type = typename TypeClass::offset_type; 46 47 /// Use this constructor to incrementally build the value array along with offsets and 48 /// null bitmap. BaseListBuilder(MemoryPool * pool,std::shared_ptr<ArrayBuilder> const & value_builder,const std::shared_ptr<DataType> & type)49 BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder, 50 const std::shared_ptr<DataType>& type) 51 : ArrayBuilder(pool), 52 offsets_builder_(pool), 53 value_builder_(value_builder), 54 value_field_(type->field(0)->WithType(NULLPTR)) {} 55 BaseListBuilder(MemoryPool * pool,std::shared_ptr<ArrayBuilder> const & value_builder)56 BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder) 57 : BaseListBuilder(pool, value_builder, list(value_builder->type())) {} 58 Resize(int64_t capacity)59 Status Resize(int64_t capacity) override { 60 if (capacity > maximum_elements()) { 61 return Status::CapacityError("List array cannot reserve space for more than ", 62 maximum_elements(), " got ", capacity); 63 } 64 ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); 65 66 // One more than requested for offsets 67 ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); 68 return ArrayBuilder::Resize(capacity); 69 } 70 Reset()71 void Reset() override { 72 ArrayBuilder::Reset(); 73 offsets_builder_.Reset(); 74 value_builder_->Reset(); 75 } 76 77 /// \brief Vector append 78 /// 79 /// If passed, valid_bytes is of equal length to values, and any zero byte 80 /// will be considered as a null for that slot 81 Status AppendValues(const offset_type* offsets, int64_t length, 82 const uint8_t* valid_bytes = NULLPTR) { 83 ARROW_RETURN_NOT_OK(Reserve(length)); 84 UnsafeAppendToBitmap(valid_bytes, length); 85 offsets_builder_.UnsafeAppend(offsets, length); 86 return Status::OK(); 87 } 88 89 /// \brief Start a new variable-length list slot 90 /// 91 /// This function should be called before beginning to append elements to the 92 /// value builder 93 Status Append(bool is_valid = true) { 94 ARROW_RETURN_NOT_OK(Reserve(1)); 95 UnsafeAppendToBitmap(is_valid); 96 return AppendNextOffset(); 97 } 98 AppendNull()99 Status AppendNull() final { return Append(false); } 100 AppendNulls(int64_t length)101 Status AppendNulls(int64_t length) final { 102 ARROW_RETURN_NOT_OK(Reserve(length)); 103 ARROW_RETURN_NOT_OK(ValidateOverflow(0)); 104 UnsafeAppendToBitmap(length, false); 105 const int64_t num_values = value_builder_->length(); 106 for (int64_t i = 0; i < length; ++i) { 107 offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values)); 108 } 109 return Status::OK(); 110 } 111 AppendEmptyValue()112 Status AppendEmptyValue() final { return Append(true); } 113 AppendEmptyValues(int64_t length)114 Status AppendEmptyValues(int64_t length) final { 115 ARROW_RETURN_NOT_OK(Reserve(length)); 116 ARROW_RETURN_NOT_OK(ValidateOverflow(0)); 117 UnsafeAppendToBitmap(length, true); 118 const int64_t num_values = value_builder_->length(); 119 for (int64_t i = 0; i < length; ++i) { 120 offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values)); 121 } 122 return Status::OK(); 123 } 124 AppendArraySlice(const ArrayData & array,int64_t offset,int64_t length)125 Status AppendArraySlice(const ArrayData& array, int64_t offset, 126 int64_t length) override { 127 const offset_type* offsets = array.GetValues<offset_type>(1); 128 const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; 129 for (int64_t row = offset; row < offset + length; row++) { 130 if (!validity || BitUtil::GetBit(validity, array.offset + row)) { 131 ARROW_RETURN_NOT_OK(Append()); 132 int64_t slot_length = offsets[row + 1] - offsets[row]; 133 ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0], 134 offsets[row], slot_length)); 135 } else { 136 ARROW_RETURN_NOT_OK(AppendNull()); 137 } 138 } 139 return Status::OK(); 140 } 141 FinishInternal(std::shared_ptr<ArrayData> * out)142 Status FinishInternal(std::shared_ptr<ArrayData>* out) override { 143 ARROW_RETURN_NOT_OK(AppendNextOffset()); 144 145 // Offset padding zeroed by BufferBuilder 146 std::shared_ptr<Buffer> offsets, null_bitmap; 147 ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); 148 ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); 149 150 if (value_builder_->length() == 0) { 151 // Try to make sure we get a non-null values buffer (ARROW-2744) 152 ARROW_RETURN_NOT_OK(value_builder_->Resize(0)); 153 } 154 155 std::shared_ptr<ArrayData> items; 156 ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items)); 157 158 *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)}, 159 null_count_); 160 Reset(); 161 return Status::OK(); 162 } 163 ValidateOverflow(int64_t new_elements)164 Status ValidateOverflow(int64_t new_elements) const { 165 auto new_length = value_builder_->length() + new_elements; 166 if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { 167 return Status::CapacityError("List array cannot contain more than ", 168 maximum_elements(), " elements, have ", new_elements); 169 } else { 170 return Status::OK(); 171 } 172 } 173 value_builder()174 ArrayBuilder* value_builder() const { return value_builder_.get(); } 175 176 // Cannot make this a static attribute because of linking issues maximum_elements()177 static constexpr int64_t maximum_elements() { 178 return std::numeric_limits<offset_type>::max() - 1; 179 } 180 type()181 std::shared_ptr<DataType> type() const override { 182 return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type())); 183 } 184 185 protected: 186 TypedBufferBuilder<offset_type> offsets_builder_; 187 std::shared_ptr<ArrayBuilder> value_builder_; 188 std::shared_ptr<Field> value_field_; 189 AppendNextOffset()190 Status AppendNextOffset() { 191 ARROW_RETURN_NOT_OK(ValidateOverflow(0)); 192 const int64_t num_values = value_builder_->length(); 193 return offsets_builder_.Append(static_cast<offset_type>(num_values)); 194 } 195 }; 196 197 /// \class ListBuilder 198 /// \brief Builder class for variable-length list array value types 199 /// 200 /// To use this class, you must append values to the child array builder and use 201 /// the Append function to delimit each distinct list value (once the values 202 /// have been appended to the child array) or use the bulk API to append 203 /// a sequence of offsets and null values. 204 /// 205 /// A note on types. Per arrow/type.h all types in the c++ implementation are 206 /// logical so even though this class always builds list array, this can 207 /// represent multiple different logical types. If no logical type is provided 208 /// at construction time, the class defaults to List<T> where t is taken from the 209 /// value_builder/values that the object is constructed with. 210 class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> { 211 public: 212 using BaseListBuilder::BaseListBuilder; 213 214 /// \cond FALSE 215 using ArrayBuilder::Finish; 216 /// \endcond 217 Finish(std::shared_ptr<ListArray> * out)218 Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); } 219 }; 220 221 /// \class LargeListBuilder 222 /// \brief Builder class for large variable-length list array value types 223 /// 224 /// Like ListBuilder, but to create large list arrays (with 64-bit offsets). 225 class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> { 226 public: 227 using BaseListBuilder::BaseListBuilder; 228 229 /// \cond FALSE 230 using ArrayBuilder::Finish; 231 /// \endcond 232 Finish(std::shared_ptr<LargeListArray> * out)233 Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); } 234 }; 235 236 // ---------------------------------------------------------------------- 237 // Map builder 238 239 /// \class MapBuilder 240 /// \brief Builder class for arrays of variable-size maps 241 /// 242 /// To use this class, you must append values to the key and item array builders 243 /// and use the Append function to delimit each distinct map (once the keys and items 244 /// have been appended) or use the bulk API to append a sequence of offsets and null 245 /// maps. 246 /// 247 /// Key uniqueness and ordering are not validated. 248 class ARROW_EXPORT MapBuilder : public ArrayBuilder { 249 public: 250 /// Use this constructor to define the built array's type explicitly. If key_builder 251 /// or item_builder has indeterminate type, this builder will also. 252 MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder, 253 const std::shared_ptr<ArrayBuilder>& item_builder, 254 const std::shared_ptr<DataType>& type); 255 256 /// Use this constructor to infer the built array's type. If key_builder or 257 /// item_builder has indeterminate type, this builder will also. 258 MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder, 259 const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false); 260 261 MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder, 262 const std::shared_ptr<DataType>& type); 263 264 Status Resize(int64_t capacity) override; 265 void Reset() override; 266 Status FinishInternal(std::shared_ptr<ArrayData>* out) override; 267 268 /// \cond FALSE 269 using ArrayBuilder::Finish; 270 /// \endcond 271 Finish(std::shared_ptr<MapArray> * out)272 Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); } 273 274 /// \brief Vector append 275 /// 276 /// If passed, valid_bytes is of equal length to values, and any zero byte 277 /// will be considered as a null for that slot 278 Status AppendValues(const int32_t* offsets, int64_t length, 279 const uint8_t* valid_bytes = NULLPTR); 280 281 /// \brief Start a new variable-length map slot 282 /// 283 /// This function should be called before beginning to append elements to the 284 /// key and item builders 285 Status Append(); 286 287 Status AppendNull() final; 288 289 Status AppendNulls(int64_t length) final; 290 291 Status AppendEmptyValue() final; 292 293 Status AppendEmptyValues(int64_t length) final; 294 AppendArraySlice(const ArrayData & array,int64_t offset,int64_t length)295 Status AppendArraySlice(const ArrayData& array, int64_t offset, 296 int64_t length) override { 297 const int32_t* offsets = array.GetValues<int32_t>(1); 298 const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; 299 for (int64_t row = offset; row < offset + length; row++) { 300 if (!validity || BitUtil::GetBit(validity, array.offset + row)) { 301 ARROW_RETURN_NOT_OK(Append()); 302 const int64_t slot_length = offsets[row + 1] - offsets[row]; 303 ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice( 304 *array.child_data[0]->child_data[0], offsets[row], slot_length)); 305 ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice( 306 *array.child_data[0]->child_data[1], offsets[row], slot_length)); 307 } else { 308 ARROW_RETURN_NOT_OK(AppendNull()); 309 } 310 } 311 return Status::OK(); 312 } 313 314 /// \brief Get builder to append keys. 315 /// 316 /// Append a key with this builder should be followed by appending 317 /// an item or null value with item_builder(). key_builder()318 ArrayBuilder* key_builder() const { return key_builder_.get(); } 319 320 /// \brief Get builder to append items 321 /// 322 /// Appending an item with this builder should have been preceded 323 /// by appending a key with key_builder(). item_builder()324 ArrayBuilder* item_builder() const { return item_builder_.get(); } 325 326 /// \brief Get builder to add Map entries as struct values. 327 /// 328 /// This is used instead of key_builder()/item_builder() and allows 329 /// the Map to be built as a list of struct values. value_builder()330 ArrayBuilder* value_builder() const { return list_builder_->value_builder(); } 331 type()332 std::shared_ptr<DataType> type() const override { 333 return map(key_builder_->type(), item_builder_->type(), keys_sorted_); 334 } 335 ValidateOverflow(int64_t new_elements)336 Status ValidateOverflow(int64_t new_elements) { 337 return list_builder_->ValidateOverflow(new_elements); 338 } 339 340 protected: 341 inline Status AdjustStructBuilderLength(); 342 343 protected: 344 bool keys_sorted_ = false; 345 std::shared_ptr<ListBuilder> list_builder_; 346 std::shared_ptr<ArrayBuilder> key_builder_; 347 std::shared_ptr<ArrayBuilder> item_builder_; 348 }; 349 350 // ---------------------------------------------------------------------- 351 // FixedSizeList builder 352 353 /// \class FixedSizeListBuilder 354 /// \brief Builder class for fixed-length list array value types 355 class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { 356 public: 357 /// Use this constructor to define the built array's type explicitly. If value_builder 358 /// has indeterminate type, this builder will also. 359 FixedSizeListBuilder(MemoryPool* pool, 360 std::shared_ptr<ArrayBuilder> const& value_builder, 361 int32_t list_size); 362 363 /// Use this constructor to infer the built array's type. If value_builder has 364 /// indeterminate type, this builder will also. 365 FixedSizeListBuilder(MemoryPool* pool, 366 std::shared_ptr<ArrayBuilder> const& value_builder, 367 const std::shared_ptr<DataType>& type); 368 369 Status Resize(int64_t capacity) override; 370 void Reset() override; 371 Status FinishInternal(std::shared_ptr<ArrayData>* out) override; 372 373 /// \cond FALSE 374 using ArrayBuilder::Finish; 375 /// \endcond 376 Finish(std::shared_ptr<FixedSizeListArray> * out)377 Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); } 378 379 /// \brief Append a valid fixed length list. 380 /// 381 /// This function affects only the validity bitmap; the child values must be appended 382 /// using the child array builder. 383 Status Append(); 384 385 /// \brief Vector append 386 /// 387 /// If passed, valid_bytes wil be read and any zero byte 388 /// will cause the corresponding slot to be null 389 /// 390 /// This function affects only the validity bitmap; the child values must be appended 391 /// using the child array builder. This includes appending nulls for null lists. 392 /// XXX this restriction is confusing, should this method be omitted? 393 Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR); 394 395 /// \brief Append a null fixed length list. 396 /// 397 /// The child array builder will have the appropriate number of nulls appended 398 /// automatically. 399 Status AppendNull() final; 400 401 /// \brief Append length null fixed length lists. 402 /// 403 /// The child array builder will have the appropriate number of nulls appended 404 /// automatically. 405 Status AppendNulls(int64_t length) final; 406 407 Status ValidateOverflow(int64_t new_elements); 408 409 Status AppendEmptyValue() final; 410 411 Status AppendEmptyValues(int64_t length) final; 412 AppendArraySlice(const ArrayData & array,int64_t offset,int64_t length)413 Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { 414 const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; 415 for (int64_t row = offset; row < offset + length; row++) { 416 if (!validity || BitUtil::GetBit(validity, array.offset + row)) { 417 ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice( 418 *array.child_data[0], list_size_ * (array.offset + row), list_size_)); 419 ARROW_RETURN_NOT_OK(Append()); 420 } else { 421 ARROW_RETURN_NOT_OK(AppendNull()); 422 } 423 } 424 return Status::OK(); 425 } 426 value_builder()427 ArrayBuilder* value_builder() const { return value_builder_.get(); } 428 type()429 std::shared_ptr<DataType> type() const override { 430 return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_); 431 } 432 433 // Cannot make this a static attribute because of linking issues maximum_elements()434 static constexpr int64_t maximum_elements() { 435 return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1; 436 } 437 438 protected: 439 std::shared_ptr<Field> value_field_; 440 const int32_t list_size_; 441 std::shared_ptr<ArrayBuilder> value_builder_; 442 }; 443 444 // ---------------------------------------------------------------------- 445 // Struct 446 447 // --------------------------------------------------------------------------------- 448 // StructArray builder 449 /// Append, Resize and Reserve methods are acting on StructBuilder. 450 /// Please make sure all these methods of all child-builders' are consistently 451 /// called to maintain data-structure consistency. 452 class ARROW_EXPORT StructBuilder : public ArrayBuilder { 453 public: 454 /// If any of field_builders has indeterminate type, this builder will also 455 StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool, 456 std::vector<std::shared_ptr<ArrayBuilder>> field_builders); 457 458 Status FinishInternal(std::shared_ptr<ArrayData>* out) override; 459 460 /// \cond FALSE 461 using ArrayBuilder::Finish; 462 /// \endcond 463 Finish(std::shared_ptr<StructArray> * out)464 Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); } 465 466 /// Null bitmap is of equal length to every child field, and any zero byte 467 /// will be considered as a null for that field, but users must using app- 468 /// end methods or advance methods of the child builders' independently to 469 /// insert data. AppendValues(int64_t length,const uint8_t * valid_bytes)470 Status AppendValues(int64_t length, const uint8_t* valid_bytes) { 471 ARROW_RETURN_NOT_OK(Reserve(length)); 472 UnsafeAppendToBitmap(valid_bytes, length); 473 return Status::OK(); 474 } 475 476 /// Append an element to the Struct. All child-builders' Append method must 477 /// be called independently to maintain data-structure consistency. 478 Status Append(bool is_valid = true) { 479 ARROW_RETURN_NOT_OK(Reserve(1)); 480 UnsafeAppendToBitmap(is_valid); 481 return Status::OK(); 482 } 483 484 /// \brief Append a null value. Automatically appends an empty value to each child 485 /// builder. AppendNull()486 Status AppendNull() final { 487 for (const auto& field : children_) { 488 ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); 489 } 490 return Append(false); 491 } 492 493 /// \brief Append multiple null values. Automatically appends empty values to each 494 /// child builder. AppendNulls(int64_t length)495 Status AppendNulls(int64_t length) final { 496 for (const auto& field : children_) { 497 ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); 498 } 499 ARROW_RETURN_NOT_OK(Reserve(length)); 500 UnsafeAppendToBitmap(length, false); 501 return Status::OK(); 502 } 503 AppendEmptyValue()504 Status AppendEmptyValue() final { 505 for (const auto& field : children_) { 506 ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); 507 } 508 return Append(true); 509 } 510 AppendEmptyValues(int64_t length)511 Status AppendEmptyValues(int64_t length) final { 512 for (const auto& field : children_) { 513 ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); 514 } 515 ARROW_RETURN_NOT_OK(Reserve(length)); 516 UnsafeAppendToBitmap(length, true); 517 return Status::OK(); 518 } 519 AppendArraySlice(const ArrayData & array,int64_t offset,int64_t length)520 Status AppendArraySlice(const ArrayData& array, int64_t offset, 521 int64_t length) override { 522 for (int i = 0; static_cast<size_t>(i) < children_.size(); i++) { 523 ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i], 524 array.offset + offset, length)); 525 } 526 const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; 527 ARROW_RETURN_NOT_OK(Reserve(length)); 528 UnsafeAppendToBitmap(validity, array.offset + offset, length); 529 return Status::OK(); 530 } 531 532 void Reset() override; 533 field_builder(int i)534 ArrayBuilder* field_builder(int i) const { return children_[i].get(); } 535 num_fields()536 int num_fields() const { return static_cast<int>(children_.size()); } 537 538 std::shared_ptr<DataType> type() const override; 539 540 private: 541 std::shared_ptr<DataType> type_; 542 }; 543 544 } // namespace arrow 545