1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <cstdint>
21 #include <limits>
22 #include <memory>
23 #include <utility>
24 #include <vector>
25 
26 #include "arrow/array/array_nested.h"
27 #include "arrow/array/builder_base.h"
28 #include "arrow/array/data.h"
29 #include "arrow/buffer.h"
30 #include "arrow/buffer_builder.h"
31 #include "arrow/status.h"
32 #include "arrow/type.h"
33 #include "arrow/util/macros.h"
34 #include "arrow/util/visibility.h"
35 
36 namespace arrow {
37 
38 // ----------------------------------------------------------------------
39 // List builder
40 
41 template <typename TYPE>
42 class BaseListBuilder : public ArrayBuilder {
43  public:
44   using TypeClass = TYPE;
45   using offset_type = typename TypeClass::offset_type;
46 
47   /// Use this constructor to incrementally build the value array along with offsets and
48   /// null bitmap.
BaseListBuilder(MemoryPool * pool,std::shared_ptr<ArrayBuilder> const & value_builder,const std::shared_ptr<DataType> & type)49   BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
50                   const std::shared_ptr<DataType>& type)
51       : ArrayBuilder(pool),
52         offsets_builder_(pool),
53         value_builder_(value_builder),
54         value_field_(type->field(0)->WithType(NULLPTR)) {}
55 
BaseListBuilder(MemoryPool * pool,std::shared_ptr<ArrayBuilder> const & value_builder)56   BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder)
57       : BaseListBuilder(pool, value_builder, list(value_builder->type())) {}
58 
Resize(int64_t capacity)59   Status Resize(int64_t capacity) override {
60     if (capacity > maximum_elements()) {
61       return Status::CapacityError("List array cannot reserve space for more than ",
62                                    maximum_elements(), " got ", capacity);
63     }
64     ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
65 
66     // One more than requested for offsets
67     ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
68     return ArrayBuilder::Resize(capacity);
69   }
70 
Reset()71   void Reset() override {
72     ArrayBuilder::Reset();
73     offsets_builder_.Reset();
74     value_builder_->Reset();
75   }
76 
77   /// \brief Vector append
78   ///
79   /// If passed, valid_bytes is of equal length to values, and any zero byte
80   /// will be considered as a null for that slot
81   Status AppendValues(const offset_type* offsets, int64_t length,
82                       const uint8_t* valid_bytes = NULLPTR) {
83     ARROW_RETURN_NOT_OK(Reserve(length));
84     UnsafeAppendToBitmap(valid_bytes, length);
85     offsets_builder_.UnsafeAppend(offsets, length);
86     return Status::OK();
87   }
88 
89   /// \brief Start a new variable-length list slot
90   ///
91   /// This function should be called before beginning to append elements to the
92   /// value builder
93   Status Append(bool is_valid = true) {
94     ARROW_RETURN_NOT_OK(Reserve(1));
95     UnsafeAppendToBitmap(is_valid);
96     return AppendNextOffset();
97   }
98 
AppendNull()99   Status AppendNull() final { return Append(false); }
100 
AppendNulls(int64_t length)101   Status AppendNulls(int64_t length) final {
102     ARROW_RETURN_NOT_OK(Reserve(length));
103     ARROW_RETURN_NOT_OK(ValidateOverflow(0));
104     UnsafeAppendToBitmap(length, false);
105     const int64_t num_values = value_builder_->length();
106     for (int64_t i = 0; i < length; ++i) {
107       offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
108     }
109     return Status::OK();
110   }
111 
AppendEmptyValue()112   Status AppendEmptyValue() final { return Append(true); }
113 
AppendEmptyValues(int64_t length)114   Status AppendEmptyValues(int64_t length) final {
115     ARROW_RETURN_NOT_OK(Reserve(length));
116     ARROW_RETURN_NOT_OK(ValidateOverflow(0));
117     UnsafeAppendToBitmap(length, true);
118     const int64_t num_values = value_builder_->length();
119     for (int64_t i = 0; i < length; ++i) {
120       offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
121     }
122     return Status::OK();
123   }
124 
AppendArraySlice(const ArrayData & array,int64_t offset,int64_t length)125   Status AppendArraySlice(const ArrayData& array, int64_t offset,
126                           int64_t length) override {
127     const offset_type* offsets = array.GetValues<offset_type>(1);
128     const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
129     for (int64_t row = offset; row < offset + length; row++) {
130       if (!validity || BitUtil::GetBit(validity, array.offset + row)) {
131         ARROW_RETURN_NOT_OK(Append());
132         int64_t slot_length = offsets[row + 1] - offsets[row];
133         ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0],
134                                                              offsets[row], slot_length));
135       } else {
136         ARROW_RETURN_NOT_OK(AppendNull());
137       }
138     }
139     return Status::OK();
140   }
141 
FinishInternal(std::shared_ptr<ArrayData> * out)142   Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
143     ARROW_RETURN_NOT_OK(AppendNextOffset());
144 
145     // Offset padding zeroed by BufferBuilder
146     std::shared_ptr<Buffer> offsets, null_bitmap;
147     ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
148     ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
149 
150     if (value_builder_->length() == 0) {
151       // Try to make sure we get a non-null values buffer (ARROW-2744)
152       ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
153     }
154 
155     std::shared_ptr<ArrayData> items;
156     ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items));
157 
158     *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)},
159                            null_count_);
160     Reset();
161     return Status::OK();
162   }
163 
ValidateOverflow(int64_t new_elements)164   Status ValidateOverflow(int64_t new_elements) const {
165     auto new_length = value_builder_->length() + new_elements;
166     if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
167       return Status::CapacityError("List array cannot contain more than ",
168                                    maximum_elements(), " elements, have ", new_elements);
169     } else {
170       return Status::OK();
171     }
172   }
173 
value_builder()174   ArrayBuilder* value_builder() const { return value_builder_.get(); }
175 
176   // Cannot make this a static attribute because of linking issues
maximum_elements()177   static constexpr int64_t maximum_elements() {
178     return std::numeric_limits<offset_type>::max() - 1;
179   }
180 
type()181   std::shared_ptr<DataType> type() const override {
182     return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
183   }
184 
185  protected:
186   TypedBufferBuilder<offset_type> offsets_builder_;
187   std::shared_ptr<ArrayBuilder> value_builder_;
188   std::shared_ptr<Field> value_field_;
189 
AppendNextOffset()190   Status AppendNextOffset() {
191     ARROW_RETURN_NOT_OK(ValidateOverflow(0));
192     const int64_t num_values = value_builder_->length();
193     return offsets_builder_.Append(static_cast<offset_type>(num_values));
194   }
195 };
196 
197 /// \class ListBuilder
198 /// \brief Builder class for variable-length list array value types
199 ///
200 /// To use this class, you must append values to the child array builder and use
201 /// the Append function to delimit each distinct list value (once the values
202 /// have been appended to the child array) or use the bulk API to append
203 /// a sequence of offsets and null values.
204 ///
205 /// A note on types.  Per arrow/type.h all types in the c++ implementation are
206 /// logical so even though this class always builds list array, this can
207 /// represent multiple different logical types.  If no logical type is provided
208 /// at construction time, the class defaults to List<T> where t is taken from the
209 /// value_builder/values that the object is constructed with.
210 class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
211  public:
212   using BaseListBuilder::BaseListBuilder;
213 
214   /// \cond FALSE
215   using ArrayBuilder::Finish;
216   /// \endcond
217 
Finish(std::shared_ptr<ListArray> * out)218   Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
219 };
220 
221 /// \class LargeListBuilder
222 /// \brief Builder class for large variable-length list array value types
223 ///
224 /// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
225 class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
226  public:
227   using BaseListBuilder::BaseListBuilder;
228 
229   /// \cond FALSE
230   using ArrayBuilder::Finish;
231   /// \endcond
232 
Finish(std::shared_ptr<LargeListArray> * out)233   Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
234 };
235 
236 // ----------------------------------------------------------------------
237 // Map builder
238 
239 /// \class MapBuilder
240 /// \brief Builder class for arrays of variable-size maps
241 ///
242 /// To use this class, you must append values to the key and item array builders
243 /// and use the Append function to delimit each distinct map (once the keys and items
244 /// have been appended) or use the bulk API to append a sequence of offsets and null
245 /// maps.
246 ///
247 /// Key uniqueness and ordering are not validated.
248 class ARROW_EXPORT MapBuilder : public ArrayBuilder {
249  public:
250   /// Use this constructor to define the built array's type explicitly. If key_builder
251   /// or item_builder has indeterminate type, this builder will also.
252   MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
253              const std::shared_ptr<ArrayBuilder>& item_builder,
254              const std::shared_ptr<DataType>& type);
255 
256   /// Use this constructor to infer the built array's type. If key_builder or
257   /// item_builder has indeterminate type, this builder will also.
258   MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
259              const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);
260 
261   MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
262              const std::shared_ptr<DataType>& type);
263 
264   Status Resize(int64_t capacity) override;
265   void Reset() override;
266   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
267 
268   /// \cond FALSE
269   using ArrayBuilder::Finish;
270   /// \endcond
271 
Finish(std::shared_ptr<MapArray> * out)272   Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }
273 
274   /// \brief Vector append
275   ///
276   /// If passed, valid_bytes is of equal length to values, and any zero byte
277   /// will be considered as a null for that slot
278   Status AppendValues(const int32_t* offsets, int64_t length,
279                       const uint8_t* valid_bytes = NULLPTR);
280 
281   /// \brief Start a new variable-length map slot
282   ///
283   /// This function should be called before beginning to append elements to the
284   /// key and item builders
285   Status Append();
286 
287   Status AppendNull() final;
288 
289   Status AppendNulls(int64_t length) final;
290 
291   Status AppendEmptyValue() final;
292 
293   Status AppendEmptyValues(int64_t length) final;
294 
AppendArraySlice(const ArrayData & array,int64_t offset,int64_t length)295   Status AppendArraySlice(const ArrayData& array, int64_t offset,
296                           int64_t length) override {
297     const int32_t* offsets = array.GetValues<int32_t>(1);
298     const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
299     for (int64_t row = offset; row < offset + length; row++) {
300       if (!validity || BitUtil::GetBit(validity, array.offset + row)) {
301         ARROW_RETURN_NOT_OK(Append());
302         const int64_t slot_length = offsets[row + 1] - offsets[row];
303         ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(
304             *array.child_data[0]->child_data[0], offsets[row], slot_length));
305         ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(
306             *array.child_data[0]->child_data[1], offsets[row], slot_length));
307       } else {
308         ARROW_RETURN_NOT_OK(AppendNull());
309       }
310     }
311     return Status::OK();
312   }
313 
314   /// \brief Get builder to append keys.
315   ///
316   /// Append a key with this builder should be followed by appending
317   /// an item or null value with item_builder().
key_builder()318   ArrayBuilder* key_builder() const { return key_builder_.get(); }
319 
320   /// \brief Get builder to append items
321   ///
322   /// Appending an item with this builder should have been preceded
323   /// by appending a key with key_builder().
item_builder()324   ArrayBuilder* item_builder() const { return item_builder_.get(); }
325 
326   /// \brief Get builder to add Map entries as struct values.
327   ///
328   /// This is used instead of key_builder()/item_builder() and allows
329   /// the Map to be built as a list of struct values.
value_builder()330   ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }
331 
type()332   std::shared_ptr<DataType> type() const override {
333     return map(key_builder_->type(), item_builder_->type(), keys_sorted_);
334   }
335 
ValidateOverflow(int64_t new_elements)336   Status ValidateOverflow(int64_t new_elements) {
337     return list_builder_->ValidateOverflow(new_elements);
338   }
339 
340  protected:
341   inline Status AdjustStructBuilderLength();
342 
343  protected:
344   bool keys_sorted_ = false;
345   std::shared_ptr<ListBuilder> list_builder_;
346   std::shared_ptr<ArrayBuilder> key_builder_;
347   std::shared_ptr<ArrayBuilder> item_builder_;
348 };
349 
350 // ----------------------------------------------------------------------
351 // FixedSizeList builder
352 
353 /// \class FixedSizeListBuilder
354 /// \brief Builder class for fixed-length list array value types
355 class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
356  public:
357   /// Use this constructor to define the built array's type explicitly. If value_builder
358   /// has indeterminate type, this builder will also.
359   FixedSizeListBuilder(MemoryPool* pool,
360                        std::shared_ptr<ArrayBuilder> const& value_builder,
361                        int32_t list_size);
362 
363   /// Use this constructor to infer the built array's type. If value_builder has
364   /// indeterminate type, this builder will also.
365   FixedSizeListBuilder(MemoryPool* pool,
366                        std::shared_ptr<ArrayBuilder> const& value_builder,
367                        const std::shared_ptr<DataType>& type);
368 
369   Status Resize(int64_t capacity) override;
370   void Reset() override;
371   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
372 
373   /// \cond FALSE
374   using ArrayBuilder::Finish;
375   /// \endcond
376 
Finish(std::shared_ptr<FixedSizeListArray> * out)377   Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }
378 
379   /// \brief Append a valid fixed length list.
380   ///
381   /// This function affects only the validity bitmap; the child values must be appended
382   /// using the child array builder.
383   Status Append();
384 
385   /// \brief Vector append
386   ///
387   /// If passed, valid_bytes wil be read and any zero byte
388   /// will cause the corresponding slot to be null
389   ///
390   /// This function affects only the validity bitmap; the child values must be appended
391   /// using the child array builder. This includes appending nulls for null lists.
392   /// XXX this restriction is confusing, should this method be omitted?
393   Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);
394 
395   /// \brief Append a null fixed length list.
396   ///
397   /// The child array builder will have the appropriate number of nulls appended
398   /// automatically.
399   Status AppendNull() final;
400 
401   /// \brief Append length null fixed length lists.
402   ///
403   /// The child array builder will have the appropriate number of nulls appended
404   /// automatically.
405   Status AppendNulls(int64_t length) final;
406 
407   Status ValidateOverflow(int64_t new_elements);
408 
409   Status AppendEmptyValue() final;
410 
411   Status AppendEmptyValues(int64_t length) final;
412 
AppendArraySlice(const ArrayData & array,int64_t offset,int64_t length)413   Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
414     const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
415     for (int64_t row = offset; row < offset + length; row++) {
416       if (!validity || BitUtil::GetBit(validity, array.offset + row)) {
417         ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
418             *array.child_data[0], list_size_ * (array.offset + row), list_size_));
419         ARROW_RETURN_NOT_OK(Append());
420       } else {
421         ARROW_RETURN_NOT_OK(AppendNull());
422       }
423     }
424     return Status::OK();
425   }
426 
value_builder()427   ArrayBuilder* value_builder() const { return value_builder_.get(); }
428 
type()429   std::shared_ptr<DataType> type() const override {
430     return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
431   }
432 
433   // Cannot make this a static attribute because of linking issues
maximum_elements()434   static constexpr int64_t maximum_elements() {
435     return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
436   }
437 
438  protected:
439   std::shared_ptr<Field> value_field_;
440   const int32_t list_size_;
441   std::shared_ptr<ArrayBuilder> value_builder_;
442 };
443 
444 // ----------------------------------------------------------------------
445 // Struct
446 
447 // ---------------------------------------------------------------------------------
448 // StructArray builder
449 /// Append, Resize and Reserve methods are acting on StructBuilder.
450 /// Please make sure all these methods of all child-builders' are consistently
451 /// called to maintain data-structure consistency.
452 class ARROW_EXPORT StructBuilder : public ArrayBuilder {
453  public:
454   /// If any of field_builders has indeterminate type, this builder will also
455   StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
456                 std::vector<std::shared_ptr<ArrayBuilder>> field_builders);
457 
458   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
459 
460   /// \cond FALSE
461   using ArrayBuilder::Finish;
462   /// \endcond
463 
Finish(std::shared_ptr<StructArray> * out)464   Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }
465 
466   /// Null bitmap is of equal length to every child field, and any zero byte
467   /// will be considered as a null for that field, but users must using app-
468   /// end methods or advance methods of the child builders' independently to
469   /// insert data.
AppendValues(int64_t length,const uint8_t * valid_bytes)470   Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
471     ARROW_RETURN_NOT_OK(Reserve(length));
472     UnsafeAppendToBitmap(valid_bytes, length);
473     return Status::OK();
474   }
475 
476   /// Append an element to the Struct. All child-builders' Append method must
477   /// be called independently to maintain data-structure consistency.
478   Status Append(bool is_valid = true) {
479     ARROW_RETURN_NOT_OK(Reserve(1));
480     UnsafeAppendToBitmap(is_valid);
481     return Status::OK();
482   }
483 
484   /// \brief Append a null value. Automatically appends an empty value to each child
485   /// builder.
AppendNull()486   Status AppendNull() final {
487     for (const auto& field : children_) {
488       ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
489     }
490     return Append(false);
491   }
492 
493   /// \brief Append multiple null values. Automatically appends empty values to each
494   /// child builder.
AppendNulls(int64_t length)495   Status AppendNulls(int64_t length) final {
496     for (const auto& field : children_) {
497       ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
498     }
499     ARROW_RETURN_NOT_OK(Reserve(length));
500     UnsafeAppendToBitmap(length, false);
501     return Status::OK();
502   }
503 
AppendEmptyValue()504   Status AppendEmptyValue() final {
505     for (const auto& field : children_) {
506       ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
507     }
508     return Append(true);
509   }
510 
AppendEmptyValues(int64_t length)511   Status AppendEmptyValues(int64_t length) final {
512     for (const auto& field : children_) {
513       ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
514     }
515     ARROW_RETURN_NOT_OK(Reserve(length));
516     UnsafeAppendToBitmap(length, true);
517     return Status::OK();
518   }
519 
AppendArraySlice(const ArrayData & array,int64_t offset,int64_t length)520   Status AppendArraySlice(const ArrayData& array, int64_t offset,
521                           int64_t length) override {
522     for (int i = 0; static_cast<size_t>(i) < children_.size(); i++) {
523       ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i],
524                                                          array.offset + offset, length));
525     }
526     const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
527     ARROW_RETURN_NOT_OK(Reserve(length));
528     UnsafeAppendToBitmap(validity, array.offset + offset, length);
529     return Status::OK();
530   }
531 
532   void Reset() override;
533 
field_builder(int i)534   ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
535 
num_fields()536   int num_fields() const { return static_cast<int>(children_.size()); }
537 
538   std::shared_ptr<DataType> type() const override;
539 
540  private:
541   std::shared_ptr<DataType> type_;
542 };
543 
544 }  // namespace arrow
545