1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <atomic> // IWYU pragma: export 21 #include <cstdint> 22 #include <iosfwd> 23 #include <memory> 24 #include <string> 25 #include <type_traits> 26 #include <utility> 27 #include <vector> 28 29 #include "arrow/compare.h" 30 #include "arrow/type.h" 31 #include "arrow/type_fwd.h" 32 #include "arrow/type_traits.h" 33 #include "arrow/util/bit_util.h" 34 #include "arrow/util/checked_cast.h" 35 #include "arrow/util/macros.h" 36 #include "arrow/util/string_view.h" // IWYU pragma: export 37 #include "arrow/util/visibility.h" 38 39 namespace arrow { 40 41 class Array; 42 class ArrayVisitor; 43 44 // When slicing, we do not know the null count of the sliced range without 45 // doing some computation. To avoid doing this eagerly, we set the null count 46 // to -1 (any negative number will do). When Array::null_count is called the 47 // first time, the null count will be computed. See ARROW-33 48 constexpr int64_t kUnknownNullCount = -1; 49 50 class MemoryPool; 51 class Status; 52 53 // ---------------------------------------------------------------------- 54 // Generic array data container 55 56 /// \class ArrayData 57 /// \brief Mutable container for generic Arrow array data 58 /// 59 /// This data structure is a self-contained representation of the memory and 60 /// metadata inside an Arrow array data structure (called vectors in Java). The 61 /// classes arrow::Array and its subclasses provide strongly-typed accessors 62 /// with support for the visitor pattern and other affordances. 63 /// 64 /// This class is designed for easy internal data manipulation, analytical data 65 /// processing, and data transport to and from IPC messages. For example, we 66 /// could cast from int64 to float64 like so: 67 /// 68 /// Int64Array arr = GetMyData(); 69 /// auto new_data = arr.data()->Copy(); 70 /// new_data->type = arrow::float64(); 71 /// DoubleArray double_arr(new_data); 72 /// 73 /// This object is also useful in an analytics setting where memory may be 74 /// reused. For example, if we had a group of operations all returning doubles, 75 /// say: 76 /// 77 /// Log(Sqrt(Expr(arr))) 78 /// 79 /// Then the low-level implementations of each of these functions could have 80 /// the signatures 81 /// 82 /// void Log(const ArrayData& values, ArrayData* out); 83 /// 84 /// As another example a function may consume one or more memory buffers in an 85 /// input array and replace them with newly-allocated data, changing the output 86 /// data type as well. 87 struct ARROW_EXPORT ArrayData { ArrayDataArrayData88 ArrayData() : length(0), null_count(0), offset(0) {} 89 90 ArrayData(const std::shared_ptr<DataType>& type, int64_t length, 91 int64_t null_count = kUnknownNullCount, int64_t offset = 0) typeArrayData92 : type(type), length(length), null_count(null_count), offset(offset) {} 93 94 ArrayData(const std::shared_ptr<DataType>& type, int64_t length, 95 std::vector<std::shared_ptr<Buffer>> buffers, 96 int64_t null_count = kUnknownNullCount, int64_t offset = 0) ArrayDataArrayData97 : ArrayData(type, length, null_count, offset) { 98 this->buffers = std::move(buffers); 99 } 100 101 ArrayData(const std::shared_ptr<DataType>& type, int64_t length, 102 std::vector<std::shared_ptr<Buffer>> buffers, 103 std::vector<std::shared_ptr<ArrayData>> child_data, 104 int64_t null_count = kUnknownNullCount, int64_t offset = 0) ArrayDataArrayData105 : ArrayData(type, length, null_count, offset) { 106 this->buffers = std::move(buffers); 107 this->child_data = std::move(child_data); 108 } 109 110 static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type, 111 int64_t length, 112 std::vector<std::shared_ptr<Buffer>> buffers, 113 int64_t null_count = kUnknownNullCount, 114 int64_t offset = 0); 115 116 static std::shared_ptr<ArrayData> Make( 117 const std::shared_ptr<DataType>& type, int64_t length, 118 std::vector<std::shared_ptr<Buffer>> buffers, 119 std::vector<std::shared_ptr<ArrayData>> child_data, 120 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 121 122 static std::shared_ptr<ArrayData> Make( 123 const std::shared_ptr<DataType>& type, int64_t length, 124 std::vector<std::shared_ptr<Buffer>> buffers, 125 std::vector<std::shared_ptr<ArrayData>> child_data, 126 std::shared_ptr<Array> dictionary, int64_t null_count = kUnknownNullCount, 127 int64_t offset = 0); 128 129 static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type, 130 int64_t length, 131 int64_t null_count = kUnknownNullCount, 132 int64_t offset = 0); 133 134 // Move constructor ArrayDataArrayData135 ArrayData(ArrayData&& other) noexcept 136 : type(std::move(other.type)), 137 length(other.length), 138 offset(other.offset), 139 buffers(std::move(other.buffers)), 140 child_data(std::move(other.child_data)), 141 dictionary(std::move(other.dictionary)) { 142 SetNullCount(other.null_count); 143 } 144 145 // Copy constructor ArrayDataArrayData146 ArrayData(const ArrayData& other) noexcept 147 : type(other.type), 148 length(other.length), 149 offset(other.offset), 150 buffers(other.buffers), 151 child_data(other.child_data), 152 dictionary(other.dictionary) { 153 SetNullCount(other.null_count); 154 } 155 156 // Move assignment 157 ArrayData& operator=(ArrayData&& other) { 158 type = std::move(other.type); 159 length = other.length; 160 SetNullCount(other.null_count); 161 offset = other.offset; 162 buffers = std::move(other.buffers); 163 child_data = std::move(other.child_data); 164 dictionary = std::move(other.dictionary); 165 return *this; 166 } 167 168 // Copy assignment 169 ArrayData& operator=(const ArrayData& other) { 170 type = other.type; 171 length = other.length; 172 SetNullCount(other.null_count); 173 offset = other.offset; 174 buffers = other.buffers; 175 child_data = other.child_data; 176 dictionary = other.dictionary; 177 return *this; 178 } 179 CopyArrayData180 std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); } 181 182 // Access a buffer's data as a typed C pointer 183 template <typename T> GetValuesArrayData184 inline const T* GetValues(int i, int64_t absolute_offset) const { 185 if (buffers[i]) { 186 return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset; 187 } else { 188 return NULLPTR; 189 } 190 } 191 192 template <typename T> GetValuesArrayData193 inline const T* GetValues(int i) const { 194 return GetValues<T>(i, offset); 195 } 196 197 // Access a buffer's data as a typed C pointer 198 template <typename T> GetMutableValuesArrayData199 inline T* GetMutableValues(int i, int64_t absolute_offset) { 200 if (buffers[i]) { 201 return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset; 202 } else { 203 return NULLPTR; 204 } 205 } 206 207 template <typename T> GetMutableValuesArrayData208 inline T* GetMutableValues(int i) { 209 return GetMutableValues<T>(i, offset); 210 } 211 212 // Construct a zero-copy slice of the data with the indicated offset and length 213 ArrayData Slice(int64_t offset, int64_t length) const; 214 SetNullCountArrayData215 void SetNullCount(int64_t v) { null_count.store(v); } 216 217 /// \brief Return null count, or compute and set it if it's not known 218 int64_t GetNullCount() const; 219 220 std::shared_ptr<DataType> type; 221 int64_t length; 222 mutable std::atomic<int64_t> null_count; 223 // The logical start point into the physical buffers (in values, not bytes). 224 // Note that, for child data, this must be *added* to the child data's own offset. 225 int64_t offset; 226 std::vector<std::shared_ptr<Buffer>> buffers; 227 std::vector<std::shared_ptr<ArrayData>> child_data; 228 229 // The dictionary for this Array, if any. Only used for dictionary 230 // type 231 std::shared_ptr<Array> dictionary; 232 }; 233 234 /// \brief Create a strongly-typed Array instance from generic ArrayData 235 /// \param[in] data the array contents 236 /// \return the resulting Array instance 237 ARROW_EXPORT 238 std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data); 239 240 /// \brief Create a strongly-typed Array instance with all elements null 241 /// \param[in] type the array type 242 /// \param[in] length the array length 243 /// \param[in] pool the memory pool to allocate memory from 244 ARROW_EXPORT 245 Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type, 246 int64_t length, 247 MemoryPool* pool = default_memory_pool()); 248 249 /// \brief Create an Array instance whose slots are the given scalar 250 /// \param[in] scalar the value with which to fill the array 251 /// \param[in] length the array length 252 /// \param[in] pool the memory pool to allocate memory from 253 ARROW_EXPORT 254 Result<std::shared_ptr<Array>> MakeArrayFromScalar( 255 const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool()); 256 257 /// \brief Create a strongly-typed Array instance with all elements null 258 /// \param[in] type the array type 259 /// \param[in] length the array length 260 /// \param[out] out resulting Array instance 261 ARROW_DEPRECATED("Use Result-returning version") 262 ARROW_EXPORT 263 Status MakeArrayOfNull(const std::shared_ptr<DataType>& type, int64_t length, 264 std::shared_ptr<Array>* out); 265 266 /// \brief Create a strongly-typed Array instance with all elements null 267 /// \param[in] pool the pool from which memory for this array will be allocated 268 /// \param[in] type the array type 269 /// \param[in] length the array length 270 /// \param[out] out resulting Array instance 271 ARROW_DEPRECATED("Use Result-returning version") 272 ARROW_EXPORT 273 Status MakeArrayOfNull(MemoryPool* pool, const std::shared_ptr<DataType>& type, 274 int64_t length, std::shared_ptr<Array>* out); 275 276 /// \brief Create an Array instance whose slots are the given scalar 277 /// \param[in] scalar the value with which to fill the array 278 /// \param[in] length the array length 279 /// \param[out] out resulting Array instance 280 ARROW_DEPRECATED("Use Result-returning version") 281 ARROW_EXPORT 282 Status MakeArrayFromScalar(const Scalar& scalar, int64_t length, 283 std::shared_ptr<Array>* out); 284 285 /// \brief Create a strongly-typed Array instance with all elements null 286 /// \param[in] pool the pool from which memory for this array will be allocated 287 /// \param[in] scalar the value with which to fill the array 288 /// \param[in] length the array length 289 /// \param[out] out resulting Array instance 290 ARROW_DEPRECATED("Use Result-returning version") 291 ARROW_EXPORT 292 Status MakeArrayFromScalar(MemoryPool* pool, const Scalar& scalar, int64_t length, 293 std::shared_ptr<Array>* out); 294 295 // ---------------------------------------------------------------------- 296 // User array accessor types 297 298 /// \brief Array base type 299 /// Immutable data array with some logical type and some length. 300 /// 301 /// Any memory is owned by the respective Buffer instance (or its parents). 302 /// 303 /// The base class is only required to have a null bitmap buffer if the null 304 /// count is greater than 0 305 /// 306 /// If known, the null count can be provided in the base Array constructor. If 307 /// the null count is not known, pass -1 to indicate that the null count is to 308 /// be computed on the first call to null_count() 309 class ARROW_EXPORT Array { 310 public: 311 virtual ~Array() = default; 312 313 /// \brief Return true if value at index is null. Does not boundscheck IsNull(int64_t i)314 bool IsNull(int64_t i) const { 315 return null_bitmap_data_ != NULLPTR && 316 !BitUtil::GetBit(null_bitmap_data_, i + data_->offset); 317 } 318 319 /// \brief Return true if value at index is valid (not null). Does not 320 /// boundscheck IsValid(int64_t i)321 bool IsValid(int64_t i) const { 322 return null_bitmap_data_ == NULLPTR || 323 BitUtil::GetBit(null_bitmap_data_, i + data_->offset); 324 } 325 326 /// \brief Return a Scalar containing the value of this array at i 327 Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const; 328 329 /// Size in the number of elements this array contains. length()330 int64_t length() const { return data_->length; } 331 332 /// A relative position into another array's data, to enable zero-copy 333 /// slicing. This value defaults to zero offset()334 int64_t offset() const { return data_->offset; } 335 336 /// The number of null entries in the array. If the null count was not known 337 /// at time of construction (and set to a negative value), then the null 338 /// count will be computed and cached on the first invocation of this 339 /// function 340 int64_t null_count() const; 341 type()342 std::shared_ptr<DataType> type() const { return data_->type; } type_id()343 Type::type type_id() const { return data_->type->id(); } 344 345 /// Buffer for the null bitmap. 346 /// 347 /// Note that for `null_count == 0`, this can be null. 348 /// This buffer does not account for any slice offset null_bitmap()349 std::shared_ptr<Buffer> null_bitmap() const { return data_->buffers[0]; } 350 351 /// Raw pointer to the null bitmap. 352 /// 353 /// Note that for `null_count == 0`, this can be null. 354 /// This buffer does not account for any slice offset null_bitmap_data()355 const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } 356 357 /// Equality comparison with another array 358 bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; 359 bool Equals(const std::shared_ptr<Array>& arr, 360 const EqualOptions& = EqualOptions::Defaults()) const; 361 362 /// \brief Return the formatted unified diff of arrow::Diff between this 363 /// Array and another Array 364 std::string Diff(const Array& other) const; 365 366 /// Approximate equality comparison with another array 367 /// 368 /// epsilon is only used if this is FloatArray or DoubleArray 369 bool ApproxEquals(const std::shared_ptr<Array>& arr, 370 const EqualOptions& = EqualOptions::Defaults()) const; 371 bool ApproxEquals(const Array& arr, 372 const EqualOptions& = EqualOptions::Defaults()) const; 373 374 /// Compare if the range of slots specified are equal for the given array and 375 /// this array. end_idx exclusive. This methods does not bounds check. 376 bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, 377 const Array& other) const; 378 bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, 379 const std::shared_ptr<Array>& other) const; 380 bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, 381 int64_t other_start_idx) const; 382 bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx, 383 int64_t end_idx, int64_t other_start_idx) const; 384 385 Status Accept(ArrayVisitor* visitor) const; 386 387 /// Construct a zero-copy view of this array with the given type. 388 /// 389 /// This method checks if the types are layout-compatible. 390 /// Nested types are traversed in depth-first order. Data buffers must have 391 /// the same item sizes, even though the logical types may be different. 392 /// An error is returned if the types are not layout-compatible. 393 Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const; 394 395 ARROW_DEPRECATED("Use Result-returning version") 396 Status View(const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) const; 397 398 /// Construct a zero-copy slice of the array with the indicated offset and 399 /// length 400 /// 401 /// \param[in] offset the position of the first element in the constructed 402 /// slice 403 /// \param[in] length the length of the slice. If there are not enough 404 /// elements in the array, the length will be adjusted accordingly 405 /// 406 /// \return a new object wrapped in std::shared_ptr<Array> 407 std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const; 408 409 /// Slice from offset until end of the array 410 std::shared_ptr<Array> Slice(int64_t offset) const; 411 data()412 std::shared_ptr<ArrayData> data() const { return data_; } 413 num_fields()414 int num_fields() const { return static_cast<int>(data_->child_data.size()); } 415 416 /// \return PrettyPrint representation of array suitable for debugging 417 std::string ToString() const; 418 419 /// \brief Perform cheap validation checks to determine obvious inconsistencies 420 /// within the array's internal data. 421 /// 422 /// This is O(k) where k is the number of descendents. 423 /// 424 /// \return Status 425 Status Validate() const; 426 427 /// \brief Perform extensive validation checks to determine inconsistencies 428 /// within the array's internal data. 429 /// 430 /// This is potentially O(k*n) where k is the number of descendents and n 431 /// is the array length. 432 /// 433 /// \return Status 434 Status ValidateFull() const; 435 436 protected: Array()437 Array() : null_bitmap_data_(NULLPTR) {} 438 439 std::shared_ptr<ArrayData> data_; 440 const uint8_t* null_bitmap_data_; 441 442 /// Protected method for constructors SetData(const std::shared_ptr<ArrayData> & data)443 inline void SetData(const std::shared_ptr<ArrayData>& data) { 444 if (data->buffers.size() > 0 && data->buffers[0]) { 445 null_bitmap_data_ = data->buffers[0]->data(); 446 } else { 447 null_bitmap_data_ = NULLPTR; 448 } 449 data_ = data; 450 } 451 452 private: 453 ARROW_DISALLOW_COPY_AND_ASSIGN(Array); 454 }; 455 456 namespace internal { 457 458 /// Given a number of ArrayVectors, treat each ArrayVector as the 459 /// chunks of a chunked array. Then rechunk each ArrayVector such that 460 /// all ArrayVectors are chunked identically. It is mandatory that 461 /// all ArrayVectors contain the same total number of elements. 462 ARROW_EXPORT 463 std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&); 464 465 } // namespace internal 466 467 static inline std::ostream& operator<<(std::ostream& os, const Array& x) { 468 os << x.ToString(); 469 return os; 470 } 471 472 /// Base class for non-nested arrays 473 class ARROW_EXPORT FlatArray : public Array { 474 protected: 475 using Array::Array; 476 }; 477 478 /// Degenerate null type Array 479 class ARROW_EXPORT NullArray : public FlatArray { 480 public: 481 using TypeClass = NullType; 482 NullArray(const std::shared_ptr<ArrayData> & data)483 explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); } 484 explicit NullArray(int64_t length); 485 486 private: SetData(const std::shared_ptr<ArrayData> & data)487 inline void SetData(const std::shared_ptr<ArrayData>& data) { 488 null_bitmap_data_ = NULLPTR; 489 data->null_count = data->length; 490 data_ = data; 491 } 492 }; 493 494 /// Base class for arrays of fixed-size logical types 495 class ARROW_EXPORT PrimitiveArray : public FlatArray { 496 public: 497 PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length, 498 const std::shared_ptr<Buffer>& data, 499 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 500 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 501 502 /// Does not account for any slice offset values()503 std::shared_ptr<Buffer> values() const { return data_->buffers[1]; } 504 505 protected: PrimitiveArray()506 PrimitiveArray() : raw_values_(NULLPTR) {} 507 SetData(const std::shared_ptr<ArrayData> & data)508 inline void SetData(const std::shared_ptr<ArrayData>& data) { 509 auto values = data->buffers[1]; 510 this->Array::SetData(data); 511 raw_values_ = values == NULLPTR ? NULLPTR : values->data(); 512 } 513 PrimitiveArray(const std::shared_ptr<ArrayData> & data)514 explicit inline PrimitiveArray(const std::shared_ptr<ArrayData>& data) { 515 SetData(data); 516 } 517 518 const uint8_t* raw_values_; 519 }; 520 521 /// Concrete Array class for numeric data. 522 template <typename TYPE> 523 class NumericArray : public PrimitiveArray { 524 public: 525 using TypeClass = TYPE; 526 using value_type = typename TypeClass::c_type; 527 NumericArray(const std::shared_ptr<ArrayData> & data)528 explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {} 529 530 // Only enable this constructor without a type argument for types without additional 531 // metadata 532 template <typename T1 = TYPE> 533 NumericArray(enable_if_parameter_free<T1, int64_t> length, 534 const std::shared_ptr<Buffer>& data, 535 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 536 int64_t null_count = kUnknownNullCount, int64_t offset = 0) PrimitiveArray(TypeTraits<T1>::type_singleton (),length,data,null_bitmap,null_count,offset)537 : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap, 538 null_count, offset) {} 539 raw_values()540 const value_type* raw_values() const { 541 return reinterpret_cast<const value_type*>(raw_values_) + data_->offset; 542 } 543 Value(int64_t i)544 value_type Value(int64_t i) const { return raw_values()[i]; } 545 546 // For API compatibility with BinaryArray etc. GetView(int64_t i)547 value_type GetView(int64_t i) const { return Value(i); } 548 549 protected: 550 using PrimitiveArray::PrimitiveArray; 551 }; 552 553 /// Concrete Array class for boolean data 554 class ARROW_EXPORT BooleanArray : public PrimitiveArray { 555 public: 556 using TypeClass = BooleanType; 557 558 explicit BooleanArray(const std::shared_ptr<ArrayData>& data); 559 560 BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data, 561 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 562 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 563 Value(int64_t i)564 bool Value(int64_t i) const { 565 return BitUtil::GetBit(reinterpret_cast<const uint8_t*>(raw_values_), 566 i + data_->offset); 567 } 568 GetView(int64_t i)569 bool GetView(int64_t i) const { return Value(i); } 570 571 protected: 572 using PrimitiveArray::PrimitiveArray; 573 }; 574 575 // ---------------------------------------------------------------------- 576 // ListArray 577 578 /// Base class for variable-sized list arrays, regardless of offset size. 579 template <typename TYPE> 580 class BaseListArray : public Array { 581 public: 582 using TypeClass = TYPE; 583 using offset_type = typename TypeClass::offset_type; 584 list_type()585 const TypeClass* list_type() const { return list_type_; } 586 587 /// \brief Return array object containing the list's values values()588 std::shared_ptr<Array> values() const { return values_; } 589 590 /// Note that this buffer does not account for any slice offset value_offsets()591 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; } 592 value_type()593 std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); } 594 595 /// Return pointer to raw value offsets accounting for any slice offset raw_value_offsets()596 const offset_type* raw_value_offsets() const { 597 return raw_value_offsets_ + data_->offset; 598 } 599 600 // The following functions will not perform boundschecking value_offset(int64_t i)601 offset_type value_offset(int64_t i) const { 602 return raw_value_offsets_[i + data_->offset]; 603 } value_length(int64_t i)604 offset_type value_length(int64_t i) const { 605 i += data_->offset; 606 return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; 607 } value_slice(int64_t i)608 std::shared_ptr<Array> value_slice(int64_t i) const { 609 return values_->Slice(value_offset(i), value_length(i)); 610 } 611 612 protected: 613 const TypeClass* list_type_ = NULLPTR; 614 std::shared_ptr<Array> values_; 615 const offset_type* raw_value_offsets_ = NULLPTR; 616 }; 617 618 /// Concrete Array class for list data 619 class ARROW_EXPORT ListArray : public BaseListArray<ListType> { 620 public: 621 explicit ListArray(std::shared_ptr<ArrayData> data); 622 623 ListArray(std::shared_ptr<DataType> type, int64_t length, 624 std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values, 625 std::shared_ptr<Buffer> null_bitmap = NULLPTR, 626 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 627 628 /// \brief Construct ListArray from array of offsets and child value array 629 /// 630 /// This function does the bare minimum of validation of the offsets and 631 /// input types, and will allocate a new offsets array if necessary (i.e. if 632 /// the offsets contain any nulls). If the offsets do not have nulls, they 633 /// are assumed to be well-formed 634 /// 635 /// \param[in] offsets Array containing n + 1 offsets encoding length and 636 /// size. Must be of int32 type 637 /// \param[in] values Array containing list values 638 /// \param[in] pool MemoryPool in case new offsets array needs to be 639 /// allocated because of null values 640 static Result<std::shared_ptr<Array>> FromArrays( 641 const Array& offsets, const Array& values, 642 MemoryPool* pool = default_memory_pool()); 643 644 ARROW_DEPRECATED("Use Result-returning version") 645 static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, 646 std::shared_ptr<Array>* out); 647 648 /// \brief Return an Array that is a concatenation of the lists in this array. 649 /// 650 /// Note that it's different from `values()` in that it takes into 651 /// consideration of this array's offsets as well as null elements backed 652 /// by non-empty lists (they are skipped, thus copying may be needed). 653 Result<std::shared_ptr<Array>> Flatten( 654 MemoryPool* memory_pool = default_memory_pool()) const; 655 656 protected: 657 // This constructor defers SetData to a derived array class 658 ListArray() = default; 659 void SetData(const std::shared_ptr<ArrayData>& data, 660 Type::type expected_type_id = Type::LIST); 661 }; 662 663 /// Concrete Array class for large list data (with 64-bit offsets) 664 class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> { 665 public: 666 explicit LargeListArray(const std::shared_ptr<ArrayData>& data); 667 668 LargeListArray(const std::shared_ptr<DataType>& type, int64_t length, 669 const std::shared_ptr<Buffer>& value_offsets, 670 const std::shared_ptr<Array>& values, 671 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 672 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 673 674 /// \brief Construct LargeListArray from array of offsets and child value array 675 /// 676 /// This function does the bare minimum of validation of the offsets and 677 /// input types, and will allocate a new offsets array if necessary (i.e. if 678 /// the offsets contain any nulls). If the offsets do not have nulls, they 679 /// are assumed to be well-formed 680 /// 681 /// \param[in] offsets Array containing n + 1 offsets encoding length and 682 /// size. Must be of int64 type 683 /// \param[in] values Array containing list values 684 /// \param[in] pool MemoryPool in case new offsets array needs to be 685 /// allocated because of null values 686 static Result<std::shared_ptr<Array>> FromArrays( 687 const Array& offsets, const Array& values, 688 MemoryPool* pool = default_memory_pool()); 689 690 ARROW_DEPRECATED("Use Result-returning version") 691 static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, 692 std::shared_ptr<Array>* out); 693 694 /// \brief Return an Array that is a concatenation of the lists in this array. 695 /// 696 /// Note that it's different from `values()` in that it takes into 697 /// consideration of this array's offsets as well as null elements backed 698 /// by non-empty lists (they are skipped, thus copying may be needed). 699 Result<std::shared_ptr<Array>> Flatten( 700 MemoryPool* memory_pool = default_memory_pool()) const; 701 702 protected: 703 void SetData(const std::shared_ptr<ArrayData>& data); 704 }; 705 706 // ---------------------------------------------------------------------- 707 // MapArray 708 709 /// Concrete Array class for map data 710 /// 711 /// NB: "value" in this context refers to a pair of a key and the corresponding item 712 class ARROW_EXPORT MapArray : public ListArray { 713 public: 714 using TypeClass = MapType; 715 716 explicit MapArray(const std::shared_ptr<ArrayData>& data); 717 718 MapArray(const std::shared_ptr<DataType>& type, int64_t length, 719 const std::shared_ptr<Buffer>& value_offsets, 720 const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items, 721 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 722 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 723 724 MapArray(const std::shared_ptr<DataType>& type, int64_t length, 725 const std::shared_ptr<Buffer>& value_offsets, 726 const std::shared_ptr<Array>& values, 727 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 728 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 729 730 /// \brief Construct MapArray from array of offsets and child key, item arrays 731 /// 732 /// This function does the bare minimum of validation of the offsets and 733 /// input types, and will allocate a new offsets array if necessary (i.e. if 734 /// the offsets contain any nulls). If the offsets do not have nulls, they 735 /// are assumed to be well-formed 736 /// 737 /// \param[in] offsets Array containing n + 1 offsets encoding length and 738 /// size. Must be of int32 type 739 /// \param[in] keys Array containing key values 740 /// \param[in] items Array containing item values 741 /// \param[in] pool MemoryPool in case new offsets array needs to be 742 /// allocated because of null values 743 static Result<std::shared_ptr<Array>> FromArrays( 744 const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys, 745 const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool()); 746 747 ARROW_DEPRECATED("Use Result-returning version") 748 static Status FromArrays(const std::shared_ptr<Array>& offsets, 749 const std::shared_ptr<Array>& keys, 750 const std::shared_ptr<Array>& items, MemoryPool* pool, 751 std::shared_ptr<Array>* out); 752 map_type()753 const MapType* map_type() const { return map_type_; } 754 755 /// \brief Return array object containing all map keys keys()756 std::shared_ptr<Array> keys() const { return keys_; } 757 758 /// \brief Return array object containing all mapped items items()759 std::shared_ptr<Array> items() const { return items_; } 760 761 /// Validate child data before constructing the actual MapArray. 762 static Status ValidateChildData( 763 const std::vector<std::shared_ptr<ArrayData>>& child_data); 764 765 protected: 766 void SetData(const std::shared_ptr<ArrayData>& data); 767 768 private: 769 const MapType* map_type_; 770 std::shared_ptr<Array> keys_, items_; 771 }; 772 773 // ---------------------------------------------------------------------- 774 // FixedSizeListArray 775 776 /// Concrete Array class for fixed size list data 777 class ARROW_EXPORT FixedSizeListArray : public Array { 778 public: 779 using TypeClass = FixedSizeListType; 780 using offset_type = TypeClass::offset_type; 781 782 explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data); 783 784 FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length, 785 const std::shared_ptr<Array>& values, 786 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 787 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 788 789 const FixedSizeListType* list_type() const; 790 791 /// \brief Return array object containing the list's values 792 std::shared_ptr<Array> values() const; 793 794 std::shared_ptr<DataType> value_type() const; 795 796 // The following functions will not perform boundschecking value_offset(int64_t i)797 int32_t value_offset(int64_t i) const { 798 i += data_->offset; 799 return static_cast<int32_t>(list_size_ * i); 800 } 801 int32_t value_length(int64_t i = 0) const { return list_size_; } value_slice(int64_t i)802 std::shared_ptr<Array> value_slice(int64_t i) const { 803 return values_->Slice(value_offset(i), value_length(i)); 804 } 805 806 /// \brief Construct FixedSizeListArray from child value array and value_length 807 /// 808 /// \param[in] values Array containing list values 809 /// \param[in] list_size The fixed length of each list 810 /// \return Will have length equal to values.length() / list_size 811 static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values, 812 int32_t list_size); 813 814 protected: 815 void SetData(const std::shared_ptr<ArrayData>& data); 816 int32_t list_size_; 817 818 private: 819 std::shared_ptr<Array> values_; 820 }; 821 822 // ---------------------------------------------------------------------- 823 // Binary and String 824 825 /// Base class for variable-sized binary arrays, regardless of offset size 826 /// and logical interpretation. 827 template <typename TYPE> 828 class BaseBinaryArray : public FlatArray { 829 public: 830 using TypeClass = TYPE; 831 using offset_type = typename TypeClass::offset_type; 832 833 /// Return the pointer to the given elements bytes 834 // XXX should GetValue(int64_t i) return a string_view? GetValue(int64_t i,offset_type * out_length)835 const uint8_t* GetValue(int64_t i, offset_type* out_length) const { 836 // Account for base offset 837 i += data_->offset; 838 const offset_type pos = raw_value_offsets_[i]; 839 *out_length = raw_value_offsets_[i + 1] - pos; 840 return raw_data_ + pos; 841 } 842 843 /// \brief Get binary value as a string_view 844 /// 845 /// \param i the value index 846 /// \return the view over the selected value GetView(int64_t i)847 util::string_view GetView(int64_t i) const { 848 // Account for base offset 849 i += data_->offset; 850 const offset_type pos = raw_value_offsets_[i]; 851 return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos), 852 raw_value_offsets_[i + 1] - pos); 853 } 854 855 /// \brief Get binary value as a std::string 856 /// 857 /// \param i the value index 858 /// \return the value copied into a std::string GetString(int64_t i)859 std::string GetString(int64_t i) const { return std::string(GetView(i)); } 860 861 /// Note that this buffer does not account for any slice offset value_offsets()862 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; } 863 864 /// Note that this buffer does not account for any slice offset value_data()865 std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; } 866 raw_value_offsets()867 const offset_type* raw_value_offsets() const { 868 return raw_value_offsets_ + data_->offset; 869 } 870 871 // Neither of these functions will perform boundschecking value_offset(int64_t i)872 offset_type value_offset(int64_t i) const { 873 return raw_value_offsets_[i + data_->offset]; 874 } value_length(int64_t i)875 offset_type value_length(int64_t i) const { 876 i += data_->offset; 877 return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; 878 } 879 880 protected: 881 // For subclasses BaseBinaryArray()882 BaseBinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} 883 884 // Protected method for constructors SetData(const std::shared_ptr<ArrayData> & data)885 void SetData(const std::shared_ptr<ArrayData>& data) { 886 auto value_offsets = data->buffers[1]; 887 auto value_data = data->buffers[2]; 888 this->Array::SetData(data); 889 raw_data_ = value_data == NULLPTR ? NULLPTR : value_data->data(); 890 raw_value_offsets_ = 891 value_offsets == NULLPTR 892 ? NULLPTR 893 : reinterpret_cast<const offset_type*>(value_offsets->data()); 894 } 895 896 const offset_type* raw_value_offsets_; 897 const uint8_t* raw_data_; 898 }; 899 900 /// Concrete Array class for variable-size binary data 901 class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> { 902 public: 903 explicit BinaryArray(const std::shared_ptr<ArrayData>& data); 904 905 BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, 906 const std::shared_ptr<Buffer>& data, 907 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 908 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 909 910 protected: 911 // For subclasses such as StringArray BinaryArray()912 BinaryArray() : BaseBinaryArray() {} 913 }; 914 915 /// Concrete Array class for variable-size string (utf-8) data 916 class ARROW_EXPORT StringArray : public BinaryArray { 917 public: 918 using TypeClass = StringType; 919 920 explicit StringArray(const std::shared_ptr<ArrayData>& data); 921 922 StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, 923 const std::shared_ptr<Buffer>& data, 924 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 925 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 926 }; 927 928 /// Concrete Array class for large variable-size binary data 929 class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> { 930 public: 931 explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data); 932 933 LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, 934 const std::shared_ptr<Buffer>& data, 935 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 936 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 937 938 protected: 939 // For subclasses such as LargeStringArray LargeBinaryArray()940 LargeBinaryArray() : BaseBinaryArray() {} 941 }; 942 943 /// Concrete Array class for large variable-size string (utf-8) data 944 class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { 945 public: 946 using TypeClass = LargeStringType; 947 948 explicit LargeStringArray(const std::shared_ptr<ArrayData>& data); 949 950 LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, 951 const std::shared_ptr<Buffer>& data, 952 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 953 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 954 }; 955 956 // ---------------------------------------------------------------------- 957 // Fixed width binary 958 959 /// Concrete Array class for fixed-size binary data 960 class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { 961 public: 962 using TypeClass = FixedSizeBinaryType; 963 964 explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data); 965 966 FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length, 967 const std::shared_ptr<Buffer>& data, 968 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 969 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 970 971 const uint8_t* GetValue(int64_t i) const; Value(int64_t i)972 const uint8_t* Value(int64_t i) const { return GetValue(i); } 973 GetView(int64_t i)974 util::string_view GetView(int64_t i) const { 975 return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width()); 976 } 977 GetString(int64_t i)978 std::string GetString(int64_t i) const { return std::string(GetView(i)); } 979 byte_width()980 int32_t byte_width() const { return byte_width_; } 981 raw_values()982 const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } 983 984 protected: SetData(const std::shared_ptr<ArrayData> & data)985 inline void SetData(const std::shared_ptr<ArrayData>& data) { 986 this->PrimitiveArray::SetData(data); 987 byte_width_ = 988 internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width(); 989 } 990 991 int32_t byte_width_; 992 }; 993 994 /// DayTimeArray 995 /// --------------------- 996 /// \brief Array of Day and Millisecond values. 997 class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { 998 public: 999 using TypeClass = DayTimeIntervalType; 1000 1001 explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data); 1002 1003 DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length, 1004 const std::shared_ptr<Buffer>& data, 1005 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 1006 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 1007 1008 TypeClass::DayMilliseconds GetValue(int64_t i) const; Value(int64_t i)1009 TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); } 1010 1011 // For compatibility with Take kernel. GetView(int64_t i)1012 TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); } 1013 byte_width()1014 int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); } 1015 raw_values()1016 const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); } 1017 1018 protected: SetData(const std::shared_ptr<ArrayData> & data)1019 inline void SetData(const std::shared_ptr<ArrayData>& data) { 1020 this->PrimitiveArray::SetData(data); 1021 } 1022 }; 1023 1024 // ---------------------------------------------------------------------- 1025 // Decimal128Array 1026 1027 /// Concrete Array class for 128-bit decimal data 1028 class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray { 1029 public: 1030 using TypeClass = Decimal128Type; 1031 1032 using FixedSizeBinaryArray::FixedSizeBinaryArray; 1033 1034 /// \brief Construct Decimal128Array from ArrayData instance 1035 explicit Decimal128Array(const std::shared_ptr<ArrayData>& data); 1036 1037 std::string FormatValue(int64_t i) const; 1038 }; 1039 1040 // Backward compatibility 1041 using DecimalArray = Decimal128Array; 1042 1043 // ---------------------------------------------------------------------- 1044 // Struct 1045 1046 /// Concrete Array class for struct data 1047 class ARROW_EXPORT StructArray : public Array { 1048 public: 1049 using TypeClass = StructType; 1050 1051 explicit StructArray(const std::shared_ptr<ArrayData>& data); 1052 1053 StructArray(const std::shared_ptr<DataType>& type, int64_t length, 1054 const std::vector<std::shared_ptr<Array>>& children, 1055 std::shared_ptr<Buffer> null_bitmap = NULLPTR, 1056 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 1057 1058 /// \brief Return a StructArray from child arrays and field names. 1059 /// 1060 /// The length and data type are automatically inferred from the arguments. 1061 /// There should be at least one child array. 1062 static Result<std::shared_ptr<StructArray>> Make( 1063 const std::vector<std::shared_ptr<Array>>& children, 1064 const std::vector<std::string>& field_names, 1065 std::shared_ptr<Buffer> null_bitmap = NULLPTR, 1066 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 1067 1068 /// \brief Return a StructArray from child arrays and fields. 1069 /// 1070 /// The length is automatically inferred from the arguments. 1071 /// There should be at least one child array. This method does not 1072 /// check that field types and child array types are consistent. 1073 static Result<std::shared_ptr<StructArray>> Make( 1074 const std::vector<std::shared_ptr<Array>>& children, 1075 const std::vector<std::shared_ptr<Field>>& fields, 1076 std::shared_ptr<Buffer> null_bitmap = NULLPTR, 1077 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 1078 1079 const StructType* struct_type() const; 1080 1081 // Return a shared pointer in case the requestor desires to share ownership 1082 // with this array. The returned array has its offset, length and null 1083 // count adjusted. 1084 std::shared_ptr<Array> field(int pos) const; 1085 1086 const ArrayVector& fields() const; 1087 1088 /// Returns null if name not found 1089 std::shared_ptr<Array> GetFieldByName(const std::string& name) const; 1090 1091 /// \brief Flatten this array as a vector of arrays, one for each field 1092 /// 1093 /// \param[in] pool The pool to allocate null bitmaps from, if necessary 1094 Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const; 1095 1096 ARROW_DEPRECATED("Use Result-returning version") 1097 Status Flatten(MemoryPool* pool, ArrayVector* out) const; 1098 1099 private: 1100 // For caching boxed child data 1101 // XXX This is not handled in a thread-safe manner. 1102 mutable ArrayVector boxed_fields_; 1103 }; 1104 1105 // ---------------------------------------------------------------------- 1106 // Union 1107 1108 /// Concrete Array class for union data 1109 class ARROW_EXPORT UnionArray : public Array { 1110 public: 1111 using TypeClass = UnionType; 1112 1113 using type_code_t = int8_t; 1114 1115 explicit UnionArray(const std::shared_ptr<ArrayData>& data); 1116 1117 UnionArray(const std::shared_ptr<DataType>& type, int64_t length, 1118 const std::vector<std::shared_ptr<Array>>& children, 1119 const std::shared_ptr<Buffer>& type_ids, 1120 const std::shared_ptr<Buffer>& value_offsets = NULLPTR, 1121 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 1122 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 1123 1124 /// \brief Construct Dense UnionArray from types_ids, value_offsets and children 1125 /// 1126 /// This function does the bare minimum of validation of the offsets and 1127 /// input types. The value_offsets are assumed to be well-formed. 1128 /// 1129 /// \param[in] type_ids An array of logical type ids for the union type 1130 /// \param[in] value_offsets An array of signed int32 values indicating the 1131 /// relative offset into the respective child array for the type in a given slot. 1132 /// The respective offsets for each child value array must be in order / increasing. 1133 /// \param[in] children Vector of children Arrays containing the data for each type. 1134 /// \param[in] field_names Vector of strings containing the name of each field. 1135 /// \param[in] type_codes Vector of type codes. 1136 static Result<std::shared_ptr<Array>> MakeDense( 1137 const Array& type_ids, const Array& value_offsets, 1138 const std::vector<std::shared_ptr<Array>>& children, 1139 const std::vector<std::string>& field_names = {}, 1140 const std::vector<type_code_t>& type_codes = {}); 1141 1142 /// \brief Construct Dense UnionArray from types_ids, value_offsets and children 1143 /// 1144 /// This function does the bare minimum of validation of the offsets and 1145 /// input types. The value_offsets are assumed to be well-formed. 1146 /// 1147 /// \param[in] type_ids An array of logical type ids for the union type 1148 /// \param[in] value_offsets An array of signed int32 values indicating the 1149 /// relative offset into the respective child array for the type in a given slot. 1150 /// The respective offsets for each child value array must be in order / increasing. 1151 /// \param[in] children Vector of children Arrays containing the data for each type. 1152 /// \param[in] type_codes Vector of type codes. MakeDense(const Array & type_ids,const Array & value_offsets,const std::vector<std::shared_ptr<Array>> & children,const std::vector<type_code_t> & type_codes)1153 static Result<std::shared_ptr<Array>> MakeDense( 1154 const Array& type_ids, const Array& value_offsets, 1155 const std::vector<std::shared_ptr<Array>>& children, 1156 const std::vector<type_code_t>& type_codes) { 1157 return MakeDense(type_ids, value_offsets, children, std::vector<std::string>{}, 1158 type_codes); 1159 } 1160 1161 ARROW_DEPRECATED("Use Result-returning version") MakeDense(const Array & type_ids,const Array & value_offsets,const std::vector<std::shared_ptr<Array>> & children,const std::vector<std::string> & field_names,const std::vector<type_code_t> & type_codes,std::shared_ptr<Array> * out)1162 static Status MakeDense(const Array& type_ids, const Array& value_offsets, 1163 const std::vector<std::shared_ptr<Array>>& children, 1164 const std::vector<std::string>& field_names, 1165 const std::vector<type_code_t>& type_codes, 1166 std::shared_ptr<Array>* out) { 1167 return MakeDense(type_ids, value_offsets, children, field_names, type_codes) 1168 .Value(out); 1169 } 1170 1171 ARROW_DEPRECATED("Use Result-returning version") MakeDense(const Array & type_ids,const Array & value_offsets,const std::vector<std::shared_ptr<Array>> & children,const std::vector<std::string> & field_names,std::shared_ptr<Array> * out)1172 static Status MakeDense(const Array& type_ids, const Array& value_offsets, 1173 const std::vector<std::shared_ptr<Array>>& children, 1174 const std::vector<std::string>& field_names, 1175 std::shared_ptr<Array>* out) { 1176 return MakeDense(type_ids, value_offsets, children, field_names).Value(out); 1177 } 1178 1179 ARROW_DEPRECATED("Use Result-returning version") MakeDense(const Array & type_ids,const Array & value_offsets,const std::vector<std::shared_ptr<Array>> & children,const std::vector<type_code_t> & type_codes,std::shared_ptr<Array> * out)1180 static Status MakeDense(const Array& type_ids, const Array& value_offsets, 1181 const std::vector<std::shared_ptr<Array>>& children, 1182 const std::vector<type_code_t>& type_codes, 1183 std::shared_ptr<Array>* out) { 1184 return MakeDense(type_ids, value_offsets, children, type_codes).Value(out); 1185 } 1186 1187 ARROW_DEPRECATED("Use Result-returning version") MakeDense(const Array & type_ids,const Array & value_offsets,const std::vector<std::shared_ptr<Array>> & children,std::shared_ptr<Array> * out)1188 static Status MakeDense(const Array& type_ids, const Array& value_offsets, 1189 const std::vector<std::shared_ptr<Array>>& children, 1190 std::shared_ptr<Array>* out) { 1191 return MakeDense(type_ids, value_offsets, children).Value(out); 1192 } 1193 1194 /// \brief Construct Sparse UnionArray from type_ids and children 1195 /// 1196 /// This function does the bare minimum of validation of the offsets and 1197 /// input types. 1198 /// 1199 /// \param[in] type_ids An array of logical type ids for the union type 1200 /// \param[in] children Vector of children Arrays containing the data for each type. 1201 /// \param[in] field_names Vector of strings containing the name of each field. 1202 /// \param[in] type_codes Vector of type codes. 1203 static Result<std::shared_ptr<Array>> MakeSparse( 1204 const Array& type_ids, const std::vector<std::shared_ptr<Array>>& children, 1205 const std::vector<std::string>& field_names = {}, 1206 const std::vector<type_code_t>& type_codes = {}); 1207 1208 /// \brief Construct Sparse UnionArray from type_ids and children 1209 /// 1210 /// This function does the bare minimum of validation of the offsets and 1211 /// input types. 1212 /// 1213 /// \param[in] type_ids An array of logical type ids for the union type 1214 /// \param[in] children Vector of children Arrays containing the data for each type. 1215 /// \param[in] type_codes Vector of type codes. MakeSparse(const Array & type_ids,const std::vector<std::shared_ptr<Array>> & children,const std::vector<type_code_t> & type_codes)1216 static Result<std::shared_ptr<Array>> MakeSparse( 1217 const Array& type_ids, const std::vector<std::shared_ptr<Array>>& children, 1218 const std::vector<type_code_t>& type_codes) { 1219 return MakeSparse(type_ids, children, std::vector<std::string>{}, type_codes); 1220 } 1221 1222 ARROW_DEPRECATED("Use Result-returning version") MakeSparse(const Array & type_ids,const std::vector<std::shared_ptr<Array>> & children,const std::vector<std::string> & field_names,const std::vector<type_code_t> & type_codes,std::shared_ptr<Array> * out)1223 static Status MakeSparse(const Array& type_ids, 1224 const std::vector<std::shared_ptr<Array>>& children, 1225 const std::vector<std::string>& field_names, 1226 const std::vector<type_code_t>& type_codes, 1227 std::shared_ptr<Array>* out) { 1228 return MakeSparse(type_ids, children, field_names, type_codes).Value(out); 1229 } 1230 1231 ARROW_DEPRECATED("Use Result-returning version") MakeSparse(const Array & type_ids,const std::vector<std::shared_ptr<Array>> & children,const std::vector<std::string> & field_names,std::shared_ptr<Array> * out)1232 static Status MakeSparse(const Array& type_ids, 1233 const std::vector<std::shared_ptr<Array>>& children, 1234 const std::vector<std::string>& field_names, 1235 std::shared_ptr<Array>* out) { 1236 return MakeSparse(type_ids, children, field_names).Value(out); 1237 } 1238 1239 ARROW_DEPRECATED("Use Result-returning version") MakeSparse(const Array & type_ids,const std::vector<std::shared_ptr<Array>> & children,const std::vector<type_code_t> & type_codes,std::shared_ptr<Array> * out)1240 static Status MakeSparse(const Array& type_ids, 1241 const std::vector<std::shared_ptr<Array>>& children, 1242 const std::vector<type_code_t>& type_codes, 1243 std::shared_ptr<Array>* out) { 1244 return MakeSparse(type_ids, children, type_codes).Value(out); 1245 } 1246 1247 ARROW_DEPRECATED("Use Result-returning version") MakeSparse(const Array & type_ids,const std::vector<std::shared_ptr<Array>> & children,std::shared_ptr<Array> * out)1248 static Status MakeSparse(const Array& type_ids, 1249 const std::vector<std::shared_ptr<Array>>& children, 1250 std::shared_ptr<Array>* out) { 1251 return MakeSparse(type_ids, children).Value(out); 1252 } 1253 1254 /// Note that this buffer does not account for any slice offset type_codes()1255 std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; } 1256 raw_type_codes()1257 const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; } 1258 1259 /// The physical child id containing value at index. child_id(int64_t i)1260 int child_id(int64_t i) const { 1261 return union_type_->child_ids()[raw_type_codes_[i + data_->offset]]; 1262 } 1263 1264 /// For dense arrays only. 1265 /// Note that this buffer does not account for any slice offset value_offsets()1266 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; } 1267 1268 /// For dense arrays only. value_offset(int64_t i)1269 int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } 1270 1271 /// For dense arrays only. raw_value_offsets()1272 const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } 1273 union_type()1274 const UnionType* union_type() const { return union_type_; } 1275 mode()1276 UnionMode::type mode() const { return union_type_->mode(); } 1277 1278 // Return the given field as an individual array. 1279 // For sparse unions, the returned array has its offset, length and null 1280 // count adjusted. 1281 ARROW_DEPRECATED("Use field(pos)") 1282 std::shared_ptr<Array> child(int pos) const; 1283 1284 /// \brief Return the given field as an individual array. 1285 /// 1286 /// For sparse unions, the returned array has its offset, length and null 1287 /// count adjusted. 1288 std::shared_ptr<Array> field(int pos) const; 1289 1290 protected: 1291 void SetData(const std::shared_ptr<ArrayData>& data); 1292 1293 const type_code_t* raw_type_codes_; 1294 const int32_t* raw_value_offsets_; 1295 const UnionType* union_type_; 1296 1297 // For caching boxed child data 1298 mutable std::vector<std::shared_ptr<Array>> boxed_fields_; 1299 }; 1300 1301 // ---------------------------------------------------------------------- 1302 // DictionaryArray 1303 1304 /// \brief Array type for dictionary-encoded data with a 1305 /// data-dependent dictionary 1306 /// 1307 /// A dictionary array contains an array of non-negative integers (the 1308 /// "dictionary indices") along with a data type containing a "dictionary" 1309 /// corresponding to the distinct values represented in the data. 1310 /// 1311 /// For example, the array 1312 /// 1313 /// ["foo", "bar", "foo", "bar", "foo", "bar"] 1314 /// 1315 /// with dictionary ["bar", "foo"], would have dictionary array representation 1316 /// 1317 /// indices: [1, 0, 1, 0, 1, 0] 1318 /// dictionary: ["bar", "foo"] 1319 /// 1320 /// The indices in principle may have any integer type (signed or unsigned), 1321 /// though presently data in IPC exchanges must be signed int32. 1322 class ARROW_EXPORT DictionaryArray : public Array { 1323 public: 1324 using TypeClass = DictionaryType; 1325 1326 explicit DictionaryArray(const std::shared_ptr<ArrayData>& data); 1327 1328 DictionaryArray(const std::shared_ptr<DataType>& type, 1329 const std::shared_ptr<Array>& indices, 1330 const std::shared_ptr<Array>& dictionary); 1331 1332 /// \brief Construct DictionaryArray from dictionary and indices 1333 /// array and validate 1334 /// 1335 /// This function does the validation of the indices and input type. It checks if 1336 /// all indices are non-negative and smaller than the size of the dictionary 1337 /// 1338 /// \param[in] type a dictionary type 1339 /// \param[in] dictionary the dictionary with same value type as the 1340 /// type object 1341 /// \param[in] indices an array of non-negative signed 1342 /// integers smaller than the size of the dictionary 1343 static Result<std::shared_ptr<Array>> FromArrays( 1344 const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices, 1345 const std::shared_ptr<Array>& dictionary); 1346 1347 ARROW_DEPRECATED("Use Result-returning version") 1348 static Status FromArrays(const std::shared_ptr<DataType>& type, 1349 const std::shared_ptr<Array>& indices, 1350 const std::shared_ptr<Array>& dictionary, 1351 std::shared_ptr<Array>* out); 1352 1353 /// \brief Transpose this DictionaryArray 1354 /// 1355 /// This method constructs a new dictionary array with the given dictionary type, 1356 /// transposing indices using the transpose map. 1357 /// The type and the transpose map are typically computed using 1358 /// DictionaryUnifier. 1359 /// 1360 /// \param[in] type the new type object 1361 /// \param[in] dictionary the new dictionary 1362 /// \param[in] transpose_map transposition array of this array's indices 1363 /// into the target array's indices 1364 /// \param[in] pool a pool to allocate the array data from 1365 Result<std::shared_ptr<Array>> Transpose( 1366 const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary, 1367 const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const; 1368 1369 ARROW_DEPRECATED("Use Result-returning version") 1370 Status Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type, 1371 const std::shared_ptr<Array>& dictionary, const int32_t* transpose_map, 1372 std::shared_ptr<Array>* out) const; 1373 1374 /// \brief Determine whether dictionary arrays may be compared without unification 1375 bool CanCompareIndices(const DictionaryArray& other) const; 1376 1377 /// \brief Return the dictionary for this array, which is stored as 1378 /// a member of the ArrayData internal structure 1379 std::shared_ptr<Array> dictionary() const; 1380 std::shared_ptr<Array> indices() const; 1381 1382 /// \brief Return the ith value of indices, cast to int64_t 1383 int64_t GetValueIndex(int64_t i) const; 1384 dict_type()1385 const DictionaryType* dict_type() const { return dict_type_; } 1386 1387 private: 1388 void SetData(const std::shared_ptr<ArrayData>& data); 1389 const DictionaryType* dict_type_; 1390 std::shared_ptr<Array> indices_; 1391 }; 1392 1393 } // namespace arrow 1394