1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "arrow/type.h"
19
20 #include <algorithm>
21 #include <climits>
22 #include <cstddef>
23 #include <ostream>
24 #include <sstream> // IWYU pragma: keep
25 #include <string>
26 #include <unordered_map>
27 #include <unordered_set>
28 #include <utility>
29 #include <vector>
30
31 #include "arrow/array.h"
32 #include "arrow/compare.h"
33 #include "arrow/record_batch.h"
34 #include "arrow/result.h"
35 #include "arrow/status.h"
36 #include "arrow/table.h"
37 #include "arrow/util/checked_cast.h"
38 #include "arrow/util/hash_util.h"
39 #include "arrow/util/hashing.h"
40 #include "arrow/util/key_value_metadata.h"
41 #include "arrow/util/logging.h"
42 #include "arrow/util/make_unique.h"
43 #include "arrow/util/vector.h"
44 #include "arrow/visitor_inline.h"
45
46 namespace arrow {
47
48 constexpr Type::type NullType::type_id;
49 constexpr Type::type ListType::type_id;
50 constexpr Type::type LargeListType::type_id;
51
52 constexpr Type::type MapType::type_id;
53
54 constexpr Type::type FixedSizeListType::type_id;
55
56 constexpr Type::type BinaryType::type_id;
57
58 constexpr Type::type LargeBinaryType::type_id;
59
60 constexpr Type::type StringType::type_id;
61
62 constexpr Type::type LargeStringType::type_id;
63
64 constexpr Type::type FixedSizeBinaryType::type_id;
65
66 constexpr Type::type StructType::type_id;
67
68 constexpr Type::type Decimal128Type::type_id;
69
70 constexpr Type::type UnionType::type_id;
71
72 constexpr Type::type Date32Type::type_id;
73
74 constexpr Type::type Date64Type::type_id;
75
76 constexpr Type::type Time32Type::type_id;
77
78 constexpr Type::type Time64Type::type_id;
79
80 constexpr Type::type TimestampType::type_id;
81
82 constexpr Type::type MonthIntervalType::type_id;
83
84 constexpr Type::type DayTimeIntervalType::type_id;
85
86 constexpr Type::type DurationType::type_id;
87
88 constexpr Type::type DictionaryType::type_id;
89
90 namespace internal {
91
ToString(Type::type id)92 std::string ToString(Type::type id) {
93 switch (id) {
94 case Type::NA:
95 return "NA";
96 case Type::BOOL:
97 return "BOOL";
98 case Type::UINT8:
99 return "UINT8";
100 case Type::INT8:
101 return "INT8";
102 case Type::UINT16:
103 return "UINT16";
104 case Type::INT16:
105 return "INT16";
106 case Type::UINT32:
107 return "UINT32";
108 case Type::INT32:
109 return "INT32";
110 case Type::UINT64:
111 return "UINT64";
112 case Type::INT64:
113 return "INT64";
114 case Type::HALF_FLOAT:
115 return "HALF_FLOAT";
116 case Type::FLOAT:
117 return "FLOAT";
118 case Type::DOUBLE:
119 return "DOUBLE";
120 case Type::STRING:
121 return "UTF8";
122 case Type::BINARY:
123 return "BINARY";
124 case Type::FIXED_SIZE_BINARY:
125 return "FIXED_SIZE_BINARY";
126 case Type::DATE64:
127 return "DATE64";
128 case Type::TIMESTAMP:
129 return "TIMESTAMP";
130 case Type::TIME32:
131 return "TIME32";
132 case Type::TIME64:
133 return "TIME64";
134 case Type::INTERVAL_MONTHS:
135 return "INTERVAL_MONTHS";
136 case Type::INTERVAL_DAY_TIME:
137 return "INTERVAL_DAY_TIME";
138 case Type::DECIMAL:
139 return "DECIMAL";
140 case Type::LIST:
141 return "LIST";
142 case Type::STRUCT:
143 return "STRUCT";
144 case Type::UNION:
145 return "UNION";
146 case Type::DICTIONARY:
147 return "DICTIONARY";
148 case Type::MAP:
149 return "MAP";
150 case Type::EXTENSION:
151 return "EXTENSION";
152 case Type::FIXED_SIZE_LIST:
153 return "FIXED_SIZE_LIST";
154 case Type::DURATION:
155 return "DURATION";
156 case Type::LARGE_BINARY:
157 return "LARGE_BINARY";
158 case Type::LARGE_LIST:
159 return "LARGE_LIST";
160 default:
161 DCHECK(false) << "Should not be able to reach here";
162 return "unknown";
163 }
164 }
165
ToString(TimeUnit::type unit)166 std::string ToString(TimeUnit::type unit) {
167 switch (unit) {
168 case TimeUnit::SECOND:
169 return "s";
170 case TimeUnit::MILLI:
171 return "ms";
172 case TimeUnit::MICRO:
173 return "us";
174 case TimeUnit::NANO:
175 return "ns";
176 default:
177 DCHECK(false);
178 return "";
179 }
180 }
181
182 } // namespace internal
183
184 namespace {
185
186 using internal::checked_cast;
187
188 // Merges `existing` and `other` if one of them is of NullType, otherwise
189 // returns nullptr.
190 // - if `other` if of NullType or is nullable, the unified field will be nullable.
191 // - if `existing` is of NullType but other is not, the unified field will
192 // have `other`'s type and will be nullable
MaybePromoteNullTypes(const Field & existing,const Field & other)193 std::shared_ptr<Field> MaybePromoteNullTypes(const Field& existing, const Field& other) {
194 if (existing.type()->id() != Type::NA && other.type()->id() != Type::NA) {
195 return nullptr;
196 }
197 if (existing.type()->id() == Type::NA) {
198 return other.WithNullable(true)->WithMetadata(existing.metadata());
199 }
200 // `other` must be null.
201 return existing.WithNullable(true);
202 }
203 } // namespace
204
~Field()205 Field::~Field() {}
206
HasMetadata() const207 bool Field::HasMetadata() const {
208 return (metadata_ != nullptr) && (metadata_->size() > 0);
209 }
210
WithMetadata(const std::shared_ptr<const KeyValueMetadata> & metadata) const211 std::shared_ptr<Field> Field::WithMetadata(
212 const std::shared_ptr<const KeyValueMetadata>& metadata) const {
213 return std::make_shared<Field>(name_, type_, nullable_, metadata);
214 }
215
WithMergedMetadata(const std::shared_ptr<const KeyValueMetadata> & metadata) const216 std::shared_ptr<Field> Field::WithMergedMetadata(
217 const std::shared_ptr<const KeyValueMetadata>& metadata) const {
218 std::shared_ptr<const KeyValueMetadata> merged_metadata;
219 if (metadata_) {
220 merged_metadata = metadata_->Merge(*metadata);
221 } else {
222 merged_metadata = metadata;
223 }
224 return std::make_shared<Field>(name_, type_, nullable_, merged_metadata);
225 }
226
RemoveMetadata() const227 std::shared_ptr<Field> Field::RemoveMetadata() const {
228 return std::make_shared<Field>(name_, type_, nullable_);
229 }
230
WithType(const std::shared_ptr<DataType> & type) const231 std::shared_ptr<Field> Field::WithType(const std::shared_ptr<DataType>& type) const {
232 return std::make_shared<Field>(name_, type, nullable_, metadata_);
233 }
234
WithName(const std::string & name) const235 std::shared_ptr<Field> Field::WithName(const std::string& name) const {
236 return std::make_shared<Field>(name, type_, nullable_, metadata_);
237 }
238
WithNullable(const bool nullable) const239 std::shared_ptr<Field> Field::WithNullable(const bool nullable) const {
240 return std::make_shared<Field>(name_, type_, nullable, metadata_);
241 }
242
MergeWith(const Field & other,MergeOptions options) const243 Result<std::shared_ptr<Field>> Field::MergeWith(const Field& other,
244 MergeOptions options) const {
245 if (name() != other.name()) {
246 return Status::Invalid("Field ", name(), " doesn't have the same name as ",
247 other.name());
248 }
249
250 if (Equals(other, /*check_metadata=*/false)) {
251 return Copy();
252 }
253
254 if (options.promote_nullability) {
255 if (type()->Equals(other.type())) {
256 return Copy()->WithNullable(nullable() || other.nullable());
257 }
258 std::shared_ptr<Field> promoted = MaybePromoteNullTypes(*this, other);
259 if (promoted) return promoted;
260 }
261
262 return Status::Invalid("Unable to merge: Field ", name(),
263 " has incompatible types: ", type()->ToString(), " vs ",
264 other.type()->ToString());
265 }
266
MergeWith(const std::shared_ptr<Field> & other,MergeOptions options) const267 Result<std::shared_ptr<Field>> Field::MergeWith(const std::shared_ptr<Field>& other,
268 MergeOptions options) const {
269 DCHECK_NE(other, nullptr);
270 return MergeWith(*other, options);
271 }
272
Flatten() const273 std::vector<std::shared_ptr<Field>> Field::Flatten() const {
274 std::vector<std::shared_ptr<Field>> flattened;
275 if (type_->id() == Type::STRUCT) {
276 for (const auto& child : type_->fields()) {
277 auto flattened_child = child->Copy();
278 flattened.push_back(flattened_child);
279 flattened_child->name_.insert(0, name() + ".");
280 flattened_child->nullable_ |= nullable_;
281 }
282 } else {
283 flattened.push_back(this->Copy());
284 }
285 return flattened;
286 }
287
Copy() const288 std::shared_ptr<Field> Field::Copy() const {
289 return ::arrow::field(name_, type_, nullable_, metadata_);
290 }
291
Equals(const Field & other,bool check_metadata) const292 bool Field::Equals(const Field& other, bool check_metadata) const {
293 if (this == &other) {
294 return true;
295 }
296 if (this->name_ == other.name_ && this->nullable_ == other.nullable_ &&
297 this->type_->Equals(*other.type_.get(), check_metadata)) {
298 if (!check_metadata) {
299 return true;
300 } else if (this->HasMetadata() && other.HasMetadata()) {
301 return metadata_->Equals(*other.metadata_);
302 } else if (!this->HasMetadata() && !other.HasMetadata()) {
303 return true;
304 } else {
305 return false;
306 }
307 }
308 return false;
309 }
310
Equals(const std::shared_ptr<Field> & other,bool check_metadata) const311 bool Field::Equals(const std::shared_ptr<Field>& other, bool check_metadata) const {
312 return Equals(*other.get(), check_metadata);
313 }
314
IsCompatibleWith(const Field & other) const315 bool Field::IsCompatibleWith(const Field& other) const { return MergeWith(other).ok(); }
316
IsCompatibleWith(const std::shared_ptr<Field> & other) const317 bool Field::IsCompatibleWith(const std::shared_ptr<Field>& other) const {
318 DCHECK_NE(other, nullptr);
319 return IsCompatibleWith(*other);
320 }
321
ToString(bool show_metadata) const322 std::string Field::ToString(bool show_metadata) const {
323 std::stringstream ss;
324 ss << name_ << ": " << type_->ToString();
325 if (!nullable_) {
326 ss << " not null";
327 }
328 if (show_metadata && metadata_) {
329 ss << metadata_->ToString();
330 }
331 return ss.str();
332 }
333
~DataType()334 DataType::~DataType() {}
335
Equals(const DataType & other,bool check_metadata) const336 bool DataType::Equals(const DataType& other, bool check_metadata) const {
337 return TypeEquals(*this, other, check_metadata);
338 }
339
Equals(const std::shared_ptr<DataType> & other) const340 bool DataType::Equals(const std::shared_ptr<DataType>& other) const {
341 if (!other) {
342 return false;
343 }
344 return Equals(*other.get());
345 }
346
Hash() const347 size_t DataType::Hash() const {
348 static constexpr size_t kHashSeed = 0;
349 size_t result = kHashSeed;
350 internal::hash_combine(result, this->ComputeFingerprint());
351 return result;
352 }
353
operator <<(std::ostream & os,const DataType & type)354 std::ostream& operator<<(std::ostream& os, const DataType& type) {
355 os << type.ToString();
356 return os;
357 }
358
precision() const359 FloatingPointType::Precision HalfFloatType::precision() const {
360 return FloatingPointType::HALF;
361 }
362
precision() const363 FloatingPointType::Precision FloatType::precision() const {
364 return FloatingPointType::SINGLE;
365 }
366
precision() const367 FloatingPointType::Precision DoubleType::precision() const {
368 return FloatingPointType::DOUBLE;
369 }
370
ToString() const371 std::string ListType::ToString() const {
372 std::stringstream s;
373 s << "list<" << value_field()->ToString() << ">";
374 return s.str();
375 }
376
ToString() const377 std::string LargeListType::ToString() const {
378 std::stringstream s;
379 s << "large_list<" << value_field()->ToString() << ">";
380 return s.str();
381 }
382
MapType(const std::shared_ptr<DataType> & key_type,const std::shared_ptr<DataType> & item_type,bool keys_sorted)383 MapType::MapType(const std::shared_ptr<DataType>& key_type,
384 const std::shared_ptr<DataType>& item_type, bool keys_sorted)
385 : MapType(key_type, ::arrow::field("value", item_type), keys_sorted) {}
386
MapType(const std::shared_ptr<DataType> & key_type,const std::shared_ptr<Field> & item_field,bool keys_sorted)387 MapType::MapType(const std::shared_ptr<DataType>& key_type,
388 const std::shared_ptr<Field>& item_field, bool keys_sorted)
389 : ListType(::arrow::field(
390 "entries",
391 struct_({std::make_shared<Field>("key", key_type, false), item_field}), false)),
392 keys_sorted_(keys_sorted) {
393 id_ = type_id;
394 }
395
ToString() const396 std::string MapType::ToString() const {
397 std::stringstream s;
398 s << "map<" << key_type()->ToString() << ", " << item_type()->ToString();
399 if (keys_sorted_) {
400 s << ", keys_sorted";
401 }
402 s << ">";
403 return s.str();
404 }
405
ToString() const406 std::string FixedSizeListType::ToString() const {
407 std::stringstream s;
408 s << "fixed_size_list<" << value_field()->ToString() << ">[" << list_size_ << "]";
409 return s.str();
410 }
411
ToString() const412 std::string BinaryType::ToString() const { return "binary"; }
413
ToString() const414 std::string LargeBinaryType::ToString() const { return "large_binary"; }
415
ToString() const416 std::string StringType::ToString() const { return "string"; }
417
ToString() const418 std::string LargeStringType::ToString() const { return "large_string"; }
419
bit_width() const420 int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); }
421
ToString() const422 std::string FixedSizeBinaryType::ToString() const {
423 std::stringstream ss;
424 ss << "fixed_size_binary[" << byte_width_ << "]";
425 return ss.str();
426 }
427
428 // ----------------------------------------------------------------------
429 // Date types
430
DateType(Type::type type_id)431 DateType::DateType(Type::type type_id) : TemporalType(type_id) {}
432
Date32Type()433 Date32Type::Date32Type() : DateType(Type::DATE32) {}
434
Date64Type()435 Date64Type::Date64Type() : DateType(Type::DATE64) {}
436
ToString() const437 std::string Date64Type::ToString() const { return std::string("date64[ms]"); }
438
ToString() const439 std::string Date32Type::ToString() const { return std::string("date32[day]"); }
440
441 // ----------------------------------------------------------------------
442 // Time types
443
TimeType(Type::type type_id,TimeUnit::type unit)444 TimeType::TimeType(Type::type type_id, TimeUnit::type unit)
445 : TemporalType(type_id), unit_(unit) {}
446
Time32Type(TimeUnit::type unit)447 Time32Type::Time32Type(TimeUnit::type unit) : TimeType(Type::TIME32, unit) {
448 ARROW_CHECK(unit == TimeUnit::SECOND || unit == TimeUnit::MILLI)
449 << "Must be seconds or milliseconds";
450 }
451
ToString() const452 std::string Time32Type::ToString() const {
453 std::stringstream ss;
454 ss << "time32[" << this->unit_ << "]";
455 return ss.str();
456 }
457
Time64Type(TimeUnit::type unit)458 Time64Type::Time64Type(TimeUnit::type unit) : TimeType(Type::TIME64, unit) {
459 ARROW_CHECK(unit == TimeUnit::MICRO || unit == TimeUnit::NANO)
460 << "Must be microseconds or nanoseconds";
461 }
462
ToString() const463 std::string Time64Type::ToString() const {
464 std::stringstream ss;
465 ss << "time64[" << this->unit_ << "]";
466 return ss.str();
467 }
468
operator <<(std::ostream & os,TimeUnit::type unit)469 std::ostream& operator<<(std::ostream& os, TimeUnit::type unit) {
470 switch (unit) {
471 case TimeUnit::SECOND:
472 os << "s";
473 break;
474 case TimeUnit::MILLI:
475 os << "ms";
476 break;
477 case TimeUnit::MICRO:
478 os << "us";
479 break;
480 case TimeUnit::NANO:
481 os << "ns";
482 break;
483 }
484 return os;
485 }
486
487 // ----------------------------------------------------------------------
488 // Timestamp types
489
ToString() const490 std::string TimestampType::ToString() const {
491 std::stringstream ss;
492 ss << "timestamp[" << this->unit_;
493 if (this->timezone_.size() > 0) {
494 ss << ", tz=" << this->timezone_;
495 }
496 ss << "]";
497 return ss.str();
498 }
499
500 // Duration types
ToString() const501 std::string DurationType::ToString() const {
502 std::stringstream ss;
503 ss << "duration[" << this->unit_ << "]";
504 return ss.str();
505 }
506
507 // ----------------------------------------------------------------------
508 // Union type
509
510 constexpr int8_t UnionType::kMaxTypeCode;
511 constexpr int UnionType::kInvalidChildId;
512
UnionType(const std::vector<std::shared_ptr<Field>> & fields,const std::vector<int8_t> & type_codes,UnionMode::type mode)513 UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields,
514 const std::vector<int8_t>& type_codes, UnionMode::type mode)
515 : NestedType(Type::UNION),
516 mode_(mode),
517 type_codes_(type_codes),
518 child_ids_(kMaxTypeCode + 1, kInvalidChildId) {
519 DCHECK_OK(ValidateParameters(fields, type_codes, mode));
520 children_ = fields;
521 for (int child_id = 0; child_id < static_cast<int>(type_codes_.size()); ++child_id) {
522 const auto type_code = type_codes_[child_id];
523 child_ids_[type_code] = child_id;
524 }
525 }
526
Make(const std::vector<std::shared_ptr<Field>> & fields,const std::vector<int8_t> & type_codes,UnionMode::type mode)527 Result<std::shared_ptr<DataType>> UnionType::Make(
528 const std::vector<std::shared_ptr<Field>>& fields,
529 const std::vector<int8_t>& type_codes, UnionMode::type mode) {
530 RETURN_NOT_OK(ValidateParameters(fields, type_codes, mode));
531 return std::make_shared<UnionType>(fields, type_codes, mode);
532 }
533
ValidateParameters(const std::vector<std::shared_ptr<Field>> & fields,const std::vector<int8_t> & type_codes,UnionMode::type mode)534 Status UnionType::ValidateParameters(const std::vector<std::shared_ptr<Field>>& fields,
535 const std::vector<int8_t>& type_codes,
536 UnionMode::type mode) {
537 if (fields.size() != type_codes.size()) {
538 return Status::Invalid("Union should get the same number of fields as type codes");
539 }
540 for (const auto type_code : type_codes) {
541 if (type_code < 0 || type_code > kMaxTypeCode) {
542 return Status::Invalid("Union type code out of bounds");
543 }
544 }
545 return Status::OK();
546 }
547
layout() const548 DataTypeLayout UnionType::layout() const {
549 if (mode_ == UnionMode::SPARSE) {
550 return DataTypeLayout({DataTypeLayout::Bitmap(),
551 DataTypeLayout::FixedWidth(sizeof(uint8_t)),
552 DataTypeLayout::AlwaysNull()});
553 } else {
554 return DataTypeLayout({DataTypeLayout::Bitmap(),
555 DataTypeLayout::FixedWidth(sizeof(uint8_t)),
556 DataTypeLayout::FixedWidth(sizeof(int32_t))});
557 }
558 }
559
max_type_code() const560 uint8_t UnionType::max_type_code() const {
561 return type_codes_.size() == 0
562 ? 0
563 : *std::max_element(type_codes_.begin(), type_codes_.end());
564 }
565
ToString() const566 std::string UnionType::ToString() const {
567 std::stringstream s;
568
569 if (mode_ == UnionMode::SPARSE) {
570 s << "union[sparse]<";
571 } else {
572 s << "union[dense]<";
573 }
574
575 for (size_t i = 0; i < children_.size(); ++i) {
576 if (i) {
577 s << ", ";
578 }
579 s << children_[i]->ToString() << "=" << static_cast<int>(type_codes_[i]);
580 }
581 s << ">";
582 return s.str();
583 }
584
585 // ----------------------------------------------------------------------
586 // Struct type
587
588 namespace {
589
CreateNameToIndexMap(const std::vector<std::shared_ptr<Field>> & fields)590 std::unordered_multimap<std::string, int> CreateNameToIndexMap(
591 const std::vector<std::shared_ptr<Field>>& fields) {
592 std::unordered_multimap<std::string, int> name_to_index;
593 for (size_t i = 0; i < fields.size(); ++i) {
594 name_to_index.emplace(fields[i]->name(), static_cast<int>(i));
595 }
596 return name_to_index;
597 }
598
599 template <int NotFoundValue = -1, int DuplicateFoundValue = -1>
LookupNameIndex(const std::unordered_multimap<std::string,int> & name_to_index,const std::string & name)600 int LookupNameIndex(const std::unordered_multimap<std::string, int>& name_to_index,
601 const std::string& name) {
602 auto p = name_to_index.equal_range(name);
603 auto it = p.first;
604 if (it == p.second) {
605 // Not found
606 return NotFoundValue;
607 }
608 auto index = it->second;
609 if (++it != p.second) {
610 // Duplicate field name
611 return DuplicateFoundValue;
612 }
613 return index;
614 }
615
616 } // namespace
617
618 class StructType::Impl {
619 public:
Impl(const std::vector<std::shared_ptr<Field>> & fields)620 explicit Impl(const std::vector<std::shared_ptr<Field>>& fields)
621 : name_to_index_(CreateNameToIndexMap(fields)) {}
622
623 const std::unordered_multimap<std::string, int> name_to_index_;
624 };
625
StructType(const std::vector<std::shared_ptr<Field>> & fields)626 StructType::StructType(const std::vector<std::shared_ptr<Field>>& fields)
627 : NestedType(Type::STRUCT), impl_(new Impl(fields)) {
628 children_ = fields;
629 }
630
~StructType()631 StructType::~StructType() {}
632
ToString() const633 std::string StructType::ToString() const {
634 std::stringstream s;
635 s << "struct<";
636 for (int i = 0; i < this->num_fields(); ++i) {
637 if (i > 0) {
638 s << ", ";
639 }
640 std::shared_ptr<Field> field = this->field(i);
641 s << field->ToString();
642 }
643 s << ">";
644 return s.str();
645 }
646
GetFieldByName(const std::string & name) const647 std::shared_ptr<Field> StructType::GetFieldByName(const std::string& name) const {
648 int i = GetFieldIndex(name);
649 return i == -1 ? nullptr : children_[i];
650 }
651
GetFieldIndex(const std::string & name) const652 int StructType::GetFieldIndex(const std::string& name) const {
653 return LookupNameIndex(impl_->name_to_index_, name);
654 }
655
GetAllFieldIndices(const std::string & name) const656 std::vector<int> StructType::GetAllFieldIndices(const std::string& name) const {
657 std::vector<int> result;
658 auto p = impl_->name_to_index_.equal_range(name);
659 for (auto it = p.first; it != p.second; ++it) {
660 result.push_back(it->second);
661 }
662 if (result.size() > 1) {
663 std::sort(result.begin(), result.end());
664 }
665 return result;
666 }
667
GetAllFieldsByName(const std::string & name) const668 std::vector<std::shared_ptr<Field>> StructType::GetAllFieldsByName(
669 const std::string& name) const {
670 std::vector<std::shared_ptr<Field>> result;
671 auto p = impl_->name_to_index_.equal_range(name);
672 for (auto it = p.first; it != p.second; ++it) {
673 result.push_back(children_[it->second]);
674 }
675 return result;
676 }
677
678 // ----------------------------------------------------------------------
679 // Decimal128 type
680
Decimal128Type(int32_t precision,int32_t scale)681 Decimal128Type::Decimal128Type(int32_t precision, int32_t scale)
682 : DecimalType(16, precision, scale) {
683 ARROW_CHECK_GE(precision, kMinPrecision);
684 ARROW_CHECK_LE(precision, kMaxPrecision);
685 }
686
Make(int32_t precision,int32_t scale)687 Result<std::shared_ptr<DataType>> Decimal128Type::Make(int32_t precision, int32_t scale) {
688 if (precision < kMinPrecision || precision > kMaxPrecision) {
689 return Status::Invalid("Decimal precision out of range: ", precision);
690 }
691 return std::make_shared<Decimal128Type>(precision, scale);
692 }
693
Make(int32_t precision,int32_t scale,std::shared_ptr<DataType> * out)694 Status Decimal128Type::Make(int32_t precision, int32_t scale,
695 std::shared_ptr<DataType>* out) {
696 return Make(precision, scale).Value(out);
697 }
698
699 // ----------------------------------------------------------------------
700 // Dictionary-encoded type
701
ValidateParameters(const DataType & index_type,const DataType & value_type)702 Status DictionaryType::ValidateParameters(const DataType& index_type,
703 const DataType& value_type) {
704 const bool index_type_ok = is_integer(index_type.id()) &&
705 checked_cast<const IntegerType&>(index_type).is_signed();
706 if (!index_type_ok) {
707 return Status::TypeError("Dictionary index type should be signed integer, got ",
708 index_type.ToString());
709 }
710 return Status::OK();
711 }
712
bit_width() const713 int DictionaryType::bit_width() const {
714 return checked_cast<const FixedWidthType&>(*index_type_).bit_width();
715 }
716
Make(const std::shared_ptr<DataType> & index_type,const std::shared_ptr<DataType> & value_type,bool ordered)717 Result<std::shared_ptr<DataType>> DictionaryType::Make(
718 const std::shared_ptr<DataType>& index_type,
719 const std::shared_ptr<DataType>& value_type, bool ordered) {
720 RETURN_NOT_OK(ValidateParameters(*index_type, *value_type));
721 return std::make_shared<DictionaryType>(index_type, value_type, ordered);
722 }
723
DictionaryType(const std::shared_ptr<DataType> & index_type,const std::shared_ptr<DataType> & value_type,bool ordered)724 DictionaryType::DictionaryType(const std::shared_ptr<DataType>& index_type,
725 const std::shared_ptr<DataType>& value_type, bool ordered)
726 : FixedWidthType(Type::DICTIONARY),
727 index_type_(index_type),
728 value_type_(value_type),
729 ordered_(ordered) {
730 ARROW_CHECK_OK(ValidateParameters(*index_type_, *value_type_));
731 }
732
layout() const733 DataTypeLayout DictionaryType::layout() const {
734 auto layout = index_type_->layout();
735 layout.has_dictionary = true;
736 return layout;
737 }
738
ToString() const739 std::string DictionaryType::ToString() const {
740 std::stringstream ss;
741 ss << this->name() << "<values=" << value_type_->ToString()
742 << ", indices=" << index_type_->ToString() << ", ordered=" << ordered_ << ">";
743 return ss.str();
744 }
745
746 // ----------------------------------------------------------------------
747 // Null type
748
ToString() const749 std::string NullType::ToString() const { return name(); }
750
751 // ----------------------------------------------------------------------
752 // FieldRef
753
hash() const754 size_t FieldPath::hash() const {
755 return internal::ComputeStringHash<0>(indices().data(), indices().size() * sizeof(int));
756 }
757
ToString() const758 std::string FieldPath::ToString() const {
759 std::string repr = "FieldPath(";
760 for (auto index : this->indices()) {
761 repr += std::to_string(index) + " ";
762 }
763 repr.resize(repr.size() - 1);
764 repr += ")";
765 return repr;
766 }
767
768 struct FieldPathGetImpl {
GetTypearrow::FieldPathGetImpl769 static const DataType& GetType(const ArrayData& data) { return *data.type; }
770
GetTypearrow::FieldPathGetImpl771 static const DataType& GetType(const ChunkedArray& array) { return *array.type(); }
772
Summarizearrow::FieldPathGetImpl773 static void Summarize(const FieldVector& fields, std::stringstream* ss) {
774 *ss << "{ ";
775 for (const auto& field : fields) {
776 *ss << field->ToString() << ", ";
777 }
778 *ss << "}";
779 }
780
781 template <typename T>
Summarizearrow::FieldPathGetImpl782 static void Summarize(const std::vector<T>& columns, std::stringstream* ss) {
783 *ss << "{ ";
784 for (const auto& column : columns) {
785 *ss << GetType(*column) << ", ";
786 }
787 *ss << "}";
788 }
789
790 template <typename T>
IndexErrorarrow::FieldPathGetImpl791 static Status IndexError(const FieldPath* path, int out_of_range_depth,
792 const std::vector<T>& children) {
793 std::stringstream ss;
794 ss << "index out of range. ";
795
796 ss << "indices=[ ";
797 int depth = 0;
798 for (int i : path->indices()) {
799 if (depth != out_of_range_depth) {
800 ss << i << " ";
801 continue;
802 }
803 ss << ">" << i << "< ";
804 ++depth;
805 }
806 ss << "] ";
807
808 if (std::is_same<T, std::shared_ptr<Field>>::value) {
809 ss << "fields were: ";
810 } else {
811 ss << "columns had types: ";
812 }
813 Summarize(children, &ss);
814
815 return Status::IndexError(ss.str());
816 }
817
818 template <typename T, typename GetChildren>
Getarrow::FieldPathGetImpl819 static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
820 GetChildren&& get_children, int* out_of_range_depth) {
821 if (path->indices().empty()) {
822 return Status::Invalid("empty indices cannot be traversed");
823 }
824
825 int depth = 0;
826 const T* out;
827 for (int index : path->indices()) {
828 if (index < 0 || static_cast<size_t>(index) >= children->size()) {
829 *out_of_range_depth = depth;
830 return nullptr;
831 }
832
833 out = &children->at(index);
834 children = get_children(*out);
835 ++depth;
836 }
837
838 return *out;
839 }
840
841 template <typename T, typename GetChildren>
Getarrow::FieldPathGetImpl842 static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
843 GetChildren&& get_children) {
844 int out_of_range_depth = -1;
845 ARROW_ASSIGN_OR_RAISE(auto child,
846 Get(path, children, std::forward<GetChildren>(get_children),
847 &out_of_range_depth));
848 if (child != nullptr) {
849 return std::move(child);
850 }
851 return IndexError(path, out_of_range_depth, *children);
852 }
853
Getarrow::FieldPathGetImpl854 static Result<std::shared_ptr<Field>> Get(const FieldPath* path,
855 const FieldVector& fields) {
856 return FieldPathGetImpl::Get(path, &fields, [](const std::shared_ptr<Field>& field) {
857 return &field->type()->fields();
858 });
859 }
860
Getarrow::FieldPathGetImpl861 static Result<std::shared_ptr<ArrayData>> Get(const FieldPath* path,
862 const ArrayDataVector& child_data) {
863 return FieldPathGetImpl::Get(
864 path, &child_data,
865 [](const std::shared_ptr<ArrayData>& data) { return &data->child_data; });
866 }
867
Getarrow::FieldPathGetImpl868 static Result<std::shared_ptr<ChunkedArray>> Get(
869 const FieldPath* path, const ChunkedArrayVector& columns_arg) {
870 ChunkedArrayVector columns = columns_arg;
871
872 return FieldPathGetImpl::Get(
873 path, &columns, [&](const std::shared_ptr<ChunkedArray>& a) {
874 columns.clear();
875
876 for (int i = 0; i < a->type()->num_fields(); ++i) {
877 ArrayVector child_chunks;
878
879 for (const auto& chunk : a->chunks()) {
880 auto child_chunk = MakeArray(chunk->data()->child_data[i]);
881 child_chunks.push_back(std::move(child_chunk));
882 }
883
884 auto child_column = std::make_shared<ChunkedArray>(
885 std::move(child_chunks), a->type()->field(i)->type());
886
887 columns.emplace_back(std::move(child_column));
888 }
889
890 return &columns;
891 });
892 }
893 };
894
Get(const Schema & schema) const895 Result<std::shared_ptr<Field>> FieldPath::Get(const Schema& schema) const {
896 return FieldPathGetImpl::Get(this, schema.fields());
897 }
898
Get(const Field & field) const899 Result<std::shared_ptr<Field>> FieldPath::Get(const Field& field) const {
900 return FieldPathGetImpl::Get(this, field.type()->fields());
901 }
902
Get(const DataType & type) const903 Result<std::shared_ptr<Field>> FieldPath::Get(const DataType& type) const {
904 return FieldPathGetImpl::Get(this, type.fields());
905 }
906
Get(const FieldVector & fields) const907 Result<std::shared_ptr<Field>> FieldPath::Get(const FieldVector& fields) const {
908 return FieldPathGetImpl::Get(this, fields);
909 }
910
Get(const RecordBatch & batch) const911 Result<std::shared_ptr<Array>> FieldPath::Get(const RecordBatch& batch) const {
912 ARROW_ASSIGN_OR_RAISE(auto data, FieldPathGetImpl::Get(this, batch.column_data()));
913 return MakeArray(data);
914 }
915
Get(const Table & table) const916 Result<std::shared_ptr<ChunkedArray>> FieldPath::Get(const Table& table) const {
917 return FieldPathGetImpl::Get(this, table.columns());
918 }
919
FieldRef(FieldPath indices)920 FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {
921 DCHECK_GT(util::get<FieldPath>(impl_).indices().size(), 0);
922 }
923
Flatten(std::vector<FieldRef> children)924 void FieldRef::Flatten(std::vector<FieldRef> children) {
925 // flatten children
926 struct Visitor {
927 void operator()(std::string&& name) { *out++ = FieldRef(std::move(name)); }
928
929 void operator()(FieldPath&& indices) { *out++ = FieldRef(std::move(indices)); }
930
931 void operator()(std::vector<FieldRef>&& children) {
932 for (auto& child : children) {
933 util::visit(*this, std::move(child.impl_));
934 }
935 }
936
937 std::back_insert_iterator<std::vector<FieldRef>> out;
938 };
939
940 std::vector<FieldRef> out;
941 Visitor visitor{std::back_inserter(out)};
942 visitor(std::move(children));
943
944 DCHECK(!out.empty());
945 DCHECK(std::none_of(out.begin(), out.end(),
946 [](const FieldRef& ref) { return ref.IsNested(); }));
947
948 if (out.size() == 1) {
949 impl_ = std::move(out[0].impl_);
950 } else {
951 impl_ = std::move(out);
952 }
953 }
954
FromDotPath(const std::string & dot_path_arg)955 Result<FieldRef> FieldRef::FromDotPath(const std::string& dot_path_arg) {
956 if (dot_path_arg.empty()) {
957 return Status::Invalid("Dot path was empty");
958 }
959
960 std::vector<FieldRef> children;
961
962 util::string_view dot_path = dot_path_arg;
963
964 auto parse_name = [&] {
965 std::string name;
966 for (;;) {
967 auto segment_end = dot_path.find_first_of("\\[.");
968 if (segment_end == util::string_view::npos) {
969 // dot_path doesn't contain any other special characters; consume all
970 name.append(dot_path.begin(), dot_path.end());
971 dot_path = "";
972 break;
973 }
974
975 if (dot_path[segment_end] != '\\') {
976 // segment_end points to a subscript for a new FieldRef
977 name.append(dot_path.begin(), segment_end);
978 dot_path = dot_path.substr(segment_end);
979 break;
980 }
981
982 if (dot_path.size() == segment_end + 1) {
983 // dot_path ends with backslash; consume it all
984 name.append(dot_path.begin(), dot_path.end());
985 dot_path = "";
986 break;
987 }
988
989 // append all characters before backslash, then the character which follows it
990 name.append(dot_path.begin(), segment_end);
991 name.push_back(dot_path[segment_end + 1]);
992 dot_path = dot_path.substr(segment_end + 2);
993 }
994 return name;
995 };
996
997 while (!dot_path.empty()) {
998 auto subscript = dot_path[0];
999 dot_path = dot_path.substr(1);
1000 switch (subscript) {
1001 case '.': {
1002 // next element is a name
1003 children.emplace_back(parse_name());
1004 continue;
1005 }
1006 case '[': {
1007 auto subscript_end = dot_path.find_first_not_of("0123456789");
1008 if (subscript_end == util::string_view::npos || dot_path[subscript_end] != ']') {
1009 return Status::Invalid("Dot path '", dot_path_arg,
1010 "' contained an unterminated index");
1011 }
1012 children.emplace_back(std::atoi(dot_path.data()));
1013 dot_path = dot_path.substr(subscript_end + 1);
1014 continue;
1015 }
1016 default:
1017 return Status::Invalid("Dot path must begin with '[' or '.', got '", dot_path_arg,
1018 "'");
1019 }
1020 }
1021
1022 FieldRef out;
1023 out.Flatten(std::move(children));
1024 return out;
1025 }
1026
hash() const1027 size_t FieldRef::hash() const {
1028 struct Visitor : std::hash<std::string> {
1029 using std::hash<std::string>::operator();
1030
1031 size_t operator()(const FieldPath& path) { return path.hash(); }
1032
1033 size_t operator()(const std::vector<FieldRef>& children) {
1034 size_t hash = 0;
1035
1036 for (const FieldRef& child : children) {
1037 hash ^= child.hash();
1038 }
1039
1040 return hash;
1041 }
1042 };
1043
1044 return util::visit(Visitor{}, impl_);
1045 }
1046
ToString() const1047 std::string FieldRef::ToString() const {
1048 struct Visitor {
1049 std::string operator()(const FieldPath& path) { return path.ToString(); }
1050
1051 std::string operator()(const std::string& name) { return "Name(" + name + ")"; }
1052
1053 std::string operator()(const std::vector<FieldRef>& children) {
1054 std::string repr = "Nested(";
1055 for (const auto& child : children) {
1056 repr += child.ToString() + " ";
1057 }
1058 repr.resize(repr.size() - 1);
1059 repr += ")";
1060 return repr;
1061 }
1062 };
1063
1064 return "FieldRef." + util::visit(Visitor{}, impl_);
1065 }
1066
FindAll(const Schema & schema) const1067 std::vector<FieldPath> FieldRef::FindAll(const Schema& schema) const {
1068 return FindAll(schema.fields());
1069 }
1070
FindAll(const Field & field) const1071 std::vector<FieldPath> FieldRef::FindAll(const Field& field) const {
1072 return FindAll(field.type()->fields());
1073 }
1074
FindAll(const DataType & type) const1075 std::vector<FieldPath> FieldRef::FindAll(const DataType& type) const {
1076 return FindAll(type.fields());
1077 }
1078
FindAll(const FieldVector & fields) const1079 std::vector<FieldPath> FieldRef::FindAll(const FieldVector& fields) const {
1080 struct Visitor {
1081 std::vector<FieldPath> operator()(const FieldPath& path) {
1082 // skip long IndexError construction if path is out of range
1083 int out_of_range_depth;
1084 auto maybe_field = FieldPathGetImpl::Get(
1085 &path, &fields_,
1086 [](const std::shared_ptr<Field>& field) { return &field->type()->fields(); },
1087 &out_of_range_depth);
1088
1089 DCHECK_OK(maybe_field.status());
1090
1091 if (maybe_field.ValueOrDie() != nullptr) {
1092 return {path};
1093 }
1094 return {};
1095 }
1096
1097 std::vector<FieldPath> operator()(const std::string& name) {
1098 std::vector<FieldPath> out;
1099
1100 for (int i = 0; i < static_cast<int>(fields_.size()); ++i) {
1101 if (fields_[i]->name() == name) {
1102 out.push_back({i});
1103 }
1104 }
1105
1106 return out;
1107 }
1108
1109 struct Matches {
1110 // referents[i] is referenced by prefixes[i]
1111 std::vector<FieldPath> prefixes;
1112 FieldVector referents;
1113
1114 Matches(std::vector<FieldPath> matches, const FieldVector& fields) {
1115 for (auto& match : matches) {
1116 Add({}, std::move(match), fields);
1117 }
1118 }
1119
1120 Matches() = default;
1121
1122 size_t size() const { return referents.size(); }
1123
1124 void Add(FieldPath prefix, const FieldPath& match, const FieldVector& fields) {
1125 auto maybe_field = match.Get(fields);
1126 DCHECK_OK(maybe_field.status());
1127
1128 prefix.indices().resize(prefix.indices().size() + match.indices().size());
1129 std::copy(match.indices().begin(), match.indices().end(),
1130 prefix.indices().end() - match.indices().size());
1131 prefixes.push_back(std::move(prefix));
1132 referents.push_back(std::move(maybe_field).ValueOrDie());
1133 }
1134 };
1135
1136 std::vector<FieldPath> operator()(const std::vector<FieldRef>& refs) {
1137 DCHECK_GE(refs.size(), 1);
1138 Matches matches(refs.front().FindAll(fields_), fields_);
1139
1140 for (auto ref_it = refs.begin() + 1; ref_it != refs.end(); ++ref_it) {
1141 Matches next_matches;
1142 for (size_t i = 0; i < matches.size(); ++i) {
1143 const auto& referent = *matches.referents[i];
1144
1145 for (const FieldPath& match : ref_it->FindAll(referent)) {
1146 next_matches.Add(matches.prefixes[i], match, referent.type()->fields());
1147 }
1148 }
1149 matches = std::move(next_matches);
1150 }
1151
1152 return matches.prefixes;
1153 }
1154
1155 const FieldVector& fields_;
1156 };
1157
1158 return util::visit(Visitor{fields}, impl_);
1159 }
1160
PrintTo(const FieldRef & ref,std::ostream * os)1161 void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); }
1162
1163 // ----------------------------------------------------------------------
1164 // Schema implementation
1165
1166 class Schema::Impl {
1167 public:
Impl(std::vector<std::shared_ptr<Field>> fields,std::shared_ptr<const KeyValueMetadata> metadata)1168 Impl(std::vector<std::shared_ptr<Field>> fields,
1169 std::shared_ptr<const KeyValueMetadata> metadata)
1170 : fields_(std::move(fields)),
1171 name_to_index_(CreateNameToIndexMap(fields_)),
1172 metadata_(std::move(metadata)) {}
1173
1174 std::vector<std::shared_ptr<Field>> fields_;
1175 std::unordered_multimap<std::string, int> name_to_index_;
1176 std::shared_ptr<const KeyValueMetadata> metadata_;
1177 };
1178
Schema(std::vector<std::shared_ptr<Field>> fields,std::shared_ptr<const KeyValueMetadata> metadata)1179 Schema::Schema(std::vector<std::shared_ptr<Field>> fields,
1180 std::shared_ptr<const KeyValueMetadata> metadata)
1181 : detail::Fingerprintable(),
1182 impl_(new Impl(std::move(fields), std::move(metadata))) {}
1183
Schema(const Schema & schema)1184 Schema::Schema(const Schema& schema)
1185 : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {}
1186
~Schema()1187 Schema::~Schema() {}
1188
num_fields() const1189 int Schema::num_fields() const { return static_cast<int>(impl_->fields_.size()); }
1190
field(int i) const1191 const std::shared_ptr<Field>& Schema::field(int i) const {
1192 DCHECK_GE(i, 0);
1193 DCHECK_LT(i, num_fields());
1194 return impl_->fields_[i];
1195 }
1196
fields() const1197 const std::vector<std::shared_ptr<Field>>& Schema::fields() const {
1198 return impl_->fields_;
1199 }
1200
Equals(const Schema & other,bool check_metadata) const1201 bool Schema::Equals(const Schema& other, bool check_metadata) const {
1202 if (this == &other) {
1203 return true;
1204 }
1205
1206 // checks field equality
1207 if (num_fields() != other.num_fields()) {
1208 return false;
1209 }
1210
1211 if (check_metadata) {
1212 const auto& metadata_fp = metadata_fingerprint();
1213 const auto& other_metadata_fp = other.metadata_fingerprint();
1214 if (metadata_fp != other_metadata_fp) {
1215 return false;
1216 }
1217 }
1218
1219 // Fast path using fingerprints, if possible
1220 const auto& fp = fingerprint();
1221 const auto& other_fp = other.fingerprint();
1222 if (!fp.empty() && !other_fp.empty()) {
1223 return fp == other_fp;
1224 }
1225
1226 // Fall back on field-by-field comparison
1227 for (int i = 0; i < num_fields(); ++i) {
1228 if (!field(i)->Equals(*other.field(i).get(), check_metadata)) {
1229 return false;
1230 }
1231 }
1232
1233 return true;
1234 }
1235
Equals(const std::shared_ptr<Schema> & other,bool check_metadata) const1236 bool Schema::Equals(const std::shared_ptr<Schema>& other, bool check_metadata) const {
1237 if (other == nullptr) {
1238 return false;
1239 }
1240
1241 return Equals(*other, check_metadata);
1242 }
1243
GetFieldByName(const std::string & name) const1244 std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) const {
1245 int i = GetFieldIndex(name);
1246 return i == -1 ? nullptr : impl_->fields_[i];
1247 }
1248
GetFieldIndex(const std::string & name) const1249 int Schema::GetFieldIndex(const std::string& name) const {
1250 return LookupNameIndex(impl_->name_to_index_, name);
1251 }
1252
GetAllFieldIndices(const std::string & name) const1253 std::vector<int> Schema::GetAllFieldIndices(const std::string& name) const {
1254 std::vector<int> result;
1255 auto p = impl_->name_to_index_.equal_range(name);
1256 for (auto it = p.first; it != p.second; ++it) {
1257 result.push_back(it->second);
1258 }
1259 if (result.size() > 1) {
1260 std::sort(result.begin(), result.end());
1261 }
1262 return result;
1263 }
1264
CanReferenceFieldsByNames(const std::vector<std::string> & names) const1265 Status Schema::CanReferenceFieldsByNames(const std::vector<std::string>& names) const {
1266 for (const auto& name : names) {
1267 if (GetFieldByName(name) == nullptr) {
1268 return Status::Invalid("Field named '", name,
1269 "' not found or not unique in the schema.");
1270 }
1271 }
1272
1273 return Status::OK();
1274 }
1275
GetAllFieldsByName(const std::string & name) const1276 std::vector<std::shared_ptr<Field>> Schema::GetAllFieldsByName(
1277 const std::string& name) const {
1278 std::vector<std::shared_ptr<Field>> result;
1279 auto p = impl_->name_to_index_.equal_range(name);
1280 for (auto it = p.first; it != p.second; ++it) {
1281 result.push_back(impl_->fields_[it->second]);
1282 }
1283 return result;
1284 }
1285
AddField(int i,const std::shared_ptr<Field> & field) const1286 Result<std::shared_ptr<Schema>> Schema::AddField(
1287 int i, const std::shared_ptr<Field>& field) const {
1288 if (i < 0 || i > this->num_fields()) {
1289 return Status::Invalid("Invalid column index to add field.");
1290 }
1291
1292 return std::make_shared<Schema>(internal::AddVectorElement(impl_->fields_, i, field),
1293 impl_->metadata_);
1294 }
1295
SetField(int i,const std::shared_ptr<Field> & field) const1296 Result<std::shared_ptr<Schema>> Schema::SetField(
1297 int i, const std::shared_ptr<Field>& field) const {
1298 if (i < 0 || i > this->num_fields()) {
1299 return Status::Invalid("Invalid column index to add field.");
1300 }
1301
1302 return std::make_shared<Schema>(
1303 internal::ReplaceVectorElement(impl_->fields_, i, field), impl_->metadata_);
1304 }
1305
RemoveField(int i) const1306 Result<std::shared_ptr<Schema>> Schema::RemoveField(int i) const {
1307 if (i < 0 || i >= this->num_fields()) {
1308 return Status::Invalid("Invalid column index to remove field.");
1309 }
1310
1311 return std::make_shared<Schema>(internal::DeleteVectorElement(impl_->fields_, i),
1312 impl_->metadata_);
1313 }
1314
AddField(int i,const std::shared_ptr<Field> & field,std::shared_ptr<Schema> * out) const1315 Status Schema::AddField(int i, const std::shared_ptr<Field>& field,
1316 std::shared_ptr<Schema>* out) const {
1317 return AddField(i, field).Value(out);
1318 }
1319
SetField(int i,const std::shared_ptr<Field> & field,std::shared_ptr<Schema> * out) const1320 Status Schema::SetField(int i, const std::shared_ptr<Field>& field,
1321 std::shared_ptr<Schema>* out) const {
1322 return SetField(i, field).Value(out);
1323 }
1324
RemoveField(int i,std::shared_ptr<Schema> * out) const1325 Status Schema::RemoveField(int i, std::shared_ptr<Schema>* out) const {
1326 return RemoveField(i).Value(out);
1327 }
1328
HasMetadata() const1329 bool Schema::HasMetadata() const {
1330 return (impl_->metadata_ != nullptr) && (impl_->metadata_->size() > 0);
1331 }
1332
HasDistinctFieldNames() const1333 bool Schema::HasDistinctFieldNames() const {
1334 auto fields = field_names();
1335 std::unordered_set<std::string> names{fields.cbegin(), fields.cend()};
1336 return names.size() == fields.size();
1337 }
1338
WithMetadata(const std::shared_ptr<const KeyValueMetadata> & metadata) const1339 std::shared_ptr<Schema> Schema::WithMetadata(
1340 const std::shared_ptr<const KeyValueMetadata>& metadata) const {
1341 return std::make_shared<Schema>(impl_->fields_, metadata);
1342 }
1343
metadata() const1344 std::shared_ptr<const KeyValueMetadata> Schema::metadata() const {
1345 return impl_->metadata_;
1346 }
1347
RemoveMetadata() const1348 std::shared_ptr<Schema> Schema::RemoveMetadata() const {
1349 return std::make_shared<Schema>(impl_->fields_);
1350 }
1351
ToString(bool show_metadata) const1352 std::string Schema::ToString(bool show_metadata) const {
1353 std::stringstream buffer;
1354
1355 int i = 0;
1356 for (const auto& field : impl_->fields_) {
1357 if (i > 0) {
1358 buffer << std::endl;
1359 }
1360 buffer << field->ToString(show_metadata);
1361 ++i;
1362 }
1363
1364 if (show_metadata && HasMetadata()) {
1365 buffer << impl_->metadata_->ToString();
1366 }
1367
1368 return buffer.str();
1369 }
1370
field_names() const1371 std::vector<std::string> Schema::field_names() const {
1372 std::vector<std::string> names;
1373 for (const auto& field : impl_->fields_) {
1374 names.push_back(field->name());
1375 }
1376 return names;
1377 }
1378
1379 class SchemaBuilder::Impl {
1380 public:
1381 friend class SchemaBuilder;
Impl(ConflictPolicy policy,Field::MergeOptions field_merge_options)1382 Impl(ConflictPolicy policy, Field::MergeOptions field_merge_options)
1383 : policy_(policy), field_merge_options_(field_merge_options) {}
1384
Impl(std::vector<std::shared_ptr<Field>> fields,std::shared_ptr<const KeyValueMetadata> metadata,ConflictPolicy conflict_policy,Field::MergeOptions field_merge_options)1385 Impl(std::vector<std::shared_ptr<Field>> fields,
1386 std::shared_ptr<const KeyValueMetadata> metadata, ConflictPolicy conflict_policy,
1387 Field::MergeOptions field_merge_options)
1388 : fields_(std::move(fields)),
1389 name_to_index_(CreateNameToIndexMap(fields_)),
1390 metadata_(std::move(metadata)),
1391 policy_(conflict_policy),
1392 field_merge_options_(field_merge_options) {}
1393
AddField(const std::shared_ptr<Field> & field)1394 Status AddField(const std::shared_ptr<Field>& field) {
1395 DCHECK_NE(field, nullptr);
1396
1397 // Short-circuit, no lookup needed.
1398 if (policy_ == CONFLICT_APPEND) {
1399 return AppendField(field);
1400 }
1401
1402 auto name = field->name();
1403 constexpr int kNotFound = -1;
1404 constexpr int kDuplicateFound = -2;
1405 auto i = LookupNameIndex<kNotFound, kDuplicateFound>(name_to_index_, name);
1406
1407 if (i == kNotFound) {
1408 return AppendField(field);
1409 }
1410
1411 // From this point, there's one or more field in the builder that exists with
1412 // the same name.
1413
1414 if (policy_ == CONFLICT_IGNORE) {
1415 // The ignore policy is more generous when there's duplicate in the builder.
1416 return Status::OK();
1417 } else if (policy_ == CONFLICT_ERROR) {
1418 return Status::Invalid("Duplicate found, policy dictate to treat as an error");
1419 }
1420
1421 if (i == kDuplicateFound) {
1422 // Cannot merge/replace when there's more than one field in the builder
1423 // because we can't decide which to merge/replace.
1424 return Status::Invalid("Cannot merge field ", name,
1425 " more than one field with same name exists");
1426 }
1427
1428 DCHECK_GE(i, 0);
1429
1430 if (policy_ == CONFLICT_REPLACE) {
1431 fields_[i] = field;
1432 } else if (policy_ == CONFLICT_MERGE) {
1433 ARROW_ASSIGN_OR_RAISE(fields_[i], fields_[i]->MergeWith(field));
1434 }
1435
1436 return Status::OK();
1437 }
1438
AppendField(const std::shared_ptr<Field> & field)1439 Status AppendField(const std::shared_ptr<Field>& field) {
1440 name_to_index_.emplace(field->name(), static_cast<int>(fields_.size()));
1441 fields_.push_back(field);
1442 return Status::OK();
1443 }
1444
Reset()1445 void Reset() {
1446 fields_.clear();
1447 name_to_index_.clear();
1448 metadata_.reset();
1449 }
1450
1451 private:
1452 std::vector<std::shared_ptr<Field>> fields_;
1453 std::unordered_multimap<std::string, int> name_to_index_;
1454 std::shared_ptr<const KeyValueMetadata> metadata_;
1455 ConflictPolicy policy_;
1456 Field::MergeOptions field_merge_options_;
1457 };
1458
SchemaBuilder(ConflictPolicy policy,Field::MergeOptions field_merge_options)1459 SchemaBuilder::SchemaBuilder(ConflictPolicy policy,
1460 Field::MergeOptions field_merge_options) {
1461 impl_ = internal::make_unique<Impl>(policy, field_merge_options);
1462 }
1463
SchemaBuilder(std::vector<std::shared_ptr<Field>> fields,ConflictPolicy policy,Field::MergeOptions field_merge_options)1464 SchemaBuilder::SchemaBuilder(std::vector<std::shared_ptr<Field>> fields,
1465 ConflictPolicy policy,
1466 Field::MergeOptions field_merge_options) {
1467 impl_ = internal::make_unique<Impl>(std::move(fields), nullptr, policy,
1468 field_merge_options);
1469 }
1470
SchemaBuilder(const std::shared_ptr<Schema> & schema,ConflictPolicy policy,Field::MergeOptions field_merge_options)1471 SchemaBuilder::SchemaBuilder(const std::shared_ptr<Schema>& schema, ConflictPolicy policy,
1472 Field::MergeOptions field_merge_options) {
1473 std::shared_ptr<const KeyValueMetadata> metadata;
1474 if (schema->HasMetadata()) {
1475 metadata = schema->metadata()->Copy();
1476 }
1477
1478 impl_ = internal::make_unique<Impl>(schema->fields(), std::move(metadata), policy,
1479 field_merge_options);
1480 }
1481
~SchemaBuilder()1482 SchemaBuilder::~SchemaBuilder() {}
1483
policy() const1484 SchemaBuilder::ConflictPolicy SchemaBuilder::policy() const { return impl_->policy_; }
1485
SetPolicy(SchemaBuilder::ConflictPolicy resolution)1486 void SchemaBuilder::SetPolicy(SchemaBuilder::ConflictPolicy resolution) {
1487 impl_->policy_ = resolution;
1488 }
1489
AddField(const std::shared_ptr<Field> & field)1490 Status SchemaBuilder::AddField(const std::shared_ptr<Field>& field) {
1491 return impl_->AddField(field);
1492 }
1493
AddFields(const std::vector<std::shared_ptr<Field>> & fields)1494 Status SchemaBuilder::AddFields(const std::vector<std::shared_ptr<Field>>& fields) {
1495 for (const auto& field : fields) {
1496 RETURN_NOT_OK(AddField(field));
1497 }
1498
1499 return Status::OK();
1500 }
1501
AddSchema(const std::shared_ptr<Schema> & schema)1502 Status SchemaBuilder::AddSchema(const std::shared_ptr<Schema>& schema) {
1503 DCHECK_NE(schema, nullptr);
1504 return AddFields(schema->fields());
1505 }
1506
AddSchemas(const std::vector<std::shared_ptr<Schema>> & schemas)1507 Status SchemaBuilder::AddSchemas(const std::vector<std::shared_ptr<Schema>>& schemas) {
1508 for (const auto& schema : schemas) {
1509 RETURN_NOT_OK(AddSchema(schema));
1510 }
1511
1512 return Status::OK();
1513 }
1514
AddMetadata(const KeyValueMetadata & metadata)1515 Status SchemaBuilder::AddMetadata(const KeyValueMetadata& metadata) {
1516 impl_->metadata_ = metadata.Copy();
1517 return Status::OK();
1518 }
1519
Finish() const1520 Result<std::shared_ptr<Schema>> SchemaBuilder::Finish() const {
1521 return schema(impl_->fields_, impl_->metadata_);
1522 }
1523
Reset()1524 void SchemaBuilder::Reset() { impl_->Reset(); }
1525
Merge(const std::vector<std::shared_ptr<Schema>> & schemas,ConflictPolicy policy)1526 Result<std::shared_ptr<Schema>> SchemaBuilder::Merge(
1527 const std::vector<std::shared_ptr<Schema>>& schemas, ConflictPolicy policy) {
1528 SchemaBuilder builder{policy};
1529 RETURN_NOT_OK(builder.AddSchemas(schemas));
1530 return builder.Finish();
1531 }
1532
AreCompatible(const std::vector<std::shared_ptr<Schema>> & schemas,ConflictPolicy policy)1533 Status SchemaBuilder::AreCompatible(const std::vector<std::shared_ptr<Schema>>& schemas,
1534 ConflictPolicy policy) {
1535 return Merge(schemas, policy).status();
1536 }
1537
schema(std::vector<std::shared_ptr<Field>> fields,std::shared_ptr<const KeyValueMetadata> metadata)1538 std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
1539 std::shared_ptr<const KeyValueMetadata> metadata) {
1540 return std::make_shared<Schema>(std::move(fields), std::move(metadata));
1541 }
1542
UnifySchemas(const std::vector<std::shared_ptr<Schema>> & schemas,const Field::MergeOptions field_merge_options)1543 Result<std::shared_ptr<Schema>> UnifySchemas(
1544 const std::vector<std::shared_ptr<Schema>>& schemas,
1545 const Field::MergeOptions field_merge_options) {
1546 if (schemas.empty()) {
1547 return Status::Invalid("Must provide at least one schema to unify.");
1548 }
1549
1550 if (!schemas[0]->HasDistinctFieldNames()) {
1551 return Status::Invalid("Can't unify schema with duplicate field names.");
1552 }
1553
1554 SchemaBuilder builder{schemas[0], SchemaBuilder::CONFLICT_MERGE, field_merge_options};
1555
1556 for (size_t i = 1; i < schemas.size(); i++) {
1557 const auto& schema = schemas[i];
1558 if (!schema->HasDistinctFieldNames()) {
1559 return Status::Invalid("Can't unify schema with duplicate field names.");
1560 }
1561 RETURN_NOT_OK(builder.AddSchema(schema));
1562 }
1563
1564 return builder.Finish();
1565 }
1566
1567 // ----------------------------------------------------------------------
1568 // Fingerprint computations
1569
1570 namespace detail {
1571
~Fingerprintable()1572 Fingerprintable::~Fingerprintable() {
1573 delete fingerprint_.load();
1574 delete metadata_fingerprint_.load();
1575 }
1576
1577 template <typename ComputeFingerprint>
LoadFingerprint(std::atomic<std::string * > * fingerprint,ComputeFingerprint && compute_fingerprint)1578 static const std::string& LoadFingerprint(std::atomic<std::string*>* fingerprint,
1579 ComputeFingerprint&& compute_fingerprint) {
1580 auto new_p = new std::string(std::forward<ComputeFingerprint>(compute_fingerprint)());
1581 // Since fingerprint() and metadata_fingerprint() return a *reference* to the
1582 // allocated string, the first allocation ever should never be replaced by another
1583 // one. Hence the compare_exchange_strong() against nullptr.
1584 std::string* expected = nullptr;
1585 if (fingerprint->compare_exchange_strong(expected, new_p)) {
1586 return *new_p;
1587 } else {
1588 delete new_p;
1589 DCHECK_NE(expected, nullptr);
1590 return *expected;
1591 }
1592 }
1593
LoadFingerprintSlow() const1594 const std::string& Fingerprintable::LoadFingerprintSlow() const {
1595 return LoadFingerprint(&fingerprint_, [this]() { return ComputeFingerprint(); });
1596 }
1597
LoadMetadataFingerprintSlow() const1598 const std::string& Fingerprintable::LoadMetadataFingerprintSlow() const {
1599 return LoadFingerprint(&metadata_fingerprint_,
1600 [this]() { return ComputeMetadataFingerprint(); });
1601 }
1602
1603 } // namespace detail
1604
TypeIdFingerprint(const DataType & type)1605 static inline std::string TypeIdFingerprint(const DataType& type) {
1606 auto c = static_cast<int>(type.id()) + 'A';
1607 DCHECK_GE(c, 0);
1608 DCHECK_LT(c, 128); // Unlikely to happen any soon
1609 // Prefix with an unusual character in order to disambiguate
1610 std::string s{'@', static_cast<char>(c)};
1611 return s;
1612 }
1613
TimeUnitFingerprint(TimeUnit::type unit)1614 static char TimeUnitFingerprint(TimeUnit::type unit) {
1615 switch (unit) {
1616 case TimeUnit::SECOND:
1617 return 's';
1618 case TimeUnit::MILLI:
1619 return 'm';
1620 case TimeUnit::MICRO:
1621 return 'u';
1622 case TimeUnit::NANO:
1623 return 'n';
1624 default:
1625 DCHECK(false) << "Unexpected TimeUnit";
1626 return '\0';
1627 }
1628 }
1629
IntervalTypeFingerprint(IntervalType::type unit)1630 static char IntervalTypeFingerprint(IntervalType::type unit) {
1631 switch (unit) {
1632 case IntervalType::DAY_TIME:
1633 return 'd';
1634 case IntervalType::MONTHS:
1635 return 'M';
1636 default:
1637 DCHECK(false) << "Unexpected IntervalType::type";
1638 return '\0';
1639 }
1640 }
1641
AppendMetadataFingerprint(const KeyValueMetadata & metadata,std::stringstream * ss)1642 static void AppendMetadataFingerprint(const KeyValueMetadata& metadata,
1643 std::stringstream* ss) {
1644 // Compute metadata fingerprint. KeyValueMetadata is not immutable,
1645 // so we don't cache the result on the metadata instance.
1646 const auto pairs = metadata.sorted_pairs();
1647 if (!pairs.empty()) {
1648 *ss << "!{";
1649 for (const auto& p : pairs) {
1650 const auto& k = p.first;
1651 const auto& v = p.second;
1652 // Since metadata strings can contain arbitrary characters, prefix with
1653 // string length to disambiguate.
1654 *ss << k.length() << ':' << k << ':';
1655 *ss << v.length() << ':' << v << ';';
1656 }
1657 *ss << '}';
1658 }
1659 }
1660
ComputeFingerprint() const1661 std::string Field::ComputeFingerprint() const {
1662 const auto& type_fingerprint = type_->fingerprint();
1663 if (type_fingerprint.empty()) {
1664 // Underlying DataType doesn't support fingerprinting.
1665 return "";
1666 }
1667 std::stringstream ss;
1668 ss << 'F';
1669 if (nullable_) {
1670 ss << 'n';
1671 } else {
1672 ss << 'N';
1673 }
1674 ss << name_;
1675 ss << '{' << type_fingerprint << '}';
1676 return ss.str();
1677 }
1678
ComputeMetadataFingerprint() const1679 std::string Field::ComputeMetadataFingerprint() const {
1680 std::stringstream ss;
1681 if (metadata_) {
1682 AppendMetadataFingerprint(*metadata_, &ss);
1683 }
1684 const auto& type_fingerprint = type_->metadata_fingerprint();
1685 if (!type_fingerprint.empty()) {
1686 ss << "+{" << type_->metadata_fingerprint() << "}";
1687 }
1688 return ss.str();
1689 }
1690
ComputeFingerprint() const1691 std::string Schema::ComputeFingerprint() const {
1692 std::stringstream ss;
1693 ss << "S{";
1694 for (const auto& field : fields()) {
1695 const auto& field_fingerprint = field->fingerprint();
1696 if (field_fingerprint.empty()) {
1697 return "";
1698 }
1699 ss << field_fingerprint << ";";
1700 }
1701 ss << "}";
1702 return ss.str();
1703 }
1704
ComputeMetadataFingerprint() const1705 std::string Schema::ComputeMetadataFingerprint() const {
1706 std::stringstream ss;
1707 if (HasMetadata()) {
1708 AppendMetadataFingerprint(*metadata(), &ss);
1709 }
1710 ss << "S{";
1711 for (const auto& field : fields()) {
1712 const auto& field_fingerprint = field->metadata_fingerprint();
1713 ss << field_fingerprint << ";";
1714 }
1715 ss << "}";
1716 return ss.str();
1717 }
1718
PrintTo(const Schema & s,std::ostream * os)1719 void PrintTo(const Schema& s, std::ostream* os) { *os << s; }
1720
ComputeFingerprint() const1721 std::string DataType::ComputeFingerprint() const {
1722 // Default implementation returns empty string, signalling non-implemented
1723 // functionality.
1724 return "";
1725 }
1726
ComputeMetadataFingerprint() const1727 std::string DataType::ComputeMetadataFingerprint() const {
1728 // Whatever the data type, metadata can only be found on child fields
1729 std::string s;
1730 for (const auto& child : children_) {
1731 s += child->metadata_fingerprint() + ";";
1732 }
1733 return s;
1734 }
1735
1736 #define PARAMETER_LESS_FINGERPRINT(TYPE_CLASS) \
1737 std::string TYPE_CLASS##Type::ComputeFingerprint() const { \
1738 return TypeIdFingerprint(*this); \
1739 }
1740
1741 PARAMETER_LESS_FINGERPRINT(Null)
PARAMETER_LESS_FINGERPRINT(Boolean)1742 PARAMETER_LESS_FINGERPRINT(Boolean)
1743 PARAMETER_LESS_FINGERPRINT(Int8)
1744 PARAMETER_LESS_FINGERPRINT(Int16)
1745 PARAMETER_LESS_FINGERPRINT(Int32)
1746 PARAMETER_LESS_FINGERPRINT(Int64)
1747 PARAMETER_LESS_FINGERPRINT(UInt8)
1748 PARAMETER_LESS_FINGERPRINT(UInt16)
1749 PARAMETER_LESS_FINGERPRINT(UInt32)
1750 PARAMETER_LESS_FINGERPRINT(UInt64)
1751 PARAMETER_LESS_FINGERPRINT(HalfFloat)
1752 PARAMETER_LESS_FINGERPRINT(Float)
1753 PARAMETER_LESS_FINGERPRINT(Double)
1754 PARAMETER_LESS_FINGERPRINT(Binary)
1755 PARAMETER_LESS_FINGERPRINT(LargeBinary)
1756 PARAMETER_LESS_FINGERPRINT(String)
1757 PARAMETER_LESS_FINGERPRINT(LargeString)
1758 PARAMETER_LESS_FINGERPRINT(Date32)
1759 PARAMETER_LESS_FINGERPRINT(Date64)
1760
1761 #undef PARAMETER_LESS_FINGERPRINT
1762
1763 std::string DictionaryType::ComputeFingerprint() const {
1764 const auto& index_fingerprint = index_type_->fingerprint();
1765 const auto& value_fingerprint = value_type_->fingerprint();
1766 std::string ordered_fingerprint = ordered_ ? "1" : "0";
1767
1768 DCHECK(!index_fingerprint.empty()); // it's an integer type
1769 if (!value_fingerprint.empty()) {
1770 return TypeIdFingerprint(*this) + index_fingerprint + value_fingerprint +
1771 ordered_fingerprint;
1772 }
1773 return ordered_fingerprint;
1774 }
1775
ComputeFingerprint() const1776 std::string ListType::ComputeFingerprint() const {
1777 const auto& child_fingerprint = children_[0]->fingerprint();
1778 if (!child_fingerprint.empty()) {
1779 return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}";
1780 }
1781 return "";
1782 }
1783
ComputeFingerprint() const1784 std::string LargeListType::ComputeFingerprint() const {
1785 const auto& child_fingerprint = children_[0]->fingerprint();
1786 if (!child_fingerprint.empty()) {
1787 return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}";
1788 }
1789 return "";
1790 }
1791
ComputeFingerprint() const1792 std::string MapType::ComputeFingerprint() const {
1793 const auto& child_fingerprint = children_[0]->fingerprint();
1794 if (!child_fingerprint.empty()) {
1795 if (keys_sorted_) {
1796 return TypeIdFingerprint(*this) + "s{" + child_fingerprint + "}";
1797 } else {
1798 return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}";
1799 }
1800 }
1801 return "";
1802 }
1803
ComputeFingerprint() const1804 std::string FixedSizeListType::ComputeFingerprint() const {
1805 const auto& child_fingerprint = children_[0]->fingerprint();
1806 if (!child_fingerprint.empty()) {
1807 std::stringstream ss;
1808 ss << TypeIdFingerprint(*this) << "[" << list_size_ << "]"
1809 << "{" << child_fingerprint << "}";
1810 return ss.str();
1811 }
1812 return "";
1813 }
1814
ComputeFingerprint() const1815 std::string FixedSizeBinaryType::ComputeFingerprint() const {
1816 std::stringstream ss;
1817 ss << TypeIdFingerprint(*this) << "[" << byte_width_ << "]";
1818 return ss.str();
1819 }
1820
ComputeFingerprint() const1821 std::string DecimalType::ComputeFingerprint() const {
1822 std::stringstream ss;
1823 ss << TypeIdFingerprint(*this) << "[" << byte_width_ << "," << precision_ << ","
1824 << scale_ << "]";
1825 return ss.str();
1826 }
1827
ComputeFingerprint() const1828 std::string StructType::ComputeFingerprint() const {
1829 std::stringstream ss;
1830 ss << TypeIdFingerprint(*this) << "{";
1831 for (const auto& child : children_) {
1832 const auto& child_fingerprint = child->fingerprint();
1833 if (child_fingerprint.empty()) {
1834 return "";
1835 }
1836 ss << child_fingerprint << ";";
1837 }
1838 ss << "}";
1839 return ss.str();
1840 }
1841
ComputeFingerprint() const1842 std::string UnionType::ComputeFingerprint() const {
1843 std::stringstream ss;
1844 ss << TypeIdFingerprint(*this);
1845 switch (mode_) {
1846 case UnionMode::SPARSE:
1847 ss << "[s";
1848 break;
1849 case UnionMode::DENSE:
1850 ss << "[d";
1851 break;
1852 default:
1853 DCHECK(false) << "Unexpected UnionMode";
1854 }
1855 for (const auto code : type_codes_) {
1856 // Represent code as integer, not raw character
1857 ss << ':' << static_cast<int32_t>(code);
1858 }
1859 ss << "]{";
1860 for (const auto& child : children_) {
1861 const auto& child_fingerprint = child->fingerprint();
1862 if (child_fingerprint.empty()) {
1863 return "";
1864 }
1865 ss << child_fingerprint << ";";
1866 }
1867 ss << "}";
1868 return ss.str();
1869 }
1870
ComputeFingerprint() const1871 std::string TimeType::ComputeFingerprint() const {
1872 std::stringstream ss;
1873 ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_);
1874 return ss.str();
1875 }
1876
ComputeFingerprint() const1877 std::string TimestampType::ComputeFingerprint() const {
1878 std::stringstream ss;
1879 ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_) << timezone_.length()
1880 << ':' << timezone_;
1881 return ss.str();
1882 }
1883
ComputeFingerprint() const1884 std::string IntervalType::ComputeFingerprint() const {
1885 std::stringstream ss;
1886 ss << TypeIdFingerprint(*this) << IntervalTypeFingerprint(interval_type());
1887 return ss.str();
1888 }
1889
ComputeFingerprint() const1890 std::string DurationType::ComputeFingerprint() const {
1891 std::stringstream ss;
1892 ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_);
1893 return ss.str();
1894 }
1895
1896 // ----------------------------------------------------------------------
1897 // Visitors and factory functions
1898
Accept(TypeVisitor * visitor) const1899 Status DataType::Accept(TypeVisitor* visitor) const {
1900 return VisitTypeInline(*this, visitor);
1901 }
1902
1903 #define TYPE_FACTORY(NAME, KLASS) \
1904 std::shared_ptr<DataType> NAME() { \
1905 static std::shared_ptr<DataType> result = std::make_shared<KLASS>(); \
1906 return result; \
1907 }
1908
TYPE_FACTORY(null,NullType)1909 TYPE_FACTORY(null, NullType)
1910 TYPE_FACTORY(boolean, BooleanType)
1911 TYPE_FACTORY(int8, Int8Type)
1912 TYPE_FACTORY(uint8, UInt8Type)
1913 TYPE_FACTORY(int16, Int16Type)
1914 TYPE_FACTORY(uint16, UInt16Type)
1915 TYPE_FACTORY(int32, Int32Type)
1916 TYPE_FACTORY(uint32, UInt32Type)
1917 TYPE_FACTORY(int64, Int64Type)
1918 TYPE_FACTORY(uint64, UInt64Type)
1919 TYPE_FACTORY(float16, HalfFloatType)
1920 TYPE_FACTORY(float32, FloatType)
1921 TYPE_FACTORY(float64, DoubleType)
1922 TYPE_FACTORY(utf8, StringType)
1923 TYPE_FACTORY(large_utf8, LargeStringType)
1924 TYPE_FACTORY(binary, BinaryType)
1925 TYPE_FACTORY(large_binary, LargeBinaryType)
1926 TYPE_FACTORY(date64, Date64Type)
1927 TYPE_FACTORY(date32, Date32Type)
1928
1929 std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width) {
1930 return std::make_shared<FixedSizeBinaryType>(byte_width);
1931 }
1932
duration(TimeUnit::type unit)1933 std::shared_ptr<DataType> duration(TimeUnit::type unit) {
1934 return std::make_shared<DurationType>(unit);
1935 }
1936
day_time_interval()1937 std::shared_ptr<DataType> day_time_interval() {
1938 return std::make_shared<DayTimeIntervalType>();
1939 }
1940
month_interval()1941 std::shared_ptr<DataType> month_interval() {
1942 return std::make_shared<MonthIntervalType>();
1943 }
1944
timestamp(TimeUnit::type unit)1945 std::shared_ptr<DataType> timestamp(TimeUnit::type unit) {
1946 return std::make_shared<TimestampType>(unit);
1947 }
1948
timestamp(TimeUnit::type unit,const std::string & timezone)1949 std::shared_ptr<DataType> timestamp(TimeUnit::type unit, const std::string& timezone) {
1950 return std::make_shared<TimestampType>(unit, timezone);
1951 }
1952
time32(TimeUnit::type unit)1953 std::shared_ptr<DataType> time32(TimeUnit::type unit) {
1954 return std::make_shared<Time32Type>(unit);
1955 }
1956
time64(TimeUnit::type unit)1957 std::shared_ptr<DataType> time64(TimeUnit::type unit) {
1958 return std::make_shared<Time64Type>(unit);
1959 }
1960
list(const std::shared_ptr<DataType> & value_type)1961 std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
1962 return std::make_shared<ListType>(value_type);
1963 }
1964
list(const std::shared_ptr<Field> & value_field)1965 std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_field) {
1966 return std::make_shared<ListType>(value_field);
1967 }
1968
large_list(const std::shared_ptr<DataType> & value_type)1969 std::shared_ptr<DataType> large_list(const std::shared_ptr<DataType>& value_type) {
1970 return std::make_shared<LargeListType>(value_type);
1971 }
1972
large_list(const std::shared_ptr<Field> & value_field)1973 std::shared_ptr<DataType> large_list(const std::shared_ptr<Field>& value_field) {
1974 return std::make_shared<LargeListType>(value_field);
1975 }
1976
map(const std::shared_ptr<DataType> & key_type,const std::shared_ptr<DataType> & item_type,bool keys_sorted)1977 std::shared_ptr<DataType> map(const std::shared_ptr<DataType>& key_type,
1978 const std::shared_ptr<DataType>& item_type,
1979 bool keys_sorted) {
1980 return std::make_shared<MapType>(key_type, item_type, keys_sorted);
1981 }
1982
map(const std::shared_ptr<DataType> & key_type,const std::shared_ptr<Field> & item_field,bool keys_sorted)1983 std::shared_ptr<DataType> map(const std::shared_ptr<DataType>& key_type,
1984 const std::shared_ptr<Field>& item_field,
1985 bool keys_sorted) {
1986 return std::make_shared<MapType>(key_type, item_field, keys_sorted);
1987 }
1988
fixed_size_list(const std::shared_ptr<DataType> & value_type,int32_t list_size)1989 std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<DataType>& value_type,
1990 int32_t list_size) {
1991 return std::make_shared<FixedSizeListType>(value_type, list_size);
1992 }
1993
fixed_size_list(const std::shared_ptr<Field> & value_field,int32_t list_size)1994 std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<Field>& value_field,
1995 int32_t list_size) {
1996 return std::make_shared<FixedSizeListType>(value_field, list_size);
1997 }
1998
struct_(const std::vector<std::shared_ptr<Field>> & fields)1999 std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fields) {
2000 return std::make_shared<StructType>(fields);
2001 }
2002
union_(const std::vector<std::shared_ptr<Field>> & child_fields,const std::vector<int8_t> & type_codes,UnionMode::type mode)2003 std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
2004 const std::vector<int8_t>& type_codes,
2005 UnionMode::type mode) {
2006 return std::make_shared<UnionType>(child_fields, type_codes, mode);
2007 }
2008
union_(const std::vector<std::shared_ptr<Field>> & child_fields,UnionMode::type mode)2009 std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
2010 UnionMode::type mode) {
2011 std::vector<int8_t> type_codes(child_fields.size());
2012 for (int i = 0; i < static_cast<int>(child_fields.size()); ++i) {
2013 type_codes[i] = static_cast<int8_t>(i);
2014 }
2015 return std::make_shared<UnionType>(child_fields, type_codes, mode);
2016 }
2017
union_(UnionMode::type mode)2018 std::shared_ptr<DataType> union_(UnionMode::type mode) {
2019 std::vector<std::shared_ptr<Field>> child_fields;
2020 return union_(child_fields, mode);
2021 }
2022
union_(const std::vector<std::shared_ptr<Array>> & children,const std::vector<std::string> & field_names,const std::vector<int8_t> & given_type_codes,UnionMode::type mode)2023 std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Array>>& children,
2024 const std::vector<std::string>& field_names,
2025 const std::vector<int8_t>& given_type_codes,
2026 UnionMode::type mode) {
2027 std::vector<std::shared_ptr<Field>> fields;
2028 std::vector<int8_t> type_codes(given_type_codes);
2029 int8_t counter = 0;
2030 for (const auto& child : children) {
2031 if (field_names.size() == 0) {
2032 fields.push_back(field(std::to_string(counter), child->type()));
2033 } else {
2034 fields.push_back(field(std::move(field_names[counter]), child->type()));
2035 }
2036 if (given_type_codes.size() == 0) {
2037 type_codes.push_back(counter);
2038 }
2039 counter++;
2040 }
2041 return union_(fields, std::move(type_codes), mode);
2042 }
2043
dictionary(const std::shared_ptr<DataType> & index_type,const std::shared_ptr<DataType> & dict_type,bool ordered)2044 std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
2045 const std::shared_ptr<DataType>& dict_type,
2046 bool ordered) {
2047 return std::make_shared<DictionaryType>(index_type, dict_type, ordered);
2048 }
2049
field(std::string name,std::shared_ptr<DataType> type,bool nullable,std::shared_ptr<const KeyValueMetadata> metadata)2050 std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
2051 bool nullable,
2052 std::shared_ptr<const KeyValueMetadata> metadata) {
2053 return std::make_shared<Field>(std::move(name), std::move(type), nullable,
2054 std::move(metadata));
2055 }
2056
decimal(int32_t precision,int32_t scale)2057 std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) {
2058 return std::make_shared<Decimal128Type>(precision, scale);
2059 }
2060
ToString() const2061 std::string Decimal128Type::ToString() const {
2062 std::stringstream s;
2063 s << "decimal(" << precision_ << ", " << scale_ << ")";
2064 return s.str();
2065 }
2066
2067 } // namespace arrow
2068