1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "arrow/type.h"
19 
20 #include <algorithm>
21 #include <climits>
22 #include <cstddef>
23 #include <ostream>
24 #include <sstream>  // IWYU pragma: keep
25 #include <string>
26 #include <unordered_map>
27 #include <unordered_set>
28 #include <utility>
29 #include <vector>
30 
31 #include "arrow/array.h"
32 #include "arrow/compare.h"
33 #include "arrow/record_batch.h"
34 #include "arrow/result.h"
35 #include "arrow/status.h"
36 #include "arrow/table.h"
37 #include "arrow/util/checked_cast.h"
38 #include "arrow/util/hash_util.h"
39 #include "arrow/util/hashing.h"
40 #include "arrow/util/key_value_metadata.h"
41 #include "arrow/util/logging.h"
42 #include "arrow/util/make_unique.h"
43 #include "arrow/util/vector.h"
44 #include "arrow/visitor_inline.h"
45 
46 namespace arrow {
47 
48 constexpr Type::type NullType::type_id;
49 constexpr Type::type ListType::type_id;
50 constexpr Type::type LargeListType::type_id;
51 
52 constexpr Type::type MapType::type_id;
53 
54 constexpr Type::type FixedSizeListType::type_id;
55 
56 constexpr Type::type BinaryType::type_id;
57 
58 constexpr Type::type LargeBinaryType::type_id;
59 
60 constexpr Type::type StringType::type_id;
61 
62 constexpr Type::type LargeStringType::type_id;
63 
64 constexpr Type::type FixedSizeBinaryType::type_id;
65 
66 constexpr Type::type StructType::type_id;
67 
68 constexpr Type::type Decimal128Type::type_id;
69 
70 constexpr Type::type UnionType::type_id;
71 
72 constexpr Type::type Date32Type::type_id;
73 
74 constexpr Type::type Date64Type::type_id;
75 
76 constexpr Type::type Time32Type::type_id;
77 
78 constexpr Type::type Time64Type::type_id;
79 
80 constexpr Type::type TimestampType::type_id;
81 
82 constexpr Type::type MonthIntervalType::type_id;
83 
84 constexpr Type::type DayTimeIntervalType::type_id;
85 
86 constexpr Type::type DurationType::type_id;
87 
88 constexpr Type::type DictionaryType::type_id;
89 
90 namespace internal {
91 
ToString(Type::type id)92 std::string ToString(Type::type id) {
93   switch (id) {
94     case Type::NA:
95       return "NA";
96     case Type::BOOL:
97       return "BOOL";
98     case Type::UINT8:
99       return "UINT8";
100     case Type::INT8:
101       return "INT8";
102     case Type::UINT16:
103       return "UINT16";
104     case Type::INT16:
105       return "INT16";
106     case Type::UINT32:
107       return "UINT32";
108     case Type::INT32:
109       return "INT32";
110     case Type::UINT64:
111       return "UINT64";
112     case Type::INT64:
113       return "INT64";
114     case Type::HALF_FLOAT:
115       return "HALF_FLOAT";
116     case Type::FLOAT:
117       return "FLOAT";
118     case Type::DOUBLE:
119       return "DOUBLE";
120     case Type::STRING:
121       return "UTF8";
122     case Type::BINARY:
123       return "BINARY";
124     case Type::FIXED_SIZE_BINARY:
125       return "FIXED_SIZE_BINARY";
126     case Type::DATE64:
127       return "DATE64";
128     case Type::TIMESTAMP:
129       return "TIMESTAMP";
130     case Type::TIME32:
131       return "TIME32";
132     case Type::TIME64:
133       return "TIME64";
134     case Type::INTERVAL_MONTHS:
135       return "INTERVAL_MONTHS";
136     case Type::INTERVAL_DAY_TIME:
137       return "INTERVAL_DAY_TIME";
138     case Type::DECIMAL:
139       return "DECIMAL";
140     case Type::LIST:
141       return "LIST";
142     case Type::STRUCT:
143       return "STRUCT";
144     case Type::UNION:
145       return "UNION";
146     case Type::DICTIONARY:
147       return "DICTIONARY";
148     case Type::MAP:
149       return "MAP";
150     case Type::EXTENSION:
151       return "EXTENSION";
152     case Type::FIXED_SIZE_LIST:
153       return "FIXED_SIZE_LIST";
154     case Type::DURATION:
155       return "DURATION";
156     case Type::LARGE_BINARY:
157       return "LARGE_BINARY";
158     case Type::LARGE_LIST:
159       return "LARGE_LIST";
160     default:
161       DCHECK(false) << "Should not be able to reach here";
162       return "unknown";
163   }
164 }
165 
ToString(TimeUnit::type unit)166 std::string ToString(TimeUnit::type unit) {
167   switch (unit) {
168     case TimeUnit::SECOND:
169       return "s";
170     case TimeUnit::MILLI:
171       return "ms";
172     case TimeUnit::MICRO:
173       return "us";
174     case TimeUnit::NANO:
175       return "ns";
176     default:
177       DCHECK(false);
178       return "";
179   }
180 }
181 
182 }  // namespace internal
183 
184 namespace {
185 
186 using internal::checked_cast;
187 
188 // Merges `existing` and `other` if one of them is of NullType, otherwise
189 // returns nullptr.
190 //   - if `other` if of NullType or is nullable, the unified field will be nullable.
191 //   - if `existing` is of NullType but other is not, the unified field will
192 //     have `other`'s type and will be nullable
MaybePromoteNullTypes(const Field & existing,const Field & other)193 std::shared_ptr<Field> MaybePromoteNullTypes(const Field& existing, const Field& other) {
194   if (existing.type()->id() != Type::NA && other.type()->id() != Type::NA) {
195     return nullptr;
196   }
197   if (existing.type()->id() == Type::NA) {
198     return other.WithNullable(true)->WithMetadata(existing.metadata());
199   }
200   // `other` must be null.
201   return existing.WithNullable(true);
202 }
203 }  // namespace
204 
~Field()205 Field::~Field() {}
206 
HasMetadata() const207 bool Field::HasMetadata() const {
208   return (metadata_ != nullptr) && (metadata_->size() > 0);
209 }
210 
WithMetadata(const std::shared_ptr<const KeyValueMetadata> & metadata) const211 std::shared_ptr<Field> Field::WithMetadata(
212     const std::shared_ptr<const KeyValueMetadata>& metadata) const {
213   return std::make_shared<Field>(name_, type_, nullable_, metadata);
214 }
215 
WithMergedMetadata(const std::shared_ptr<const KeyValueMetadata> & metadata) const216 std::shared_ptr<Field> Field::WithMergedMetadata(
217     const std::shared_ptr<const KeyValueMetadata>& metadata) const {
218   std::shared_ptr<const KeyValueMetadata> merged_metadata;
219   if (metadata_) {
220     merged_metadata = metadata_->Merge(*metadata);
221   } else {
222     merged_metadata = metadata;
223   }
224   return std::make_shared<Field>(name_, type_, nullable_, merged_metadata);
225 }
226 
RemoveMetadata() const227 std::shared_ptr<Field> Field::RemoveMetadata() const {
228   return std::make_shared<Field>(name_, type_, nullable_);
229 }
230 
WithType(const std::shared_ptr<DataType> & type) const231 std::shared_ptr<Field> Field::WithType(const std::shared_ptr<DataType>& type) const {
232   return std::make_shared<Field>(name_, type, nullable_, metadata_);
233 }
234 
WithName(const std::string & name) const235 std::shared_ptr<Field> Field::WithName(const std::string& name) const {
236   return std::make_shared<Field>(name, type_, nullable_, metadata_);
237 }
238 
WithNullable(const bool nullable) const239 std::shared_ptr<Field> Field::WithNullable(const bool nullable) const {
240   return std::make_shared<Field>(name_, type_, nullable, metadata_);
241 }
242 
MergeWith(const Field & other,MergeOptions options) const243 Result<std::shared_ptr<Field>> Field::MergeWith(const Field& other,
244                                                 MergeOptions options) const {
245   if (name() != other.name()) {
246     return Status::Invalid("Field ", name(), " doesn't have the same name as ",
247                            other.name());
248   }
249 
250   if (Equals(other, /*check_metadata=*/false)) {
251     return Copy();
252   }
253 
254   if (options.promote_nullability) {
255     if (type()->Equals(other.type())) {
256       return Copy()->WithNullable(nullable() || other.nullable());
257     }
258     std::shared_ptr<Field> promoted = MaybePromoteNullTypes(*this, other);
259     if (promoted) return promoted;
260   }
261 
262   return Status::Invalid("Unable to merge: Field ", name(),
263                          " has incompatible types: ", type()->ToString(), " vs ",
264                          other.type()->ToString());
265 }
266 
MergeWith(const std::shared_ptr<Field> & other,MergeOptions options) const267 Result<std::shared_ptr<Field>> Field::MergeWith(const std::shared_ptr<Field>& other,
268                                                 MergeOptions options) const {
269   DCHECK_NE(other, nullptr);
270   return MergeWith(*other, options);
271 }
272 
Flatten() const273 std::vector<std::shared_ptr<Field>> Field::Flatten() const {
274   std::vector<std::shared_ptr<Field>> flattened;
275   if (type_->id() == Type::STRUCT) {
276     for (const auto& child : type_->fields()) {
277       auto flattened_child = child->Copy();
278       flattened.push_back(flattened_child);
279       flattened_child->name_.insert(0, name() + ".");
280       flattened_child->nullable_ |= nullable_;
281     }
282   } else {
283     flattened.push_back(this->Copy());
284   }
285   return flattened;
286 }
287 
Copy() const288 std::shared_ptr<Field> Field::Copy() const {
289   return ::arrow::field(name_, type_, nullable_, metadata_);
290 }
291 
Equals(const Field & other,bool check_metadata) const292 bool Field::Equals(const Field& other, bool check_metadata) const {
293   if (this == &other) {
294     return true;
295   }
296   if (this->name_ == other.name_ && this->nullable_ == other.nullable_ &&
297       this->type_->Equals(*other.type_.get(), check_metadata)) {
298     if (!check_metadata) {
299       return true;
300     } else if (this->HasMetadata() && other.HasMetadata()) {
301       return metadata_->Equals(*other.metadata_);
302     } else if (!this->HasMetadata() && !other.HasMetadata()) {
303       return true;
304     } else {
305       return false;
306     }
307   }
308   return false;
309 }
310 
Equals(const std::shared_ptr<Field> & other,bool check_metadata) const311 bool Field::Equals(const std::shared_ptr<Field>& other, bool check_metadata) const {
312   return Equals(*other.get(), check_metadata);
313 }
314 
IsCompatibleWith(const Field & other) const315 bool Field::IsCompatibleWith(const Field& other) const { return MergeWith(other).ok(); }
316 
IsCompatibleWith(const std::shared_ptr<Field> & other) const317 bool Field::IsCompatibleWith(const std::shared_ptr<Field>& other) const {
318   DCHECK_NE(other, nullptr);
319   return IsCompatibleWith(*other);
320 }
321 
ToString(bool show_metadata) const322 std::string Field::ToString(bool show_metadata) const {
323   std::stringstream ss;
324   ss << name_ << ": " << type_->ToString();
325   if (!nullable_) {
326     ss << " not null";
327   }
328   if (show_metadata && metadata_) {
329     ss << metadata_->ToString();
330   }
331   return ss.str();
332 }
333 
~DataType()334 DataType::~DataType() {}
335 
Equals(const DataType & other,bool check_metadata) const336 bool DataType::Equals(const DataType& other, bool check_metadata) const {
337   return TypeEquals(*this, other, check_metadata);
338 }
339 
Equals(const std::shared_ptr<DataType> & other) const340 bool DataType::Equals(const std::shared_ptr<DataType>& other) const {
341   if (!other) {
342     return false;
343   }
344   return Equals(*other.get());
345 }
346 
Hash() const347 size_t DataType::Hash() const {
348   static constexpr size_t kHashSeed = 0;
349   size_t result = kHashSeed;
350   internal::hash_combine(result, this->ComputeFingerprint());
351   return result;
352 }
353 
operator <<(std::ostream & os,const DataType & type)354 std::ostream& operator<<(std::ostream& os, const DataType& type) {
355   os << type.ToString();
356   return os;
357 }
358 
precision() const359 FloatingPointType::Precision HalfFloatType::precision() const {
360   return FloatingPointType::HALF;
361 }
362 
precision() const363 FloatingPointType::Precision FloatType::precision() const {
364   return FloatingPointType::SINGLE;
365 }
366 
precision() const367 FloatingPointType::Precision DoubleType::precision() const {
368   return FloatingPointType::DOUBLE;
369 }
370 
ToString() const371 std::string ListType::ToString() const {
372   std::stringstream s;
373   s << "list<" << value_field()->ToString() << ">";
374   return s.str();
375 }
376 
ToString() const377 std::string LargeListType::ToString() const {
378   std::stringstream s;
379   s << "large_list<" << value_field()->ToString() << ">";
380   return s.str();
381 }
382 
MapType(const std::shared_ptr<DataType> & key_type,const std::shared_ptr<DataType> & item_type,bool keys_sorted)383 MapType::MapType(const std::shared_ptr<DataType>& key_type,
384                  const std::shared_ptr<DataType>& item_type, bool keys_sorted)
385     : MapType(key_type, ::arrow::field("value", item_type), keys_sorted) {}
386 
MapType(const std::shared_ptr<DataType> & key_type,const std::shared_ptr<Field> & item_field,bool keys_sorted)387 MapType::MapType(const std::shared_ptr<DataType>& key_type,
388                  const std::shared_ptr<Field>& item_field, bool keys_sorted)
389     : ListType(::arrow::field(
390           "entries",
391           struct_({std::make_shared<Field>("key", key_type, false), item_field}), false)),
392       keys_sorted_(keys_sorted) {
393   id_ = type_id;
394 }
395 
ToString() const396 std::string MapType::ToString() const {
397   std::stringstream s;
398   s << "map<" << key_type()->ToString() << ", " << item_type()->ToString();
399   if (keys_sorted_) {
400     s << ", keys_sorted";
401   }
402   s << ">";
403   return s.str();
404 }
405 
ToString() const406 std::string FixedSizeListType::ToString() const {
407   std::stringstream s;
408   s << "fixed_size_list<" << value_field()->ToString() << ">[" << list_size_ << "]";
409   return s.str();
410 }
411 
ToString() const412 std::string BinaryType::ToString() const { return "binary"; }
413 
ToString() const414 std::string LargeBinaryType::ToString() const { return "large_binary"; }
415 
ToString() const416 std::string StringType::ToString() const { return "string"; }
417 
ToString() const418 std::string LargeStringType::ToString() const { return "large_string"; }
419 
bit_width() const420 int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); }
421 
ToString() const422 std::string FixedSizeBinaryType::ToString() const {
423   std::stringstream ss;
424   ss << "fixed_size_binary[" << byte_width_ << "]";
425   return ss.str();
426 }
427 
428 // ----------------------------------------------------------------------
429 // Date types
430 
DateType(Type::type type_id)431 DateType::DateType(Type::type type_id) : TemporalType(type_id) {}
432 
Date32Type()433 Date32Type::Date32Type() : DateType(Type::DATE32) {}
434 
Date64Type()435 Date64Type::Date64Type() : DateType(Type::DATE64) {}
436 
ToString() const437 std::string Date64Type::ToString() const { return std::string("date64[ms]"); }
438 
ToString() const439 std::string Date32Type::ToString() const { return std::string("date32[day]"); }
440 
441 // ----------------------------------------------------------------------
442 // Time types
443 
TimeType(Type::type type_id,TimeUnit::type unit)444 TimeType::TimeType(Type::type type_id, TimeUnit::type unit)
445     : TemporalType(type_id), unit_(unit) {}
446 
Time32Type(TimeUnit::type unit)447 Time32Type::Time32Type(TimeUnit::type unit) : TimeType(Type::TIME32, unit) {
448   ARROW_CHECK(unit == TimeUnit::SECOND || unit == TimeUnit::MILLI)
449       << "Must be seconds or milliseconds";
450 }
451 
ToString() const452 std::string Time32Type::ToString() const {
453   std::stringstream ss;
454   ss << "time32[" << this->unit_ << "]";
455   return ss.str();
456 }
457 
Time64Type(TimeUnit::type unit)458 Time64Type::Time64Type(TimeUnit::type unit) : TimeType(Type::TIME64, unit) {
459   ARROW_CHECK(unit == TimeUnit::MICRO || unit == TimeUnit::NANO)
460       << "Must be microseconds or nanoseconds";
461 }
462 
ToString() const463 std::string Time64Type::ToString() const {
464   std::stringstream ss;
465   ss << "time64[" << this->unit_ << "]";
466   return ss.str();
467 }
468 
operator <<(std::ostream & os,TimeUnit::type unit)469 std::ostream& operator<<(std::ostream& os, TimeUnit::type unit) {
470   switch (unit) {
471     case TimeUnit::SECOND:
472       os << "s";
473       break;
474     case TimeUnit::MILLI:
475       os << "ms";
476       break;
477     case TimeUnit::MICRO:
478       os << "us";
479       break;
480     case TimeUnit::NANO:
481       os << "ns";
482       break;
483   }
484   return os;
485 }
486 
487 // ----------------------------------------------------------------------
488 // Timestamp types
489 
ToString() const490 std::string TimestampType::ToString() const {
491   std::stringstream ss;
492   ss << "timestamp[" << this->unit_;
493   if (this->timezone_.size() > 0) {
494     ss << ", tz=" << this->timezone_;
495   }
496   ss << "]";
497   return ss.str();
498 }
499 
500 // Duration types
ToString() const501 std::string DurationType::ToString() const {
502   std::stringstream ss;
503   ss << "duration[" << this->unit_ << "]";
504   return ss.str();
505 }
506 
507 // ----------------------------------------------------------------------
508 // Union type
509 
510 constexpr int8_t UnionType::kMaxTypeCode;
511 constexpr int UnionType::kInvalidChildId;
512 
UnionType(const std::vector<std::shared_ptr<Field>> & fields,const std::vector<int8_t> & type_codes,UnionMode::type mode)513 UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields,
514                      const std::vector<int8_t>& type_codes, UnionMode::type mode)
515     : NestedType(Type::UNION),
516       mode_(mode),
517       type_codes_(type_codes),
518       child_ids_(kMaxTypeCode + 1, kInvalidChildId) {
519   DCHECK_OK(ValidateParameters(fields, type_codes, mode));
520   children_ = fields;
521   for (int child_id = 0; child_id < static_cast<int>(type_codes_.size()); ++child_id) {
522     const auto type_code = type_codes_[child_id];
523     child_ids_[type_code] = child_id;
524   }
525 }
526 
Make(const std::vector<std::shared_ptr<Field>> & fields,const std::vector<int8_t> & type_codes,UnionMode::type mode)527 Result<std::shared_ptr<DataType>> UnionType::Make(
528     const std::vector<std::shared_ptr<Field>>& fields,
529     const std::vector<int8_t>& type_codes, UnionMode::type mode) {
530   RETURN_NOT_OK(ValidateParameters(fields, type_codes, mode));
531   return std::make_shared<UnionType>(fields, type_codes, mode);
532 }
533 
ValidateParameters(const std::vector<std::shared_ptr<Field>> & fields,const std::vector<int8_t> & type_codes,UnionMode::type mode)534 Status UnionType::ValidateParameters(const std::vector<std::shared_ptr<Field>>& fields,
535                                      const std::vector<int8_t>& type_codes,
536                                      UnionMode::type mode) {
537   if (fields.size() != type_codes.size()) {
538     return Status::Invalid("Union should get the same number of fields as type codes");
539   }
540   for (const auto type_code : type_codes) {
541     if (type_code < 0 || type_code > kMaxTypeCode) {
542       return Status::Invalid("Union type code out of bounds");
543     }
544   }
545   return Status::OK();
546 }
547 
layout() const548 DataTypeLayout UnionType::layout() const {
549   if (mode_ == UnionMode::SPARSE) {
550     return DataTypeLayout({DataTypeLayout::Bitmap(),
551                            DataTypeLayout::FixedWidth(sizeof(uint8_t)),
552                            DataTypeLayout::AlwaysNull()});
553   } else {
554     return DataTypeLayout({DataTypeLayout::Bitmap(),
555                            DataTypeLayout::FixedWidth(sizeof(uint8_t)),
556                            DataTypeLayout::FixedWidth(sizeof(int32_t))});
557   }
558 }
559 
max_type_code() const560 uint8_t UnionType::max_type_code() const {
561   return type_codes_.size() == 0
562              ? 0
563              : *std::max_element(type_codes_.begin(), type_codes_.end());
564 }
565 
ToString() const566 std::string UnionType::ToString() const {
567   std::stringstream s;
568 
569   if (mode_ == UnionMode::SPARSE) {
570     s << "union[sparse]<";
571   } else {
572     s << "union[dense]<";
573   }
574 
575   for (size_t i = 0; i < children_.size(); ++i) {
576     if (i) {
577       s << ", ";
578     }
579     s << children_[i]->ToString() << "=" << static_cast<int>(type_codes_[i]);
580   }
581   s << ">";
582   return s.str();
583 }
584 
585 // ----------------------------------------------------------------------
586 // Struct type
587 
588 namespace {
589 
CreateNameToIndexMap(const std::vector<std::shared_ptr<Field>> & fields)590 std::unordered_multimap<std::string, int> CreateNameToIndexMap(
591     const std::vector<std::shared_ptr<Field>>& fields) {
592   std::unordered_multimap<std::string, int> name_to_index;
593   for (size_t i = 0; i < fields.size(); ++i) {
594     name_to_index.emplace(fields[i]->name(), static_cast<int>(i));
595   }
596   return name_to_index;
597 }
598 
599 template <int NotFoundValue = -1, int DuplicateFoundValue = -1>
LookupNameIndex(const std::unordered_multimap<std::string,int> & name_to_index,const std::string & name)600 int LookupNameIndex(const std::unordered_multimap<std::string, int>& name_to_index,
601                     const std::string& name) {
602   auto p = name_to_index.equal_range(name);
603   auto it = p.first;
604   if (it == p.second) {
605     // Not found
606     return NotFoundValue;
607   }
608   auto index = it->second;
609   if (++it != p.second) {
610     // Duplicate field name
611     return DuplicateFoundValue;
612   }
613   return index;
614 }
615 
616 }  // namespace
617 
618 class StructType::Impl {
619  public:
Impl(const std::vector<std::shared_ptr<Field>> & fields)620   explicit Impl(const std::vector<std::shared_ptr<Field>>& fields)
621       : name_to_index_(CreateNameToIndexMap(fields)) {}
622 
623   const std::unordered_multimap<std::string, int> name_to_index_;
624 };
625 
StructType(const std::vector<std::shared_ptr<Field>> & fields)626 StructType::StructType(const std::vector<std::shared_ptr<Field>>& fields)
627     : NestedType(Type::STRUCT), impl_(new Impl(fields)) {
628   children_ = fields;
629 }
630 
~StructType()631 StructType::~StructType() {}
632 
ToString() const633 std::string StructType::ToString() const {
634   std::stringstream s;
635   s << "struct<";
636   for (int i = 0; i < this->num_fields(); ++i) {
637     if (i > 0) {
638       s << ", ";
639     }
640     std::shared_ptr<Field> field = this->field(i);
641     s << field->ToString();
642   }
643   s << ">";
644   return s.str();
645 }
646 
GetFieldByName(const std::string & name) const647 std::shared_ptr<Field> StructType::GetFieldByName(const std::string& name) const {
648   int i = GetFieldIndex(name);
649   return i == -1 ? nullptr : children_[i];
650 }
651 
GetFieldIndex(const std::string & name) const652 int StructType::GetFieldIndex(const std::string& name) const {
653   return LookupNameIndex(impl_->name_to_index_, name);
654 }
655 
GetAllFieldIndices(const std::string & name) const656 std::vector<int> StructType::GetAllFieldIndices(const std::string& name) const {
657   std::vector<int> result;
658   auto p = impl_->name_to_index_.equal_range(name);
659   for (auto it = p.first; it != p.second; ++it) {
660     result.push_back(it->second);
661   }
662   if (result.size() > 1) {
663     std::sort(result.begin(), result.end());
664   }
665   return result;
666 }
667 
GetAllFieldsByName(const std::string & name) const668 std::vector<std::shared_ptr<Field>> StructType::GetAllFieldsByName(
669     const std::string& name) const {
670   std::vector<std::shared_ptr<Field>> result;
671   auto p = impl_->name_to_index_.equal_range(name);
672   for (auto it = p.first; it != p.second; ++it) {
673     result.push_back(children_[it->second]);
674   }
675   return result;
676 }
677 
678 // ----------------------------------------------------------------------
679 // Decimal128 type
680 
Decimal128Type(int32_t precision,int32_t scale)681 Decimal128Type::Decimal128Type(int32_t precision, int32_t scale)
682     : DecimalType(16, precision, scale) {
683   ARROW_CHECK_GE(precision, kMinPrecision);
684   ARROW_CHECK_LE(precision, kMaxPrecision);
685 }
686 
Make(int32_t precision,int32_t scale)687 Result<std::shared_ptr<DataType>> Decimal128Type::Make(int32_t precision, int32_t scale) {
688   if (precision < kMinPrecision || precision > kMaxPrecision) {
689     return Status::Invalid("Decimal precision out of range: ", precision);
690   }
691   return std::make_shared<Decimal128Type>(precision, scale);
692 }
693 
Make(int32_t precision,int32_t scale,std::shared_ptr<DataType> * out)694 Status Decimal128Type::Make(int32_t precision, int32_t scale,
695                             std::shared_ptr<DataType>* out) {
696   return Make(precision, scale).Value(out);
697 }
698 
699 // ----------------------------------------------------------------------
700 // Dictionary-encoded type
701 
ValidateParameters(const DataType & index_type,const DataType & value_type)702 Status DictionaryType::ValidateParameters(const DataType& index_type,
703                                           const DataType& value_type) {
704   const bool index_type_ok = is_integer(index_type.id()) &&
705                              checked_cast<const IntegerType&>(index_type).is_signed();
706   if (!index_type_ok) {
707     return Status::TypeError("Dictionary index type should be signed integer, got ",
708                              index_type.ToString());
709   }
710   return Status::OK();
711 }
712 
bit_width() const713 int DictionaryType::bit_width() const {
714   return checked_cast<const FixedWidthType&>(*index_type_).bit_width();
715 }
716 
Make(const std::shared_ptr<DataType> & index_type,const std::shared_ptr<DataType> & value_type,bool ordered)717 Result<std::shared_ptr<DataType>> DictionaryType::Make(
718     const std::shared_ptr<DataType>& index_type,
719     const std::shared_ptr<DataType>& value_type, bool ordered) {
720   RETURN_NOT_OK(ValidateParameters(*index_type, *value_type));
721   return std::make_shared<DictionaryType>(index_type, value_type, ordered);
722 }
723 
DictionaryType(const std::shared_ptr<DataType> & index_type,const std::shared_ptr<DataType> & value_type,bool ordered)724 DictionaryType::DictionaryType(const std::shared_ptr<DataType>& index_type,
725                                const std::shared_ptr<DataType>& value_type, bool ordered)
726     : FixedWidthType(Type::DICTIONARY),
727       index_type_(index_type),
728       value_type_(value_type),
729       ordered_(ordered) {
730   ARROW_CHECK_OK(ValidateParameters(*index_type_, *value_type_));
731 }
732 
layout() const733 DataTypeLayout DictionaryType::layout() const {
734   auto layout = index_type_->layout();
735   layout.has_dictionary = true;
736   return layout;
737 }
738 
ToString() const739 std::string DictionaryType::ToString() const {
740   std::stringstream ss;
741   ss << this->name() << "<values=" << value_type_->ToString()
742      << ", indices=" << index_type_->ToString() << ", ordered=" << ordered_ << ">";
743   return ss.str();
744 }
745 
746 // ----------------------------------------------------------------------
747 // Null type
748 
ToString() const749 std::string NullType::ToString() const { return name(); }
750 
751 // ----------------------------------------------------------------------
752 // FieldRef
753 
hash() const754 size_t FieldPath::hash() const {
755   return internal::ComputeStringHash<0>(indices().data(), indices().size() * sizeof(int));
756 }
757 
ToString() const758 std::string FieldPath::ToString() const {
759   std::string repr = "FieldPath(";
760   for (auto index : this->indices()) {
761     repr += std::to_string(index) + " ";
762   }
763   repr.resize(repr.size() - 1);
764   repr += ")";
765   return repr;
766 }
767 
768 struct FieldPathGetImpl {
GetTypearrow::FieldPathGetImpl769   static const DataType& GetType(const ArrayData& data) { return *data.type; }
770 
GetTypearrow::FieldPathGetImpl771   static const DataType& GetType(const ChunkedArray& array) { return *array.type(); }
772 
Summarizearrow::FieldPathGetImpl773   static void Summarize(const FieldVector& fields, std::stringstream* ss) {
774     *ss << "{ ";
775     for (const auto& field : fields) {
776       *ss << field->ToString() << ", ";
777     }
778     *ss << "}";
779   }
780 
781   template <typename T>
Summarizearrow::FieldPathGetImpl782   static void Summarize(const std::vector<T>& columns, std::stringstream* ss) {
783     *ss << "{ ";
784     for (const auto& column : columns) {
785       *ss << GetType(*column) << ", ";
786     }
787     *ss << "}";
788   }
789 
790   template <typename T>
IndexErrorarrow::FieldPathGetImpl791   static Status IndexError(const FieldPath* path, int out_of_range_depth,
792                            const std::vector<T>& children) {
793     std::stringstream ss;
794     ss << "index out of range. ";
795 
796     ss << "indices=[ ";
797     int depth = 0;
798     for (int i : path->indices()) {
799       if (depth != out_of_range_depth) {
800         ss << i << " ";
801         continue;
802       }
803       ss << ">" << i << "< ";
804       ++depth;
805     }
806     ss << "] ";
807 
808     if (std::is_same<T, std::shared_ptr<Field>>::value) {
809       ss << "fields were: ";
810     } else {
811       ss << "columns had types: ";
812     }
813     Summarize(children, &ss);
814 
815     return Status::IndexError(ss.str());
816   }
817 
818   template <typename T, typename GetChildren>
Getarrow::FieldPathGetImpl819   static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
820                        GetChildren&& get_children, int* out_of_range_depth) {
821     if (path->indices().empty()) {
822       return Status::Invalid("empty indices cannot be traversed");
823     }
824 
825     int depth = 0;
826     const T* out;
827     for (int index : path->indices()) {
828       if (index < 0 || static_cast<size_t>(index) >= children->size()) {
829         *out_of_range_depth = depth;
830         return nullptr;
831       }
832 
833       out = &children->at(index);
834       children = get_children(*out);
835       ++depth;
836     }
837 
838     return *out;
839   }
840 
841   template <typename T, typename GetChildren>
Getarrow::FieldPathGetImpl842   static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
843                        GetChildren&& get_children) {
844     int out_of_range_depth = -1;
845     ARROW_ASSIGN_OR_RAISE(auto child,
846                           Get(path, children, std::forward<GetChildren>(get_children),
847                               &out_of_range_depth));
848     if (child != nullptr) {
849       return std::move(child);
850     }
851     return IndexError(path, out_of_range_depth, *children);
852   }
853 
Getarrow::FieldPathGetImpl854   static Result<std::shared_ptr<Field>> Get(const FieldPath* path,
855                                             const FieldVector& fields) {
856     return FieldPathGetImpl::Get(path, &fields, [](const std::shared_ptr<Field>& field) {
857       return &field->type()->fields();
858     });
859   }
860 
Getarrow::FieldPathGetImpl861   static Result<std::shared_ptr<ArrayData>> Get(const FieldPath* path,
862                                                 const ArrayDataVector& child_data) {
863     return FieldPathGetImpl::Get(
864         path, &child_data,
865         [](const std::shared_ptr<ArrayData>& data) { return &data->child_data; });
866   }
867 
Getarrow::FieldPathGetImpl868   static Result<std::shared_ptr<ChunkedArray>> Get(
869       const FieldPath* path, const ChunkedArrayVector& columns_arg) {
870     ChunkedArrayVector columns = columns_arg;
871 
872     return FieldPathGetImpl::Get(
873         path, &columns, [&](const std::shared_ptr<ChunkedArray>& a) {
874           columns.clear();
875 
876           for (int i = 0; i < a->type()->num_fields(); ++i) {
877             ArrayVector child_chunks;
878 
879             for (const auto& chunk : a->chunks()) {
880               auto child_chunk = MakeArray(chunk->data()->child_data[i]);
881               child_chunks.push_back(std::move(child_chunk));
882             }
883 
884             auto child_column = std::make_shared<ChunkedArray>(
885                 std::move(child_chunks), a->type()->field(i)->type());
886 
887             columns.emplace_back(std::move(child_column));
888           }
889 
890           return &columns;
891         });
892   }
893 };
894 
Get(const Schema & schema) const895 Result<std::shared_ptr<Field>> FieldPath::Get(const Schema& schema) const {
896   return FieldPathGetImpl::Get(this, schema.fields());
897 }
898 
Get(const Field & field) const899 Result<std::shared_ptr<Field>> FieldPath::Get(const Field& field) const {
900   return FieldPathGetImpl::Get(this, field.type()->fields());
901 }
902 
Get(const DataType & type) const903 Result<std::shared_ptr<Field>> FieldPath::Get(const DataType& type) const {
904   return FieldPathGetImpl::Get(this, type.fields());
905 }
906 
Get(const FieldVector & fields) const907 Result<std::shared_ptr<Field>> FieldPath::Get(const FieldVector& fields) const {
908   return FieldPathGetImpl::Get(this, fields);
909 }
910 
Get(const RecordBatch & batch) const911 Result<std::shared_ptr<Array>> FieldPath::Get(const RecordBatch& batch) const {
912   ARROW_ASSIGN_OR_RAISE(auto data, FieldPathGetImpl::Get(this, batch.column_data()));
913   return MakeArray(data);
914 }
915 
Get(const Table & table) const916 Result<std::shared_ptr<ChunkedArray>> FieldPath::Get(const Table& table) const {
917   return FieldPathGetImpl::Get(this, table.columns());
918 }
919 
FieldRef(FieldPath indices)920 FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {
921   DCHECK_GT(util::get<FieldPath>(impl_).indices().size(), 0);
922 }
923 
Flatten(std::vector<FieldRef> children)924 void FieldRef::Flatten(std::vector<FieldRef> children) {
925   // flatten children
926   struct Visitor {
927     void operator()(std::string&& name) { *out++ = FieldRef(std::move(name)); }
928 
929     void operator()(FieldPath&& indices) { *out++ = FieldRef(std::move(indices)); }
930 
931     void operator()(std::vector<FieldRef>&& children) {
932       for (auto& child : children) {
933         util::visit(*this, std::move(child.impl_));
934       }
935     }
936 
937     std::back_insert_iterator<std::vector<FieldRef>> out;
938   };
939 
940   std::vector<FieldRef> out;
941   Visitor visitor{std::back_inserter(out)};
942   visitor(std::move(children));
943 
944   DCHECK(!out.empty());
945   DCHECK(std::none_of(out.begin(), out.end(),
946                       [](const FieldRef& ref) { return ref.IsNested(); }));
947 
948   if (out.size() == 1) {
949     impl_ = std::move(out[0].impl_);
950   } else {
951     impl_ = std::move(out);
952   }
953 }
954 
FromDotPath(const std::string & dot_path_arg)955 Result<FieldRef> FieldRef::FromDotPath(const std::string& dot_path_arg) {
956   if (dot_path_arg.empty()) {
957     return Status::Invalid("Dot path was empty");
958   }
959 
960   std::vector<FieldRef> children;
961 
962   util::string_view dot_path = dot_path_arg;
963 
964   auto parse_name = [&] {
965     std::string name;
966     for (;;) {
967       auto segment_end = dot_path.find_first_of("\\[.");
968       if (segment_end == util::string_view::npos) {
969         // dot_path doesn't contain any other special characters; consume all
970         name.append(dot_path.begin(), dot_path.end());
971         dot_path = "";
972         break;
973       }
974 
975       if (dot_path[segment_end] != '\\') {
976         // segment_end points to a subscript for a new FieldRef
977         name.append(dot_path.begin(), segment_end);
978         dot_path = dot_path.substr(segment_end);
979         break;
980       }
981 
982       if (dot_path.size() == segment_end + 1) {
983         // dot_path ends with backslash; consume it all
984         name.append(dot_path.begin(), dot_path.end());
985         dot_path = "";
986         break;
987       }
988 
989       // append all characters before backslash, then the character which follows it
990       name.append(dot_path.begin(), segment_end);
991       name.push_back(dot_path[segment_end + 1]);
992       dot_path = dot_path.substr(segment_end + 2);
993     }
994     return name;
995   };
996 
997   while (!dot_path.empty()) {
998     auto subscript = dot_path[0];
999     dot_path = dot_path.substr(1);
1000     switch (subscript) {
1001       case '.': {
1002         // next element is a name
1003         children.emplace_back(parse_name());
1004         continue;
1005       }
1006       case '[': {
1007         auto subscript_end = dot_path.find_first_not_of("0123456789");
1008         if (subscript_end == util::string_view::npos || dot_path[subscript_end] != ']') {
1009           return Status::Invalid("Dot path '", dot_path_arg,
1010                                  "' contained an unterminated index");
1011         }
1012         children.emplace_back(std::atoi(dot_path.data()));
1013         dot_path = dot_path.substr(subscript_end + 1);
1014         continue;
1015       }
1016       default:
1017         return Status::Invalid("Dot path must begin with '[' or '.', got '", dot_path_arg,
1018                                "'");
1019     }
1020   }
1021 
1022   FieldRef out;
1023   out.Flatten(std::move(children));
1024   return out;
1025 }
1026 
hash() const1027 size_t FieldRef::hash() const {
1028   struct Visitor : std::hash<std::string> {
1029     using std::hash<std::string>::operator();
1030 
1031     size_t operator()(const FieldPath& path) { return path.hash(); }
1032 
1033     size_t operator()(const std::vector<FieldRef>& children) {
1034       size_t hash = 0;
1035 
1036       for (const FieldRef& child : children) {
1037         hash ^= child.hash();
1038       }
1039 
1040       return hash;
1041     }
1042   };
1043 
1044   return util::visit(Visitor{}, impl_);
1045 }
1046 
ToString() const1047 std::string FieldRef::ToString() const {
1048   struct Visitor {
1049     std::string operator()(const FieldPath& path) { return path.ToString(); }
1050 
1051     std::string operator()(const std::string& name) { return "Name(" + name + ")"; }
1052 
1053     std::string operator()(const std::vector<FieldRef>& children) {
1054       std::string repr = "Nested(";
1055       for (const auto& child : children) {
1056         repr += child.ToString() + " ";
1057       }
1058       repr.resize(repr.size() - 1);
1059       repr += ")";
1060       return repr;
1061     }
1062   };
1063 
1064   return "FieldRef." + util::visit(Visitor{}, impl_);
1065 }
1066 
FindAll(const Schema & schema) const1067 std::vector<FieldPath> FieldRef::FindAll(const Schema& schema) const {
1068   return FindAll(schema.fields());
1069 }
1070 
FindAll(const Field & field) const1071 std::vector<FieldPath> FieldRef::FindAll(const Field& field) const {
1072   return FindAll(field.type()->fields());
1073 }
1074 
FindAll(const DataType & type) const1075 std::vector<FieldPath> FieldRef::FindAll(const DataType& type) const {
1076   return FindAll(type.fields());
1077 }
1078 
FindAll(const FieldVector & fields) const1079 std::vector<FieldPath> FieldRef::FindAll(const FieldVector& fields) const {
1080   struct Visitor {
1081     std::vector<FieldPath> operator()(const FieldPath& path) {
1082       // skip long IndexError construction if path is out of range
1083       int out_of_range_depth;
1084       auto maybe_field = FieldPathGetImpl::Get(
1085           &path, &fields_,
1086           [](const std::shared_ptr<Field>& field) { return &field->type()->fields(); },
1087           &out_of_range_depth);
1088 
1089       DCHECK_OK(maybe_field.status());
1090 
1091       if (maybe_field.ValueOrDie() != nullptr) {
1092         return {path};
1093       }
1094       return {};
1095     }
1096 
1097     std::vector<FieldPath> operator()(const std::string& name) {
1098       std::vector<FieldPath> out;
1099 
1100       for (int i = 0; i < static_cast<int>(fields_.size()); ++i) {
1101         if (fields_[i]->name() == name) {
1102           out.push_back({i});
1103         }
1104       }
1105 
1106       return out;
1107     }
1108 
1109     struct Matches {
1110       // referents[i] is referenced by prefixes[i]
1111       std::vector<FieldPath> prefixes;
1112       FieldVector referents;
1113 
1114       Matches(std::vector<FieldPath> matches, const FieldVector& fields) {
1115         for (auto& match : matches) {
1116           Add({}, std::move(match), fields);
1117         }
1118       }
1119 
1120       Matches() = default;
1121 
1122       size_t size() const { return referents.size(); }
1123 
1124       void Add(FieldPath prefix, const FieldPath& match, const FieldVector& fields) {
1125         auto maybe_field = match.Get(fields);
1126         DCHECK_OK(maybe_field.status());
1127 
1128         prefix.indices().resize(prefix.indices().size() + match.indices().size());
1129         std::copy(match.indices().begin(), match.indices().end(),
1130                   prefix.indices().end() - match.indices().size());
1131         prefixes.push_back(std::move(prefix));
1132         referents.push_back(std::move(maybe_field).ValueOrDie());
1133       }
1134     };
1135 
1136     std::vector<FieldPath> operator()(const std::vector<FieldRef>& refs) {
1137       DCHECK_GE(refs.size(), 1);
1138       Matches matches(refs.front().FindAll(fields_), fields_);
1139 
1140       for (auto ref_it = refs.begin() + 1; ref_it != refs.end(); ++ref_it) {
1141         Matches next_matches;
1142         for (size_t i = 0; i < matches.size(); ++i) {
1143           const auto& referent = *matches.referents[i];
1144 
1145           for (const FieldPath& match : ref_it->FindAll(referent)) {
1146             next_matches.Add(matches.prefixes[i], match, referent.type()->fields());
1147           }
1148         }
1149         matches = std::move(next_matches);
1150       }
1151 
1152       return matches.prefixes;
1153     }
1154 
1155     const FieldVector& fields_;
1156   };
1157 
1158   return util::visit(Visitor{fields}, impl_);
1159 }
1160 
PrintTo(const FieldRef & ref,std::ostream * os)1161 void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); }
1162 
1163 // ----------------------------------------------------------------------
1164 // Schema implementation
1165 
1166 class Schema::Impl {
1167  public:
Impl(std::vector<std::shared_ptr<Field>> fields,std::shared_ptr<const KeyValueMetadata> metadata)1168   Impl(std::vector<std::shared_ptr<Field>> fields,
1169        std::shared_ptr<const KeyValueMetadata> metadata)
1170       : fields_(std::move(fields)),
1171         name_to_index_(CreateNameToIndexMap(fields_)),
1172         metadata_(std::move(metadata)) {}
1173 
1174   std::vector<std::shared_ptr<Field>> fields_;
1175   std::unordered_multimap<std::string, int> name_to_index_;
1176   std::shared_ptr<const KeyValueMetadata> metadata_;
1177 };
1178 
Schema(std::vector<std::shared_ptr<Field>> fields,std::shared_ptr<const KeyValueMetadata> metadata)1179 Schema::Schema(std::vector<std::shared_ptr<Field>> fields,
1180                std::shared_ptr<const KeyValueMetadata> metadata)
1181     : detail::Fingerprintable(),
1182       impl_(new Impl(std::move(fields), std::move(metadata))) {}
1183 
Schema(const Schema & schema)1184 Schema::Schema(const Schema& schema)
1185     : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {}
1186 
~Schema()1187 Schema::~Schema() {}
1188 
num_fields() const1189 int Schema::num_fields() const { return static_cast<int>(impl_->fields_.size()); }
1190 
field(int i) const1191 const std::shared_ptr<Field>& Schema::field(int i) const {
1192   DCHECK_GE(i, 0);
1193   DCHECK_LT(i, num_fields());
1194   return impl_->fields_[i];
1195 }
1196 
fields() const1197 const std::vector<std::shared_ptr<Field>>& Schema::fields() const {
1198   return impl_->fields_;
1199 }
1200 
Equals(const Schema & other,bool check_metadata) const1201 bool Schema::Equals(const Schema& other, bool check_metadata) const {
1202   if (this == &other) {
1203     return true;
1204   }
1205 
1206   // checks field equality
1207   if (num_fields() != other.num_fields()) {
1208     return false;
1209   }
1210 
1211   if (check_metadata) {
1212     const auto& metadata_fp = metadata_fingerprint();
1213     const auto& other_metadata_fp = other.metadata_fingerprint();
1214     if (metadata_fp != other_metadata_fp) {
1215       return false;
1216     }
1217   }
1218 
1219   // Fast path using fingerprints, if possible
1220   const auto& fp = fingerprint();
1221   const auto& other_fp = other.fingerprint();
1222   if (!fp.empty() && !other_fp.empty()) {
1223     return fp == other_fp;
1224   }
1225 
1226   // Fall back on field-by-field comparison
1227   for (int i = 0; i < num_fields(); ++i) {
1228     if (!field(i)->Equals(*other.field(i).get(), check_metadata)) {
1229       return false;
1230     }
1231   }
1232 
1233   return true;
1234 }
1235 
Equals(const std::shared_ptr<Schema> & other,bool check_metadata) const1236 bool Schema::Equals(const std::shared_ptr<Schema>& other, bool check_metadata) const {
1237   if (other == nullptr) {
1238     return false;
1239   }
1240 
1241   return Equals(*other, check_metadata);
1242 }
1243 
GetFieldByName(const std::string & name) const1244 std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) const {
1245   int i = GetFieldIndex(name);
1246   return i == -1 ? nullptr : impl_->fields_[i];
1247 }
1248 
GetFieldIndex(const std::string & name) const1249 int Schema::GetFieldIndex(const std::string& name) const {
1250   return LookupNameIndex(impl_->name_to_index_, name);
1251 }
1252 
GetAllFieldIndices(const std::string & name) const1253 std::vector<int> Schema::GetAllFieldIndices(const std::string& name) const {
1254   std::vector<int> result;
1255   auto p = impl_->name_to_index_.equal_range(name);
1256   for (auto it = p.first; it != p.second; ++it) {
1257     result.push_back(it->second);
1258   }
1259   if (result.size() > 1) {
1260     std::sort(result.begin(), result.end());
1261   }
1262   return result;
1263 }
1264 
CanReferenceFieldsByNames(const std::vector<std::string> & names) const1265 Status Schema::CanReferenceFieldsByNames(const std::vector<std::string>& names) const {
1266   for (const auto& name : names) {
1267     if (GetFieldByName(name) == nullptr) {
1268       return Status::Invalid("Field named '", name,
1269                              "' not found or not unique in the schema.");
1270     }
1271   }
1272 
1273   return Status::OK();
1274 }
1275 
GetAllFieldsByName(const std::string & name) const1276 std::vector<std::shared_ptr<Field>> Schema::GetAllFieldsByName(
1277     const std::string& name) const {
1278   std::vector<std::shared_ptr<Field>> result;
1279   auto p = impl_->name_to_index_.equal_range(name);
1280   for (auto it = p.first; it != p.second; ++it) {
1281     result.push_back(impl_->fields_[it->second]);
1282   }
1283   return result;
1284 }
1285 
AddField(int i,const std::shared_ptr<Field> & field) const1286 Result<std::shared_ptr<Schema>> Schema::AddField(
1287     int i, const std::shared_ptr<Field>& field) const {
1288   if (i < 0 || i > this->num_fields()) {
1289     return Status::Invalid("Invalid column index to add field.");
1290   }
1291 
1292   return std::make_shared<Schema>(internal::AddVectorElement(impl_->fields_, i, field),
1293                                   impl_->metadata_);
1294 }
1295 
SetField(int i,const std::shared_ptr<Field> & field) const1296 Result<std::shared_ptr<Schema>> Schema::SetField(
1297     int i, const std::shared_ptr<Field>& field) const {
1298   if (i < 0 || i > this->num_fields()) {
1299     return Status::Invalid("Invalid column index to add field.");
1300   }
1301 
1302   return std::make_shared<Schema>(
1303       internal::ReplaceVectorElement(impl_->fields_, i, field), impl_->metadata_);
1304 }
1305 
RemoveField(int i) const1306 Result<std::shared_ptr<Schema>> Schema::RemoveField(int i) const {
1307   if (i < 0 || i >= this->num_fields()) {
1308     return Status::Invalid("Invalid column index to remove field.");
1309   }
1310 
1311   return std::make_shared<Schema>(internal::DeleteVectorElement(impl_->fields_, i),
1312                                   impl_->metadata_);
1313 }
1314 
AddField(int i,const std::shared_ptr<Field> & field,std::shared_ptr<Schema> * out) const1315 Status Schema::AddField(int i, const std::shared_ptr<Field>& field,
1316                         std::shared_ptr<Schema>* out) const {
1317   return AddField(i, field).Value(out);
1318 }
1319 
SetField(int i,const std::shared_ptr<Field> & field,std::shared_ptr<Schema> * out) const1320 Status Schema::SetField(int i, const std::shared_ptr<Field>& field,
1321                         std::shared_ptr<Schema>* out) const {
1322   return SetField(i, field).Value(out);
1323 }
1324 
RemoveField(int i,std::shared_ptr<Schema> * out) const1325 Status Schema::RemoveField(int i, std::shared_ptr<Schema>* out) const {
1326   return RemoveField(i).Value(out);
1327 }
1328 
HasMetadata() const1329 bool Schema::HasMetadata() const {
1330   return (impl_->metadata_ != nullptr) && (impl_->metadata_->size() > 0);
1331 }
1332 
HasDistinctFieldNames() const1333 bool Schema::HasDistinctFieldNames() const {
1334   auto fields = field_names();
1335   std::unordered_set<std::string> names{fields.cbegin(), fields.cend()};
1336   return names.size() == fields.size();
1337 }
1338 
WithMetadata(const std::shared_ptr<const KeyValueMetadata> & metadata) const1339 std::shared_ptr<Schema> Schema::WithMetadata(
1340     const std::shared_ptr<const KeyValueMetadata>& metadata) const {
1341   return std::make_shared<Schema>(impl_->fields_, metadata);
1342 }
1343 
metadata() const1344 std::shared_ptr<const KeyValueMetadata> Schema::metadata() const {
1345   return impl_->metadata_;
1346 }
1347 
RemoveMetadata() const1348 std::shared_ptr<Schema> Schema::RemoveMetadata() const {
1349   return std::make_shared<Schema>(impl_->fields_);
1350 }
1351 
ToString(bool show_metadata) const1352 std::string Schema::ToString(bool show_metadata) const {
1353   std::stringstream buffer;
1354 
1355   int i = 0;
1356   for (const auto& field : impl_->fields_) {
1357     if (i > 0) {
1358       buffer << std::endl;
1359     }
1360     buffer << field->ToString(show_metadata);
1361     ++i;
1362   }
1363 
1364   if (show_metadata && HasMetadata()) {
1365     buffer << impl_->metadata_->ToString();
1366   }
1367 
1368   return buffer.str();
1369 }
1370 
field_names() const1371 std::vector<std::string> Schema::field_names() const {
1372   std::vector<std::string> names;
1373   for (const auto& field : impl_->fields_) {
1374     names.push_back(field->name());
1375   }
1376   return names;
1377 }
1378 
1379 class SchemaBuilder::Impl {
1380  public:
1381   friend class SchemaBuilder;
Impl(ConflictPolicy policy,Field::MergeOptions field_merge_options)1382   Impl(ConflictPolicy policy, Field::MergeOptions field_merge_options)
1383       : policy_(policy), field_merge_options_(field_merge_options) {}
1384 
Impl(std::vector<std::shared_ptr<Field>> fields,std::shared_ptr<const KeyValueMetadata> metadata,ConflictPolicy conflict_policy,Field::MergeOptions field_merge_options)1385   Impl(std::vector<std::shared_ptr<Field>> fields,
1386        std::shared_ptr<const KeyValueMetadata> metadata, ConflictPolicy conflict_policy,
1387        Field::MergeOptions field_merge_options)
1388       : fields_(std::move(fields)),
1389         name_to_index_(CreateNameToIndexMap(fields_)),
1390         metadata_(std::move(metadata)),
1391         policy_(conflict_policy),
1392         field_merge_options_(field_merge_options) {}
1393 
AddField(const std::shared_ptr<Field> & field)1394   Status AddField(const std::shared_ptr<Field>& field) {
1395     DCHECK_NE(field, nullptr);
1396 
1397     // Short-circuit, no lookup needed.
1398     if (policy_ == CONFLICT_APPEND) {
1399       return AppendField(field);
1400     }
1401 
1402     auto name = field->name();
1403     constexpr int kNotFound = -1;
1404     constexpr int kDuplicateFound = -2;
1405     auto i = LookupNameIndex<kNotFound, kDuplicateFound>(name_to_index_, name);
1406 
1407     if (i == kNotFound) {
1408       return AppendField(field);
1409     }
1410 
1411     // From this point, there's one or more field in the builder that exists with
1412     // the same name.
1413 
1414     if (policy_ == CONFLICT_IGNORE) {
1415       // The ignore policy is more generous when there's duplicate in the builder.
1416       return Status::OK();
1417     } else if (policy_ == CONFLICT_ERROR) {
1418       return Status::Invalid("Duplicate found, policy dictate to treat as an error");
1419     }
1420 
1421     if (i == kDuplicateFound) {
1422       // Cannot merge/replace when there's more than one field in the builder
1423       // because we can't decide which to merge/replace.
1424       return Status::Invalid("Cannot merge field ", name,
1425                              " more than one field with same name exists");
1426     }
1427 
1428     DCHECK_GE(i, 0);
1429 
1430     if (policy_ == CONFLICT_REPLACE) {
1431       fields_[i] = field;
1432     } else if (policy_ == CONFLICT_MERGE) {
1433       ARROW_ASSIGN_OR_RAISE(fields_[i], fields_[i]->MergeWith(field));
1434     }
1435 
1436     return Status::OK();
1437   }
1438 
AppendField(const std::shared_ptr<Field> & field)1439   Status AppendField(const std::shared_ptr<Field>& field) {
1440     name_to_index_.emplace(field->name(), static_cast<int>(fields_.size()));
1441     fields_.push_back(field);
1442     return Status::OK();
1443   }
1444 
Reset()1445   void Reset() {
1446     fields_.clear();
1447     name_to_index_.clear();
1448     metadata_.reset();
1449   }
1450 
1451  private:
1452   std::vector<std::shared_ptr<Field>> fields_;
1453   std::unordered_multimap<std::string, int> name_to_index_;
1454   std::shared_ptr<const KeyValueMetadata> metadata_;
1455   ConflictPolicy policy_;
1456   Field::MergeOptions field_merge_options_;
1457 };
1458 
SchemaBuilder(ConflictPolicy policy,Field::MergeOptions field_merge_options)1459 SchemaBuilder::SchemaBuilder(ConflictPolicy policy,
1460                              Field::MergeOptions field_merge_options) {
1461   impl_ = internal::make_unique<Impl>(policy, field_merge_options);
1462 }
1463 
SchemaBuilder(std::vector<std::shared_ptr<Field>> fields,ConflictPolicy policy,Field::MergeOptions field_merge_options)1464 SchemaBuilder::SchemaBuilder(std::vector<std::shared_ptr<Field>> fields,
1465                              ConflictPolicy policy,
1466                              Field::MergeOptions field_merge_options) {
1467   impl_ = internal::make_unique<Impl>(std::move(fields), nullptr, policy,
1468                                       field_merge_options);
1469 }
1470 
SchemaBuilder(const std::shared_ptr<Schema> & schema,ConflictPolicy policy,Field::MergeOptions field_merge_options)1471 SchemaBuilder::SchemaBuilder(const std::shared_ptr<Schema>& schema, ConflictPolicy policy,
1472                              Field::MergeOptions field_merge_options) {
1473   std::shared_ptr<const KeyValueMetadata> metadata;
1474   if (schema->HasMetadata()) {
1475     metadata = schema->metadata()->Copy();
1476   }
1477 
1478   impl_ = internal::make_unique<Impl>(schema->fields(), std::move(metadata), policy,
1479                                       field_merge_options);
1480 }
1481 
~SchemaBuilder()1482 SchemaBuilder::~SchemaBuilder() {}
1483 
policy() const1484 SchemaBuilder::ConflictPolicy SchemaBuilder::policy() const { return impl_->policy_; }
1485 
SetPolicy(SchemaBuilder::ConflictPolicy resolution)1486 void SchemaBuilder::SetPolicy(SchemaBuilder::ConflictPolicy resolution) {
1487   impl_->policy_ = resolution;
1488 }
1489 
AddField(const std::shared_ptr<Field> & field)1490 Status SchemaBuilder::AddField(const std::shared_ptr<Field>& field) {
1491   return impl_->AddField(field);
1492 }
1493 
AddFields(const std::vector<std::shared_ptr<Field>> & fields)1494 Status SchemaBuilder::AddFields(const std::vector<std::shared_ptr<Field>>& fields) {
1495   for (const auto& field : fields) {
1496     RETURN_NOT_OK(AddField(field));
1497   }
1498 
1499   return Status::OK();
1500 }
1501 
AddSchema(const std::shared_ptr<Schema> & schema)1502 Status SchemaBuilder::AddSchema(const std::shared_ptr<Schema>& schema) {
1503   DCHECK_NE(schema, nullptr);
1504   return AddFields(schema->fields());
1505 }
1506 
AddSchemas(const std::vector<std::shared_ptr<Schema>> & schemas)1507 Status SchemaBuilder::AddSchemas(const std::vector<std::shared_ptr<Schema>>& schemas) {
1508   for (const auto& schema : schemas) {
1509     RETURN_NOT_OK(AddSchema(schema));
1510   }
1511 
1512   return Status::OK();
1513 }
1514 
AddMetadata(const KeyValueMetadata & metadata)1515 Status SchemaBuilder::AddMetadata(const KeyValueMetadata& metadata) {
1516   impl_->metadata_ = metadata.Copy();
1517   return Status::OK();
1518 }
1519 
Finish() const1520 Result<std::shared_ptr<Schema>> SchemaBuilder::Finish() const {
1521   return schema(impl_->fields_, impl_->metadata_);
1522 }
1523 
Reset()1524 void SchemaBuilder::Reset() { impl_->Reset(); }
1525 
Merge(const std::vector<std::shared_ptr<Schema>> & schemas,ConflictPolicy policy)1526 Result<std::shared_ptr<Schema>> SchemaBuilder::Merge(
1527     const std::vector<std::shared_ptr<Schema>>& schemas, ConflictPolicy policy) {
1528   SchemaBuilder builder{policy};
1529   RETURN_NOT_OK(builder.AddSchemas(schemas));
1530   return builder.Finish();
1531 }
1532 
AreCompatible(const std::vector<std::shared_ptr<Schema>> & schemas,ConflictPolicy policy)1533 Status SchemaBuilder::AreCompatible(const std::vector<std::shared_ptr<Schema>>& schemas,
1534                                     ConflictPolicy policy) {
1535   return Merge(schemas, policy).status();
1536 }
1537 
schema(std::vector<std::shared_ptr<Field>> fields,std::shared_ptr<const KeyValueMetadata> metadata)1538 std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
1539                                std::shared_ptr<const KeyValueMetadata> metadata) {
1540   return std::make_shared<Schema>(std::move(fields), std::move(metadata));
1541 }
1542 
UnifySchemas(const std::vector<std::shared_ptr<Schema>> & schemas,const Field::MergeOptions field_merge_options)1543 Result<std::shared_ptr<Schema>> UnifySchemas(
1544     const std::vector<std::shared_ptr<Schema>>& schemas,
1545     const Field::MergeOptions field_merge_options) {
1546   if (schemas.empty()) {
1547     return Status::Invalid("Must provide at least one schema to unify.");
1548   }
1549 
1550   if (!schemas[0]->HasDistinctFieldNames()) {
1551     return Status::Invalid("Can't unify schema with duplicate field names.");
1552   }
1553 
1554   SchemaBuilder builder{schemas[0], SchemaBuilder::CONFLICT_MERGE, field_merge_options};
1555 
1556   for (size_t i = 1; i < schemas.size(); i++) {
1557     const auto& schema = schemas[i];
1558     if (!schema->HasDistinctFieldNames()) {
1559       return Status::Invalid("Can't unify schema with duplicate field names.");
1560     }
1561     RETURN_NOT_OK(builder.AddSchema(schema));
1562   }
1563 
1564   return builder.Finish();
1565 }
1566 
1567 // ----------------------------------------------------------------------
1568 // Fingerprint computations
1569 
1570 namespace detail {
1571 
~Fingerprintable()1572 Fingerprintable::~Fingerprintable() {
1573   delete fingerprint_.load();
1574   delete metadata_fingerprint_.load();
1575 }
1576 
1577 template <typename ComputeFingerprint>
LoadFingerprint(std::atomic<std::string * > * fingerprint,ComputeFingerprint && compute_fingerprint)1578 static const std::string& LoadFingerprint(std::atomic<std::string*>* fingerprint,
1579                                           ComputeFingerprint&& compute_fingerprint) {
1580   auto new_p = new std::string(std::forward<ComputeFingerprint>(compute_fingerprint)());
1581   // Since fingerprint() and metadata_fingerprint() return a *reference* to the
1582   // allocated string, the first allocation ever should never be replaced by another
1583   // one.  Hence the compare_exchange_strong() against nullptr.
1584   std::string* expected = nullptr;
1585   if (fingerprint->compare_exchange_strong(expected, new_p)) {
1586     return *new_p;
1587   } else {
1588     delete new_p;
1589     DCHECK_NE(expected, nullptr);
1590     return *expected;
1591   }
1592 }
1593 
LoadFingerprintSlow() const1594 const std::string& Fingerprintable::LoadFingerprintSlow() const {
1595   return LoadFingerprint(&fingerprint_, [this]() { return ComputeFingerprint(); });
1596 }
1597 
LoadMetadataFingerprintSlow() const1598 const std::string& Fingerprintable::LoadMetadataFingerprintSlow() const {
1599   return LoadFingerprint(&metadata_fingerprint_,
1600                          [this]() { return ComputeMetadataFingerprint(); });
1601 }
1602 
1603 }  // namespace detail
1604 
TypeIdFingerprint(const DataType & type)1605 static inline std::string TypeIdFingerprint(const DataType& type) {
1606   auto c = static_cast<int>(type.id()) + 'A';
1607   DCHECK_GE(c, 0);
1608   DCHECK_LT(c, 128);  // Unlikely to happen any soon
1609   // Prefix with an unusual character in order to disambiguate
1610   std::string s{'@', static_cast<char>(c)};
1611   return s;
1612 }
1613 
TimeUnitFingerprint(TimeUnit::type unit)1614 static char TimeUnitFingerprint(TimeUnit::type unit) {
1615   switch (unit) {
1616     case TimeUnit::SECOND:
1617       return 's';
1618     case TimeUnit::MILLI:
1619       return 'm';
1620     case TimeUnit::MICRO:
1621       return 'u';
1622     case TimeUnit::NANO:
1623       return 'n';
1624     default:
1625       DCHECK(false) << "Unexpected TimeUnit";
1626       return '\0';
1627   }
1628 }
1629 
IntervalTypeFingerprint(IntervalType::type unit)1630 static char IntervalTypeFingerprint(IntervalType::type unit) {
1631   switch (unit) {
1632     case IntervalType::DAY_TIME:
1633       return 'd';
1634     case IntervalType::MONTHS:
1635       return 'M';
1636     default:
1637       DCHECK(false) << "Unexpected IntervalType::type";
1638       return '\0';
1639   }
1640 }
1641 
AppendMetadataFingerprint(const KeyValueMetadata & metadata,std::stringstream * ss)1642 static void AppendMetadataFingerprint(const KeyValueMetadata& metadata,
1643                                       std::stringstream* ss) {
1644   // Compute metadata fingerprint.  KeyValueMetadata is not immutable,
1645   // so we don't cache the result on the metadata instance.
1646   const auto pairs = metadata.sorted_pairs();
1647   if (!pairs.empty()) {
1648     *ss << "!{";
1649     for (const auto& p : pairs) {
1650       const auto& k = p.first;
1651       const auto& v = p.second;
1652       // Since metadata strings can contain arbitrary characters, prefix with
1653       // string length to disambiguate.
1654       *ss << k.length() << ':' << k << ':';
1655       *ss << v.length() << ':' << v << ';';
1656     }
1657     *ss << '}';
1658   }
1659 }
1660 
ComputeFingerprint() const1661 std::string Field::ComputeFingerprint() const {
1662   const auto& type_fingerprint = type_->fingerprint();
1663   if (type_fingerprint.empty()) {
1664     // Underlying DataType doesn't support fingerprinting.
1665     return "";
1666   }
1667   std::stringstream ss;
1668   ss << 'F';
1669   if (nullable_) {
1670     ss << 'n';
1671   } else {
1672     ss << 'N';
1673   }
1674   ss << name_;
1675   ss << '{' << type_fingerprint << '}';
1676   return ss.str();
1677 }
1678 
ComputeMetadataFingerprint() const1679 std::string Field::ComputeMetadataFingerprint() const {
1680   std::stringstream ss;
1681   if (metadata_) {
1682     AppendMetadataFingerprint(*metadata_, &ss);
1683   }
1684   const auto& type_fingerprint = type_->metadata_fingerprint();
1685   if (!type_fingerprint.empty()) {
1686     ss << "+{" << type_->metadata_fingerprint() << "}";
1687   }
1688   return ss.str();
1689 }
1690 
ComputeFingerprint() const1691 std::string Schema::ComputeFingerprint() const {
1692   std::stringstream ss;
1693   ss << "S{";
1694   for (const auto& field : fields()) {
1695     const auto& field_fingerprint = field->fingerprint();
1696     if (field_fingerprint.empty()) {
1697       return "";
1698     }
1699     ss << field_fingerprint << ";";
1700   }
1701   ss << "}";
1702   return ss.str();
1703 }
1704 
ComputeMetadataFingerprint() const1705 std::string Schema::ComputeMetadataFingerprint() const {
1706   std::stringstream ss;
1707   if (HasMetadata()) {
1708     AppendMetadataFingerprint(*metadata(), &ss);
1709   }
1710   ss << "S{";
1711   for (const auto& field : fields()) {
1712     const auto& field_fingerprint = field->metadata_fingerprint();
1713     ss << field_fingerprint << ";";
1714   }
1715   ss << "}";
1716   return ss.str();
1717 }
1718 
PrintTo(const Schema & s,std::ostream * os)1719 void PrintTo(const Schema& s, std::ostream* os) { *os << s; }
1720 
ComputeFingerprint() const1721 std::string DataType::ComputeFingerprint() const {
1722   // Default implementation returns empty string, signalling non-implemented
1723   // functionality.
1724   return "";
1725 }
1726 
ComputeMetadataFingerprint() const1727 std::string DataType::ComputeMetadataFingerprint() const {
1728   // Whatever the data type, metadata can only be found on child fields
1729   std::string s;
1730   for (const auto& child : children_) {
1731     s += child->metadata_fingerprint() + ";";
1732   }
1733   return s;
1734 }
1735 
1736 #define PARAMETER_LESS_FINGERPRINT(TYPE_CLASS)               \
1737   std::string TYPE_CLASS##Type::ComputeFingerprint() const { \
1738     return TypeIdFingerprint(*this);                         \
1739   }
1740 
1741 PARAMETER_LESS_FINGERPRINT(Null)
PARAMETER_LESS_FINGERPRINT(Boolean)1742 PARAMETER_LESS_FINGERPRINT(Boolean)
1743 PARAMETER_LESS_FINGERPRINT(Int8)
1744 PARAMETER_LESS_FINGERPRINT(Int16)
1745 PARAMETER_LESS_FINGERPRINT(Int32)
1746 PARAMETER_LESS_FINGERPRINT(Int64)
1747 PARAMETER_LESS_FINGERPRINT(UInt8)
1748 PARAMETER_LESS_FINGERPRINT(UInt16)
1749 PARAMETER_LESS_FINGERPRINT(UInt32)
1750 PARAMETER_LESS_FINGERPRINT(UInt64)
1751 PARAMETER_LESS_FINGERPRINT(HalfFloat)
1752 PARAMETER_LESS_FINGERPRINT(Float)
1753 PARAMETER_LESS_FINGERPRINT(Double)
1754 PARAMETER_LESS_FINGERPRINT(Binary)
1755 PARAMETER_LESS_FINGERPRINT(LargeBinary)
1756 PARAMETER_LESS_FINGERPRINT(String)
1757 PARAMETER_LESS_FINGERPRINT(LargeString)
1758 PARAMETER_LESS_FINGERPRINT(Date32)
1759 PARAMETER_LESS_FINGERPRINT(Date64)
1760 
1761 #undef PARAMETER_LESS_FINGERPRINT
1762 
1763 std::string DictionaryType::ComputeFingerprint() const {
1764   const auto& index_fingerprint = index_type_->fingerprint();
1765   const auto& value_fingerprint = value_type_->fingerprint();
1766   std::string ordered_fingerprint = ordered_ ? "1" : "0";
1767 
1768   DCHECK(!index_fingerprint.empty());  // it's an integer type
1769   if (!value_fingerprint.empty()) {
1770     return TypeIdFingerprint(*this) + index_fingerprint + value_fingerprint +
1771            ordered_fingerprint;
1772   }
1773   return ordered_fingerprint;
1774 }
1775 
ComputeFingerprint() const1776 std::string ListType::ComputeFingerprint() const {
1777   const auto& child_fingerprint = children_[0]->fingerprint();
1778   if (!child_fingerprint.empty()) {
1779     return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}";
1780   }
1781   return "";
1782 }
1783 
ComputeFingerprint() const1784 std::string LargeListType::ComputeFingerprint() const {
1785   const auto& child_fingerprint = children_[0]->fingerprint();
1786   if (!child_fingerprint.empty()) {
1787     return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}";
1788   }
1789   return "";
1790 }
1791 
ComputeFingerprint() const1792 std::string MapType::ComputeFingerprint() const {
1793   const auto& child_fingerprint = children_[0]->fingerprint();
1794   if (!child_fingerprint.empty()) {
1795     if (keys_sorted_) {
1796       return TypeIdFingerprint(*this) + "s{" + child_fingerprint + "}";
1797     } else {
1798       return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}";
1799     }
1800   }
1801   return "";
1802 }
1803 
ComputeFingerprint() const1804 std::string FixedSizeListType::ComputeFingerprint() const {
1805   const auto& child_fingerprint = children_[0]->fingerprint();
1806   if (!child_fingerprint.empty()) {
1807     std::stringstream ss;
1808     ss << TypeIdFingerprint(*this) << "[" << list_size_ << "]"
1809        << "{" << child_fingerprint << "}";
1810     return ss.str();
1811   }
1812   return "";
1813 }
1814 
ComputeFingerprint() const1815 std::string FixedSizeBinaryType::ComputeFingerprint() const {
1816   std::stringstream ss;
1817   ss << TypeIdFingerprint(*this) << "[" << byte_width_ << "]";
1818   return ss.str();
1819 }
1820 
ComputeFingerprint() const1821 std::string DecimalType::ComputeFingerprint() const {
1822   std::stringstream ss;
1823   ss << TypeIdFingerprint(*this) << "[" << byte_width_ << "," << precision_ << ","
1824      << scale_ << "]";
1825   return ss.str();
1826 }
1827 
ComputeFingerprint() const1828 std::string StructType::ComputeFingerprint() const {
1829   std::stringstream ss;
1830   ss << TypeIdFingerprint(*this) << "{";
1831   for (const auto& child : children_) {
1832     const auto& child_fingerprint = child->fingerprint();
1833     if (child_fingerprint.empty()) {
1834       return "";
1835     }
1836     ss << child_fingerprint << ";";
1837   }
1838   ss << "}";
1839   return ss.str();
1840 }
1841 
ComputeFingerprint() const1842 std::string UnionType::ComputeFingerprint() const {
1843   std::stringstream ss;
1844   ss << TypeIdFingerprint(*this);
1845   switch (mode_) {
1846     case UnionMode::SPARSE:
1847       ss << "[s";
1848       break;
1849     case UnionMode::DENSE:
1850       ss << "[d";
1851       break;
1852     default:
1853       DCHECK(false) << "Unexpected UnionMode";
1854   }
1855   for (const auto code : type_codes_) {
1856     // Represent code as integer, not raw character
1857     ss << ':' << static_cast<int32_t>(code);
1858   }
1859   ss << "]{";
1860   for (const auto& child : children_) {
1861     const auto& child_fingerprint = child->fingerprint();
1862     if (child_fingerprint.empty()) {
1863       return "";
1864     }
1865     ss << child_fingerprint << ";";
1866   }
1867   ss << "}";
1868   return ss.str();
1869 }
1870 
ComputeFingerprint() const1871 std::string TimeType::ComputeFingerprint() const {
1872   std::stringstream ss;
1873   ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_);
1874   return ss.str();
1875 }
1876 
ComputeFingerprint() const1877 std::string TimestampType::ComputeFingerprint() const {
1878   std::stringstream ss;
1879   ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_) << timezone_.length()
1880      << ':' << timezone_;
1881   return ss.str();
1882 }
1883 
ComputeFingerprint() const1884 std::string IntervalType::ComputeFingerprint() const {
1885   std::stringstream ss;
1886   ss << TypeIdFingerprint(*this) << IntervalTypeFingerprint(interval_type());
1887   return ss.str();
1888 }
1889 
ComputeFingerprint() const1890 std::string DurationType::ComputeFingerprint() const {
1891   std::stringstream ss;
1892   ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_);
1893   return ss.str();
1894 }
1895 
1896 // ----------------------------------------------------------------------
1897 // Visitors and factory functions
1898 
Accept(TypeVisitor * visitor) const1899 Status DataType::Accept(TypeVisitor* visitor) const {
1900   return VisitTypeInline(*this, visitor);
1901 }
1902 
1903 #define TYPE_FACTORY(NAME, KLASS)                                        \
1904   std::shared_ptr<DataType> NAME() {                                     \
1905     static std::shared_ptr<DataType> result = std::make_shared<KLASS>(); \
1906     return result;                                                       \
1907   }
1908 
TYPE_FACTORY(null,NullType)1909 TYPE_FACTORY(null, NullType)
1910 TYPE_FACTORY(boolean, BooleanType)
1911 TYPE_FACTORY(int8, Int8Type)
1912 TYPE_FACTORY(uint8, UInt8Type)
1913 TYPE_FACTORY(int16, Int16Type)
1914 TYPE_FACTORY(uint16, UInt16Type)
1915 TYPE_FACTORY(int32, Int32Type)
1916 TYPE_FACTORY(uint32, UInt32Type)
1917 TYPE_FACTORY(int64, Int64Type)
1918 TYPE_FACTORY(uint64, UInt64Type)
1919 TYPE_FACTORY(float16, HalfFloatType)
1920 TYPE_FACTORY(float32, FloatType)
1921 TYPE_FACTORY(float64, DoubleType)
1922 TYPE_FACTORY(utf8, StringType)
1923 TYPE_FACTORY(large_utf8, LargeStringType)
1924 TYPE_FACTORY(binary, BinaryType)
1925 TYPE_FACTORY(large_binary, LargeBinaryType)
1926 TYPE_FACTORY(date64, Date64Type)
1927 TYPE_FACTORY(date32, Date32Type)
1928 
1929 std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width) {
1930   return std::make_shared<FixedSizeBinaryType>(byte_width);
1931 }
1932 
duration(TimeUnit::type unit)1933 std::shared_ptr<DataType> duration(TimeUnit::type unit) {
1934   return std::make_shared<DurationType>(unit);
1935 }
1936 
day_time_interval()1937 std::shared_ptr<DataType> day_time_interval() {
1938   return std::make_shared<DayTimeIntervalType>();
1939 }
1940 
month_interval()1941 std::shared_ptr<DataType> month_interval() {
1942   return std::make_shared<MonthIntervalType>();
1943 }
1944 
timestamp(TimeUnit::type unit)1945 std::shared_ptr<DataType> timestamp(TimeUnit::type unit) {
1946   return std::make_shared<TimestampType>(unit);
1947 }
1948 
timestamp(TimeUnit::type unit,const std::string & timezone)1949 std::shared_ptr<DataType> timestamp(TimeUnit::type unit, const std::string& timezone) {
1950   return std::make_shared<TimestampType>(unit, timezone);
1951 }
1952 
time32(TimeUnit::type unit)1953 std::shared_ptr<DataType> time32(TimeUnit::type unit) {
1954   return std::make_shared<Time32Type>(unit);
1955 }
1956 
time64(TimeUnit::type unit)1957 std::shared_ptr<DataType> time64(TimeUnit::type unit) {
1958   return std::make_shared<Time64Type>(unit);
1959 }
1960 
list(const std::shared_ptr<DataType> & value_type)1961 std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
1962   return std::make_shared<ListType>(value_type);
1963 }
1964 
list(const std::shared_ptr<Field> & value_field)1965 std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_field) {
1966   return std::make_shared<ListType>(value_field);
1967 }
1968 
large_list(const std::shared_ptr<DataType> & value_type)1969 std::shared_ptr<DataType> large_list(const std::shared_ptr<DataType>& value_type) {
1970   return std::make_shared<LargeListType>(value_type);
1971 }
1972 
large_list(const std::shared_ptr<Field> & value_field)1973 std::shared_ptr<DataType> large_list(const std::shared_ptr<Field>& value_field) {
1974   return std::make_shared<LargeListType>(value_field);
1975 }
1976 
map(const std::shared_ptr<DataType> & key_type,const std::shared_ptr<DataType> & item_type,bool keys_sorted)1977 std::shared_ptr<DataType> map(const std::shared_ptr<DataType>& key_type,
1978                               const std::shared_ptr<DataType>& item_type,
1979                               bool keys_sorted) {
1980   return std::make_shared<MapType>(key_type, item_type, keys_sorted);
1981 }
1982 
map(const std::shared_ptr<DataType> & key_type,const std::shared_ptr<Field> & item_field,bool keys_sorted)1983 std::shared_ptr<DataType> map(const std::shared_ptr<DataType>& key_type,
1984                               const std::shared_ptr<Field>& item_field,
1985                               bool keys_sorted) {
1986   return std::make_shared<MapType>(key_type, item_field, keys_sorted);
1987 }
1988 
fixed_size_list(const std::shared_ptr<DataType> & value_type,int32_t list_size)1989 std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<DataType>& value_type,
1990                                           int32_t list_size) {
1991   return std::make_shared<FixedSizeListType>(value_type, list_size);
1992 }
1993 
fixed_size_list(const std::shared_ptr<Field> & value_field,int32_t list_size)1994 std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<Field>& value_field,
1995                                           int32_t list_size) {
1996   return std::make_shared<FixedSizeListType>(value_field, list_size);
1997 }
1998 
struct_(const std::vector<std::shared_ptr<Field>> & fields)1999 std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fields) {
2000   return std::make_shared<StructType>(fields);
2001 }
2002 
union_(const std::vector<std::shared_ptr<Field>> & child_fields,const std::vector<int8_t> & type_codes,UnionMode::type mode)2003 std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
2004                                  const std::vector<int8_t>& type_codes,
2005                                  UnionMode::type mode) {
2006   return std::make_shared<UnionType>(child_fields, type_codes, mode);
2007 }
2008 
union_(const std::vector<std::shared_ptr<Field>> & child_fields,UnionMode::type mode)2009 std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
2010                                  UnionMode::type mode) {
2011   std::vector<int8_t> type_codes(child_fields.size());
2012   for (int i = 0; i < static_cast<int>(child_fields.size()); ++i) {
2013     type_codes[i] = static_cast<int8_t>(i);
2014   }
2015   return std::make_shared<UnionType>(child_fields, type_codes, mode);
2016 }
2017 
union_(UnionMode::type mode)2018 std::shared_ptr<DataType> union_(UnionMode::type mode) {
2019   std::vector<std::shared_ptr<Field>> child_fields;
2020   return union_(child_fields, mode);
2021 }
2022 
union_(const std::vector<std::shared_ptr<Array>> & children,const std::vector<std::string> & field_names,const std::vector<int8_t> & given_type_codes,UnionMode::type mode)2023 std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Array>>& children,
2024                                  const std::vector<std::string>& field_names,
2025                                  const std::vector<int8_t>& given_type_codes,
2026                                  UnionMode::type mode) {
2027   std::vector<std::shared_ptr<Field>> fields;
2028   std::vector<int8_t> type_codes(given_type_codes);
2029   int8_t counter = 0;
2030   for (const auto& child : children) {
2031     if (field_names.size() == 0) {
2032       fields.push_back(field(std::to_string(counter), child->type()));
2033     } else {
2034       fields.push_back(field(std::move(field_names[counter]), child->type()));
2035     }
2036     if (given_type_codes.size() == 0) {
2037       type_codes.push_back(counter);
2038     }
2039     counter++;
2040   }
2041   return union_(fields, std::move(type_codes), mode);
2042 }
2043 
dictionary(const std::shared_ptr<DataType> & index_type,const std::shared_ptr<DataType> & dict_type,bool ordered)2044 std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
2045                                      const std::shared_ptr<DataType>& dict_type,
2046                                      bool ordered) {
2047   return std::make_shared<DictionaryType>(index_type, dict_type, ordered);
2048 }
2049 
field(std::string name,std::shared_ptr<DataType> type,bool nullable,std::shared_ptr<const KeyValueMetadata> metadata)2050 std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
2051                              bool nullable,
2052                              std::shared_ptr<const KeyValueMetadata> metadata) {
2053   return std::make_shared<Field>(std::move(name), std::move(type), nullable,
2054                                  std::move(metadata));
2055 }
2056 
decimal(int32_t precision,int32_t scale)2057 std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) {
2058   return std::make_shared<Decimal128Type>(precision, scale);
2059 }
2060 
ToString() const2061 std::string Decimal128Type::ToString() const {
2062   std::stringstream s;
2063   s << "decimal(" << precision_ << ", " << scale_ << ")";
2064   return s.str();
2065 }
2066 
2067 }  // namespace arrow
2068