1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "parquet/arrow/schema_internal.h"
19
20 #include "arrow/type.h"
21
22 using ArrowType = ::arrow::DataType;
23 using ArrowTypeId = ::arrow::Type;
24 using ParquetType = parquet::Type;
25
26 namespace parquet {
27
28 namespace arrow {
29
30 using ::arrow::Result;
31 using ::arrow::Status;
32 using ::arrow::internal::checked_cast;
33
MakeArrowDecimal(const LogicalType & logical_type)34 Result<std::shared_ptr<ArrowType>> MakeArrowDecimal(const LogicalType& logical_type) {
35 const auto& decimal = checked_cast<const DecimalLogicalType&>(logical_type);
36 if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) {
37 return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale());
38 }
39 return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale());
40 }
41
MakeArrowInt(const LogicalType & logical_type)42 Result<std::shared_ptr<ArrowType>> MakeArrowInt(const LogicalType& logical_type) {
43 const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
44 switch (integer.bit_width()) {
45 case 8:
46 return integer.is_signed() ? ::arrow::int8() : ::arrow::uint8();
47 case 16:
48 return integer.is_signed() ? ::arrow::int16() : ::arrow::uint16();
49 case 32:
50 return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32();
51 default:
52 return Status::TypeError(logical_type.ToString(),
53 " can not annotate physical type Int32");
54 }
55 }
56
MakeArrowInt64(const LogicalType & logical_type)57 Result<std::shared_ptr<ArrowType>> MakeArrowInt64(const LogicalType& logical_type) {
58 const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
59 switch (integer.bit_width()) {
60 case 64:
61 return integer.is_signed() ? ::arrow::int64() : ::arrow::uint64();
62 default:
63 return Status::TypeError(logical_type.ToString(),
64 " can not annotate physical type Int64");
65 }
66 }
67
MakeArrowTime32(const LogicalType & logical_type)68 Result<std::shared_ptr<ArrowType>> MakeArrowTime32(const LogicalType& logical_type) {
69 const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
70 switch (time.time_unit()) {
71 case LogicalType::TimeUnit::MILLIS:
72 return ::arrow::time32(::arrow::TimeUnit::MILLI);
73 default:
74 return Status::TypeError(logical_type.ToString(),
75 " can not annotate physical type Time32");
76 }
77 }
78
MakeArrowTime64(const LogicalType & logical_type)79 Result<std::shared_ptr<ArrowType>> MakeArrowTime64(const LogicalType& logical_type) {
80 const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
81 switch (time.time_unit()) {
82 case LogicalType::TimeUnit::MICROS:
83 return ::arrow::time64(::arrow::TimeUnit::MICRO);
84 case LogicalType::TimeUnit::NANOS:
85 return ::arrow::time64(::arrow::TimeUnit::NANO);
86 default:
87 return Status::TypeError(logical_type.ToString(),
88 " can not annotate physical type Time64");
89 }
90 }
91
MakeArrowTimestamp(const LogicalType & logical_type)92 Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical_type) {
93 const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type);
94 const bool utc_normalized =
95 timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc();
96 static const char* utc_timezone = "UTC";
97 switch (timestamp.time_unit()) {
98 case LogicalType::TimeUnit::MILLIS:
99 return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone)
100 : ::arrow::timestamp(::arrow::TimeUnit::MILLI));
101 case LogicalType::TimeUnit::MICROS:
102 return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone)
103 : ::arrow::timestamp(::arrow::TimeUnit::MICRO));
104 case LogicalType::TimeUnit::NANOS:
105 return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone)
106 : ::arrow::timestamp(::arrow::TimeUnit::NANO));
107 default:
108 return Status::TypeError("Unrecognized time unit in timestamp logical_type: ",
109 logical_type.ToString());
110 }
111 }
112
FromByteArray(const LogicalType & logical_type)113 Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) {
114 switch (logical_type.type()) {
115 case LogicalType::Type::STRING:
116 return ::arrow::utf8();
117 case LogicalType::Type::DECIMAL:
118 return MakeArrowDecimal(logical_type);
119 case LogicalType::Type::NONE:
120 case LogicalType::Type::ENUM:
121 case LogicalType::Type::JSON:
122 case LogicalType::Type::BSON:
123 return ::arrow::binary();
124 default:
125 return Status::NotImplemented("Unhandled logical logical_type ",
126 logical_type.ToString(), " for binary array");
127 }
128 }
129
FromFLBA(const LogicalType & logical_type,int32_t physical_length)130 Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
131 int32_t physical_length) {
132 switch (logical_type.type()) {
133 case LogicalType::Type::DECIMAL:
134 return MakeArrowDecimal(logical_type);
135 case LogicalType::Type::NONE:
136 case LogicalType::Type::INTERVAL:
137 case LogicalType::Type::UUID:
138 return ::arrow::fixed_size_binary(physical_length);
139 default:
140 return Status::NotImplemented("Unhandled logical logical_type ",
141 logical_type.ToString(),
142 " for fixed-length binary array");
143 }
144 }
145
FromInt32(const LogicalType & logical_type)146 ::arrow::Result<std::shared_ptr<ArrowType>> FromInt32(const LogicalType& logical_type) {
147 switch (logical_type.type()) {
148 case LogicalType::Type::INT:
149 return MakeArrowInt(logical_type);
150 case LogicalType::Type::DATE:
151 return ::arrow::date32();
152 case LogicalType::Type::TIME:
153 return MakeArrowTime32(logical_type);
154 case LogicalType::Type::DECIMAL:
155 return MakeArrowDecimal(logical_type);
156 case LogicalType::Type::NONE:
157 return ::arrow::int32();
158 default:
159 return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
160 " for INT32");
161 }
162 }
163
FromInt64(const LogicalType & logical_type)164 Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {
165 switch (logical_type.type()) {
166 case LogicalType::Type::INT:
167 return MakeArrowInt64(logical_type);
168 case LogicalType::Type::DECIMAL:
169 return MakeArrowDecimal(logical_type);
170 case LogicalType::Type::TIMESTAMP:
171 return MakeArrowTimestamp(logical_type);
172 case LogicalType::Type::TIME:
173 return MakeArrowTime64(logical_type);
174 case LogicalType::Type::NONE:
175 return ::arrow::int64();
176 default:
177 return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
178 " for INT64");
179 }
180 }
181
GetArrowType(Type::type physical_type,const LogicalType & logical_type,int type_length)182 Result<std::shared_ptr<ArrowType>> GetArrowType(Type::type physical_type,
183 const LogicalType& logical_type,
184 int type_length) {
185 if (logical_type.is_invalid() || logical_type.is_null()) {
186 return ::arrow::null();
187 }
188
189 switch (physical_type) {
190 case ParquetType::BOOLEAN:
191 return ::arrow::boolean();
192 case ParquetType::INT32:
193 return FromInt32(logical_type);
194 case ParquetType::INT64:
195 return FromInt64(logical_type);
196 case ParquetType::INT96:
197 return ::arrow::timestamp(::arrow::TimeUnit::NANO);
198 case ParquetType::FLOAT:
199 return ::arrow::float32();
200 case ParquetType::DOUBLE:
201 return ::arrow::float64();
202 case ParquetType::BYTE_ARRAY:
203 return FromByteArray(logical_type);
204 case ParquetType::FIXED_LEN_BYTE_ARRAY:
205 return FromFLBA(logical_type, type_length);
206 default: {
207 // PARQUET-1565: This can occur if the file is corrupt
208 return Status::IOError("Invalid physical column type: ",
209 TypeToString(physical_type));
210 }
211 }
212 }
213
GetArrowType(const schema::PrimitiveNode & primitive)214 Result<std::shared_ptr<ArrowType>> GetArrowType(const schema::PrimitiveNode& primitive) {
215 return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
216 primitive.type_length());
217 }
218
GetArrowType(const ColumnDescriptor & descriptor)219 Result<std::shared_ptr<ArrowType>> GetArrowType(const ColumnDescriptor& descriptor) {
220 return GetArrowType(descriptor.physical_type(), *descriptor.logical_type(),
221 descriptor.type_length());
222 }
223
224 } // namespace arrow
225 } // namespace parquet
226