1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "parquet/arrow/schema_internal.h"
19 
20 #include "arrow/type.h"
21 
22 using ArrowType = ::arrow::DataType;
23 using ArrowTypeId = ::arrow::Type;
24 using ParquetType = parquet::Type;
25 
26 namespace parquet {
27 
28 namespace arrow {
29 
30 using ::arrow::Result;
31 using ::arrow::Status;
32 using ::arrow::internal::checked_cast;
33 
MakeArrowDecimal(const LogicalType & logical_type)34 Result<std::shared_ptr<ArrowType>> MakeArrowDecimal(const LogicalType& logical_type) {
35   const auto& decimal = checked_cast<const DecimalLogicalType&>(logical_type);
36   if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) {
37     return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale());
38   }
39   return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale());
40 }
41 
MakeArrowInt(const LogicalType & logical_type)42 Result<std::shared_ptr<ArrowType>> MakeArrowInt(const LogicalType& logical_type) {
43   const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
44   switch (integer.bit_width()) {
45     case 8:
46       return integer.is_signed() ? ::arrow::int8() : ::arrow::uint8();
47     case 16:
48       return integer.is_signed() ? ::arrow::int16() : ::arrow::uint16();
49     case 32:
50       return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32();
51     default:
52       return Status::TypeError(logical_type.ToString(),
53                                " can not annotate physical type Int32");
54   }
55 }
56 
MakeArrowInt64(const LogicalType & logical_type)57 Result<std::shared_ptr<ArrowType>> MakeArrowInt64(const LogicalType& logical_type) {
58   const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
59   switch (integer.bit_width()) {
60     case 64:
61       return integer.is_signed() ? ::arrow::int64() : ::arrow::uint64();
62     default:
63       return Status::TypeError(logical_type.ToString(),
64                                " can not annotate physical type Int64");
65   }
66 }
67 
MakeArrowTime32(const LogicalType & logical_type)68 Result<std::shared_ptr<ArrowType>> MakeArrowTime32(const LogicalType& logical_type) {
69   const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
70   switch (time.time_unit()) {
71     case LogicalType::TimeUnit::MILLIS:
72       return ::arrow::time32(::arrow::TimeUnit::MILLI);
73     default:
74       return Status::TypeError(logical_type.ToString(),
75                                " can not annotate physical type Time32");
76   }
77 }
78 
MakeArrowTime64(const LogicalType & logical_type)79 Result<std::shared_ptr<ArrowType>> MakeArrowTime64(const LogicalType& logical_type) {
80   const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
81   switch (time.time_unit()) {
82     case LogicalType::TimeUnit::MICROS:
83       return ::arrow::time64(::arrow::TimeUnit::MICRO);
84     case LogicalType::TimeUnit::NANOS:
85       return ::arrow::time64(::arrow::TimeUnit::NANO);
86     default:
87       return Status::TypeError(logical_type.ToString(),
88                                " can not annotate physical type Time64");
89   }
90 }
91 
MakeArrowTimestamp(const LogicalType & logical_type)92 Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical_type) {
93   const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type);
94   const bool utc_normalized =
95       timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc();
96   static const char* utc_timezone = "UTC";
97   switch (timestamp.time_unit()) {
98     case LogicalType::TimeUnit::MILLIS:
99       return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone)
100                              : ::arrow::timestamp(::arrow::TimeUnit::MILLI));
101     case LogicalType::TimeUnit::MICROS:
102       return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone)
103                              : ::arrow::timestamp(::arrow::TimeUnit::MICRO));
104     case LogicalType::TimeUnit::NANOS:
105       return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone)
106                              : ::arrow::timestamp(::arrow::TimeUnit::NANO));
107     default:
108       return Status::TypeError("Unrecognized time unit in timestamp logical_type: ",
109                                logical_type.ToString());
110   }
111 }
112 
FromByteArray(const LogicalType & logical_type)113 Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) {
114   switch (logical_type.type()) {
115     case LogicalType::Type::STRING:
116       return ::arrow::utf8();
117     case LogicalType::Type::DECIMAL:
118       return MakeArrowDecimal(logical_type);
119     case LogicalType::Type::NONE:
120     case LogicalType::Type::ENUM:
121     case LogicalType::Type::JSON:
122     case LogicalType::Type::BSON:
123       return ::arrow::binary();
124     default:
125       return Status::NotImplemented("Unhandled logical logical_type ",
126                                     logical_type.ToString(), " for binary array");
127   }
128 }
129 
FromFLBA(const LogicalType & logical_type,int32_t physical_length)130 Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
131                                             int32_t physical_length) {
132   switch (logical_type.type()) {
133     case LogicalType::Type::DECIMAL:
134       return MakeArrowDecimal(logical_type);
135     case LogicalType::Type::NONE:
136     case LogicalType::Type::INTERVAL:
137     case LogicalType::Type::UUID:
138       return ::arrow::fixed_size_binary(physical_length);
139     default:
140       return Status::NotImplemented("Unhandled logical logical_type ",
141                                     logical_type.ToString(),
142                                     " for fixed-length binary array");
143   }
144 }
145 
FromInt32(const LogicalType & logical_type)146 ::arrow::Result<std::shared_ptr<ArrowType>> FromInt32(const LogicalType& logical_type) {
147   switch (logical_type.type()) {
148     case LogicalType::Type::INT:
149       return MakeArrowInt(logical_type);
150     case LogicalType::Type::DATE:
151       return ::arrow::date32();
152     case LogicalType::Type::TIME:
153       return MakeArrowTime32(logical_type);
154     case LogicalType::Type::DECIMAL:
155       return MakeArrowDecimal(logical_type);
156     case LogicalType::Type::NONE:
157       return ::arrow::int32();
158     default:
159       return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
160                                     " for INT32");
161   }
162 }
163 
FromInt64(const LogicalType & logical_type)164 Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {
165   switch (logical_type.type()) {
166     case LogicalType::Type::INT:
167       return MakeArrowInt64(logical_type);
168     case LogicalType::Type::DECIMAL:
169       return MakeArrowDecimal(logical_type);
170     case LogicalType::Type::TIMESTAMP:
171       return MakeArrowTimestamp(logical_type);
172     case LogicalType::Type::TIME:
173       return MakeArrowTime64(logical_type);
174     case LogicalType::Type::NONE:
175       return ::arrow::int64();
176     default:
177       return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
178                                     " for INT64");
179   }
180 }
181 
GetArrowType(Type::type physical_type,const LogicalType & logical_type,int type_length)182 Result<std::shared_ptr<ArrowType>> GetArrowType(Type::type physical_type,
183                                                 const LogicalType& logical_type,
184                                                 int type_length) {
185   if (logical_type.is_invalid() || logical_type.is_null()) {
186     return ::arrow::null();
187   }
188 
189   switch (physical_type) {
190     case ParquetType::BOOLEAN:
191       return ::arrow::boolean();
192     case ParquetType::INT32:
193       return FromInt32(logical_type);
194     case ParquetType::INT64:
195       return FromInt64(logical_type);
196     case ParquetType::INT96:
197       return ::arrow::timestamp(::arrow::TimeUnit::NANO);
198     case ParquetType::FLOAT:
199       return ::arrow::float32();
200     case ParquetType::DOUBLE:
201       return ::arrow::float64();
202     case ParquetType::BYTE_ARRAY:
203       return FromByteArray(logical_type);
204     case ParquetType::FIXED_LEN_BYTE_ARRAY:
205       return FromFLBA(logical_type, type_length);
206     default: {
207       // PARQUET-1565: This can occur if the file is corrupt
208       return Status::IOError("Invalid physical column type: ",
209                              TypeToString(physical_type));
210     }
211   }
212 }
213 
GetArrowType(const schema::PrimitiveNode & primitive)214 Result<std::shared_ptr<ArrowType>> GetArrowType(const schema::PrimitiveNode& primitive) {
215   return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
216                       primitive.type_length());
217 }
218 
GetArrowType(const ColumnDescriptor & descriptor)219 Result<std::shared_ptr<ArrowType>> GetArrowType(const ColumnDescriptor& descriptor) {
220   return GetArrowType(descriptor.physical_type(), *descriptor.logical_type(),
221                       descriptor.type_length());
222 }
223 
224 }  // namespace arrow
225 }  // namespace parquet
226