// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>

#include "arrow/util/spaced.h"

#include "parquet/exception.h"
#include "parquet/platform.h"
#include "parquet/types.h"

// Forward declarations of Arrow classes. Within this header they are only
// named as template arguments, pointers or references.
namespace arrow {

class Array;
class ArrayBuilder;
class BinaryArray;
class BinaryBuilder;
class BooleanBuilder;
class Int32Type;
class Int64Type;
class FloatType;
class DoubleType;
class FixedSizeBinaryType;
template <typename T>
class NumericBuilder;
class FixedSizeBinaryBuilder;
template <typename T>
class Dictionary32Builder;

}  // namespace arrow

namespace parquet {

template <typename DType>
class TypedEncoder;

// One encoder alias per Parquet physical type.
using BooleanEncoder = TypedEncoder<BooleanType>;
using Int32Encoder = TypedEncoder<Int32Type>;
using Int64Encoder = TypedEncoder<Int64Type>;
using Int96Encoder = TypedEncoder<Int96Type>;
using FloatEncoder = TypedEncoder<FloatType>;
using DoubleEncoder = TypedEncoder<DoubleType>;
using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
using FLBAEncoder = TypedEncoder<FLBAType>;

template <typename DType>
class TypedDecoder;

// One decoder per Parquet physical type. BooleanDecoder and FLBADecoder are
// declared as classes of their own (defined later in this header) instead of
// being plain TypedDecoder aliases.
class BooleanDecoder;
using Int32Decoder = TypedDecoder<Int32Type>;
using Int64Decoder = TypedDecoder<Int64Type>;
using Int96Decoder = TypedDecoder<Int96Type>;
using FloatDecoder = TypedDecoder<FloatType>;
using DoubleDecoder = TypedDecoder<DoubleType>;
using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
class FLBADecoder;

/// \brief Per-physical-type lookup table
///
/// Each specialization names the Encoder and Decoder interfaces for a Parquet
/// physical type, plus the `Accumulator` and `DictAccumulator` types that
/// TypedDecoder::DecodeArrow writes into. Most specializations also name the
/// corresponding `ArrowType`.
template <typename T>
struct EncodingTraits;

template <>
struct EncodingTraits<BooleanType> {
  using Encoder = BooleanEncoder;
  using Decoder = BooleanDecoder;

  using ArrowType = ::arrow::BooleanType;
  using Accumulator = ::arrow::BooleanBuilder;
  // Empty placeholder: no Arrow dictionary builder is mapped for this type.
  struct DictAccumulator {};
};

template <>
struct EncodingTraits<Int32Type> {
  using Encoder = Int32Encoder;
  using Decoder = Int32Decoder;

  using ArrowType = ::arrow::Int32Type;
  using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
};

template <>
struct EncodingTraits<Int64Type> {
  using Encoder = Int64Encoder;
  using Decoder = Int64Decoder;

  using ArrowType = ::arrow::Int64Type;
  using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
};

// Unlike the other specializations, this one defines no ArrowType and both
// accumulators are empty placeholder structs.
template <>
struct EncodingTraits<Int96Type> {
  using Encoder = Int96Encoder;
  using Decoder = Int96Decoder;

  struct Accumulator {};
  struct DictAccumulator {};
};

template <>
struct EncodingTraits<FloatType> {
  using Encoder = FloatEncoder;
  using Decoder = FloatDecoder;

  using ArrowType = ::arrow::FloatType;
  using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
};

template <>
struct EncodingTraits<DoubleType> {
  using Encoder = DoubleEncoder;
  using Decoder = DoubleDecoder;

  using ArrowType = ::arrow::DoubleType;
  using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
};

template <>
struct EncodingTraits<ByteArrayType> {
  using Encoder = ByteArrayEncoder;
  using Decoder = ByteArrayDecoder;

  /// \brief Internal helper class for decoding BYTE_ARRAY data where we can
  /// overflow the capacity of a single arrow::BinaryArray
  struct Accumulator {
    // Builder for the data currently being accumulated.
    std::unique_ptr<::arrow::BinaryBuilder> builder;
    // Arrays collected so far. NOTE(review): presumably one entry per
    // finished builder whenever the capacity limit is hit -- confirm against
    // the decoder implementation.
    std::vector<std::shared_ptr<::arrow::Array>> chunks;
  };
  using ArrowType = ::arrow::BinaryType;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
};

template <>
struct EncodingTraits<FLBAType> {
  using Encoder = FLBAEncoder;
  using Decoder = FLBADecoder;

  using ArrowType = ::arrow::FixedSizeBinaryType;
  using Accumulator = ::arrow::FixedSizeBinaryBuilder;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
};

class ColumnDescriptor;

// Untyped base for all encoders
class Encoder {
 public:
  virtual ~Encoder() = default;

  // Size estimate for the encoded data. DictEncoder::WriteIndices documents
  // this as the value to use when sizing its output buffer.
  virtual int64_t EstimatedDataEncodedSize() = 0;
  // NOTE(review): presumably returns the encoded values buffered so far --
  // the exact contract is defined by the implementations.
  virtual std::shared_ptr<Buffer> FlushValues() = 0;
  // The Parquet encoding this encoder implements.
  virtual Encoding::type encoding() const = 0;

  // Type-erased entry point taking an Arrow array; the typed overloads are
  // declared on TypedEncoder.
  virtual void Put(const ::arrow::Array& values) = 0;

  virtual MemoryPool* memory_pool() const = 0;
};

// Base class for value encoders. Since encoders may or may not have state (e.g.,
// dictionary encoding) we use a class instance to maintain any state.
//
// Encode interfaces are internal, subject to change without deprecation.
184 template <typename DType> 185 class TypedEncoder : virtual public Encoder { 186 public: 187 typedef typename DType::c_type T; 188 189 using Encoder::Put; 190 191 virtual void Put(const T* src, int num_values) = 0; 192 193 virtual void Put(const std::vector<T>& src, int num_values = -1); 194 195 virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, 196 int64_t valid_bits_offset) = 0; 197 }; 198 199 template <typename DType> 200 void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) { 201 if (num_values == -1) { 202 num_values = static_cast<int>(src.size()); 203 } 204 Put(src.data(), num_values); 205 } 206 207 template <> 208 inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) { 209 // NOTE(wesm): This stub is here only to satisfy the compiler; it is 210 // overridden later with the actual implementation 211 } 212 213 // Base class for dictionary encoders 214 template <typename DType> 215 class DictEncoder : virtual public TypedEncoder<DType> { 216 public: 217 /// Writes out any buffered indices to buffer preceded by the bit width of this data. 218 /// Returns the number of bytes written. 219 /// If the supplied buffer is not big enough, returns -1. 220 /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize() 221 /// to size buffer. 222 virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0; 223 224 virtual int dict_encoded_size() = 0; 225 // virtual int dict_encoded_size() { return dict_encoded_size_; } 226 227 virtual int bit_width() const = 0; 228 229 /// Writes out the encoded dictionary to buffer. buffer must be preallocated to 230 /// dict_encoded_size() bytes. 231 virtual void WriteDict(uint8_t* buffer) = 0; 232 233 virtual int num_entries() const = 0; 234 235 /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. 
It is 236 /// assumed (without any boundschecking) that the indices reference 237 /// pre-existing dictionary values 238 /// \param[in] indices the dictionary index values. Only Int32Array currently 239 /// supported 240 virtual void PutIndices(const ::arrow::Array& indices) = 0; 241 242 /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices 243 /// separately. Currently throws exception if the current dictionary memo is 244 /// non-empty 245 /// \param[in] values the dictionary values. Only valid for certain 246 /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray 247 virtual void PutDictionary(const ::arrow::Array& values) = 0; 248 }; 249 250 // ---------------------------------------------------------------------- 251 // Value decoding 252 253 class Decoder { 254 public: 255 virtual ~Decoder() = default; 256 257 // Sets the data for a new page. This will be called multiple times on the same 258 // decoder and should reset all internal state. 259 virtual void SetData(int num_values, const uint8_t* data, int len) = 0; 260 261 // Returns the number of values left (for the last call to SetData()). This is 262 // the number of values left in this page. 263 virtual int values_left() const = 0; 264 virtual Encoding::type encoding() const = 0; 265 }; 266 267 template <typename DType> 268 class TypedDecoder : virtual public Decoder { 269 public: 270 using T = typename DType::c_type; 271 272 /// \brief Decode values into a buffer 273 /// 274 /// Subclasses may override the more specialized Decode methods below. 275 /// 276 /// \param[in] buffer destination for decoded values 277 /// \param[in] max_values maximum number of values to decode 278 /// \return The number of values decoded. Should be identical to max_values except 279 /// at the end of the current data page. 280 virtual int Decode(T* buffer, int max_values) = 0; 281 282 /// \brief Decode the values in this data page but leave spaces for null entries. 
283 /// 284 /// \param[in] buffer destination for decoded values 285 /// \param[in] num_values size of the def_levels and buffer arrays including the number 286 /// of null slots 287 /// \param[in] null_count number of null slots 288 /// \param[in] valid_bits bitmap data indicating position of valid slots 289 /// \param[in] valid_bits_offset offset into valid_bits 290 /// \return The number of values decoded, including nulls. 291 virtual int DecodeSpaced(T* buffer, int num_values, int null_count, 292 const uint8_t* valid_bits, int64_t valid_bits_offset) { 293 if (null_count > 0) { 294 int values_to_read = num_values - null_count; 295 int values_read = Decode(buffer, values_to_read); 296 if (values_read != values_to_read) { 297 throw ParquetException("Number of values / definition_levels read did not match"); 298 } 299 300 return ::arrow::util::internal::SpacedExpand<T>(buffer, num_values, null_count, 301 valid_bits, valid_bits_offset); 302 } else { 303 return Decode(buffer, num_values); 304 } 305 } 306 307 /// \brief Decode into an ArrayBuilder or other accumulator 308 /// 309 /// This function assumes the definition levels were already decoded 310 /// as a validity bitmap in the given `valid_bits`. `null_count` 311 /// is the number of 0s in `valid_bits`. 312 /// As a space optimization, it is allowed for `valid_bits` to be null 313 /// if `null_count` is zero. 
314 /// 315 /// \return number of values decoded 316 virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, 317 int64_t valid_bits_offset, 318 typename EncodingTraits<DType>::Accumulator* out) = 0; 319 320 /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls 321 /// 322 /// \return number of values decoded 323 int DecodeArrowNonNull(int num_values, 324 typename EncodingTraits<DType>::Accumulator* out) { 325 return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out); 326 } 327 328 /// \brief Decode into a DictionaryBuilder 329 /// 330 /// This function assumes the definition levels were already decoded 331 /// as a validity bitmap in the given `valid_bits`. `null_count` 332 /// is the number of 0s in `valid_bits`. 333 /// As a space optimization, it is allowed for `valid_bits` to be null 334 /// if `null_count` is zero. 335 /// 336 /// \return number of values decoded 337 virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, 338 int64_t valid_bits_offset, 339 typename EncodingTraits<DType>::DictAccumulator* builder) = 0; 340 341 /// \brief Decode into a DictionaryBuilder ignoring nulls 342 /// 343 /// \return number of values decoded 344 int DecodeArrowNonNull(int num_values, 345 typename EncodingTraits<DType>::DictAccumulator* builder) { 346 return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder); 347 } 348 }; 349 350 template <typename DType> 351 class DictDecoder : virtual public TypedDecoder<DType> { 352 public: 353 using T = typename DType::c_type; 354 355 virtual void SetDict(TypedDecoder<DType>* dictionary) = 0; 356 357 /// \brief Insert dictionary values into the Arrow dictionary builder's memo, 358 /// but do not append any indices 359 virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0; 360 361 /// \brief Decode only dictionary indices and append to dictionary 362 /// builder. 
The builder must have had the dictionary from this decoder 363 /// inserted already. 364 /// 365 /// \warning Remember to reset the builder each time the dict decoder is initialized 366 /// with a new dictionary page 367 virtual int DecodeIndicesSpaced(int num_values, int null_count, 368 const uint8_t* valid_bits, int64_t valid_bits_offset, 369 ::arrow::ArrayBuilder* builder) = 0; 370 371 /// \brief Decode only dictionary indices (no nulls) 372 /// 373 /// \warning Remember to reset the builder each time the dict decoder is initialized 374 /// with a new dictionary page 375 virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0; 376 377 /// \brief Decode only dictionary indices (no nulls). Same as above 378 /// DecodeIndices but target is an array instead of a builder. 379 /// 380 /// \note API EXPERIMENTAL 381 virtual int DecodeIndices(int num_values, int32_t* indices) = 0; 382 383 /// \brief Get dictionary. The reader will call this API when it encounters a 384 /// new dictionary. 385 /// 386 /// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by 387 /// the decoder and is destroyed when the decoder is destroyed. 388 /// @param[out] dictionary_length The dictionary length. 
389 /// 390 /// \note API EXPERIMENTAL 391 virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0; 392 }; 393 394 // ---------------------------------------------------------------------- 395 // TypedEncoder specializations, traits, and factory functions 396 397 class BooleanDecoder : virtual public TypedDecoder<BooleanType> { 398 public: 399 using TypedDecoder<BooleanType>::Decode; 400 virtual int Decode(uint8_t* buffer, int max_values) = 0; 401 }; 402 403 class FLBADecoder : virtual public TypedDecoder<FLBAType> { 404 public: 405 using TypedDecoder<FLBAType>::DecodeSpaced; 406 407 // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if 408 // there is value in adding specialized read methods for 409 // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type 410 // then perhaps not 411 }; 412 413 PARQUET_EXPORT 414 std::unique_ptr<Encoder> MakeEncoder( 415 Type::type type_num, Encoding::type encoding, bool use_dictionary = false, 416 const ColumnDescriptor* descr = NULLPTR, 417 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); 418 419 template <typename DType> 420 std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder( 421 Encoding::type encoding, bool use_dictionary = false, 422 const ColumnDescriptor* descr = NULLPTR, 423 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { 424 using OutType = typename EncodingTraits<DType>::Encoder; 425 std::unique_ptr<Encoder> base = 426 MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool); 427 return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release())); 428 } 429 430 PARQUET_EXPORT 431 std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding, 432 const ColumnDescriptor* descr = NULLPTR); 433 434 namespace detail { 435 436 PARQUET_EXPORT 437 std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num, 438 const ColumnDescriptor* descr, 439 ::arrow::MemoryPool* pool); 440 441 
} // namespace detail 442 443 template <typename DType> 444 std::unique_ptr<DictDecoder<DType>> MakeDictDecoder( 445 const ColumnDescriptor* descr = NULLPTR, 446 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { 447 using OutType = DictDecoder<DType>; 448 auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); 449 return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release())); 450 } 451 452 template <typename DType> 453 std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder( 454 Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) { 455 using OutType = typename EncodingTraits<DType>::Decoder; 456 std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr); 457 return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release())); 458 } 459 460 } // namespace parquet 461