1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <cstdint>
21 #include <cstring>
22 #include <memory>
23 #include <vector>
24 
25 #include "arrow/util/spaced.h"
26 
27 #include "parquet/exception.h"
28 #include "parquet/platform.h"
29 #include "parquet/types.h"
30 
// Forward declarations of the Arrow classes that are named by the
// declarations in this header.
namespace arrow {

// Arrays
class Array;
class BinaryArray;

// Builders
class ArrayBuilder;
class BinaryBuilder;
class BooleanBuilder;
class FixedSizeBinaryBuilder;
template <typename T>
class NumericBuilder;
template <typename T>
class Dictionary32Builder;

// Data types
class Int32Type;
class Int64Type;
class FloatType;
class DoubleType;
class FixedSizeBinaryType;

}  // namespace arrow
50 
51 namespace parquet {
52 
53 template <typename DType>
54 class TypedEncoder;
55 
56 using BooleanEncoder = TypedEncoder<BooleanType>;
57 using Int32Encoder = TypedEncoder<Int32Type>;
58 using Int64Encoder = TypedEncoder<Int64Type>;
59 using Int96Encoder = TypedEncoder<Int96Type>;
60 using FloatEncoder = TypedEncoder<FloatType>;
61 using DoubleEncoder = TypedEncoder<DoubleType>;
62 using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
63 using FLBAEncoder = TypedEncoder<FLBAType>;
64 
65 template <typename DType>
66 class TypedDecoder;
67 
68 class BooleanDecoder;
69 using Int32Decoder = TypedDecoder<Int32Type>;
70 using Int64Decoder = TypedDecoder<Int64Type>;
71 using Int96Decoder = TypedDecoder<Int96Type>;
72 using FloatDecoder = TypedDecoder<FloatType>;
73 using DoubleDecoder = TypedDecoder<DoubleType>;
74 using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
75 class FLBADecoder;
76 
/// \brief Maps a Parquet physical type to its encoder/decoder classes and to
/// the accumulator types used when decoding into Arrow.
///
/// Only declared here; each supported physical type provides an explicit
/// specialization below.
template <typename T>
struct EncodingTraits;
79 
80 template <>
81 struct EncodingTraits<BooleanType> {
82   using Encoder = BooleanEncoder;
83   using Decoder = BooleanDecoder;
84 
85   using ArrowType = ::arrow::BooleanType;
86   using Accumulator = ::arrow::BooleanBuilder;
87   struct DictAccumulator {};
88 };
89 
90 template <>
91 struct EncodingTraits<Int32Type> {
92   using Encoder = Int32Encoder;
93   using Decoder = Int32Decoder;
94 
95   using ArrowType = ::arrow::Int32Type;
96   using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
97   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
98 };
99 
100 template <>
101 struct EncodingTraits<Int64Type> {
102   using Encoder = Int64Encoder;
103   using Decoder = Int64Decoder;
104 
105   using ArrowType = ::arrow::Int64Type;
106   using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
107   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
108 };
109 
110 template <>
111 struct EncodingTraits<Int96Type> {
112   using Encoder = Int96Encoder;
113   using Decoder = Int96Decoder;
114 
115   struct Accumulator {};
116   struct DictAccumulator {};
117 };
118 
119 template <>
120 struct EncodingTraits<FloatType> {
121   using Encoder = FloatEncoder;
122   using Decoder = FloatDecoder;
123 
124   using ArrowType = ::arrow::FloatType;
125   using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
126   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
127 };
128 
129 template <>
130 struct EncodingTraits<DoubleType> {
131   using Encoder = DoubleEncoder;
132   using Decoder = DoubleDecoder;
133 
134   using ArrowType = ::arrow::DoubleType;
135   using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
136   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
137 };
138 
139 template <>
140 struct EncodingTraits<ByteArrayType> {
141   using Encoder = ByteArrayEncoder;
142   using Decoder = ByteArrayDecoder;
143 
144   /// \brief Internal helper class for decoding BYTE_ARRAY data where we can
145   /// overflow the capacity of a single arrow::BinaryArray
146   struct Accumulator {
147     std::unique_ptr<::arrow::BinaryBuilder> builder;
148     std::vector<std::shared_ptr<::arrow::Array>> chunks;
149   };
150   using ArrowType = ::arrow::BinaryType;
151   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
152 };
153 
154 template <>
155 struct EncodingTraits<FLBAType> {
156   using Encoder = FLBAEncoder;
157   using Decoder = FLBADecoder;
158 
159   using ArrowType = ::arrow::FixedSizeBinaryType;
160   using Accumulator = ::arrow::FixedSizeBinaryBuilder;
161   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
162 };
163 
// Forward declaration; this header only uses ColumnDescriptor through
// `const ColumnDescriptor*` parameters of the factory functions.
class ColumnDescriptor;
165 
// Untyped base for all encoders. It exposes the operations that do not depend
// on the physical value type; the typed Put() overloads are declared by
// TypedEncoder.
class Encoder {
 public:
  virtual ~Encoder() = default;

  /// \brief Estimate of the encoded data size.
  /// NOTE(review): presumably in bytes, for the values buffered so far --
  /// confirm against the implementations.
  virtual int64_t EstimatedDataEncodedSize() = 0;

  /// \brief Return the encoded values as a Buffer.
  /// NOTE(review): presumably also resets the buffered state -- confirm.
  virtual std::shared_ptr<Buffer> FlushValues() = 0;

  /// \brief The encoding implemented by this encoder.
  virtual Encoding::type encoding() const = 0;

  /// \brief Encode the values of an Arrow array.
  /// \param[in] values the array to encode
  virtual void Put(const ::arrow::Array& values) = 0;

  /// \brief The memory pool associated with this encoder.
  virtual MemoryPool* memory_pool() const = 0;
};
179 
180 // Base class for value encoders. Since encoders may or not have state (e.g.,
181 // dictionary encoding) we use a class instance to maintain any state.
182 //
183 // Encode interfaces are internal, subject to change without deprecation.
184 template <typename DType>
185 class TypedEncoder : virtual public Encoder {
186  public:
187   typedef typename DType::c_type T;
188 
189   using Encoder::Put;
190 
191   virtual void Put(const T* src, int num_values) = 0;
192 
193   virtual void Put(const std::vector<T>& src, int num_values = -1);
194 
195   virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
196                          int64_t valid_bits_offset) = 0;
197 };
198 
199 template <typename DType>
200 void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
201   if (num_values == -1) {
202     num_values = static_cast<int>(src.size());
203   }
204   Put(src.data(), num_values);
205 }
206 
/// Explicit specialization of the vector overload for BOOLEAN.
///
/// The generic definition calls src.data(), which std::vector<bool> does not
/// provide, so BOOLEAN needs its own definition. The body is intentionally
/// empty: this stub itself encodes nothing.
template <>
inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
  // NOTE(wesm): This stub is here only to satisfy the compiler; it is
  // overridden later with the actual implementation
}
212 
// Base class for dictionary encoders. Extends the typed encoder interface with
// access to the dictionary itself and to the buffered dictionary indices.
template <typename DType>
class DictEncoder : virtual public TypedEncoder<DType> {
 public:
  /// Writes out any buffered indices to buffer preceded by the bit width of this data.
  /// Returns the number of bytes written.
  /// If the supplied buffer is not big enough, returns -1.
  /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
  /// to size buffer.
  virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;

  /// Size of the encoded dictionary; WriteDict() requires a buffer of this
  /// many bytes.
  virtual int dict_encoded_size() = 0;

  /// Bit width of the dictionary indices (see WriteIndices()).
  virtual int bit_width() const = 0;

  /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
  /// dict_encoded_size() bytes.
  virtual void WriteDict(uint8_t* buffer) = 0;

  /// Number of entries.
  /// NOTE(review): presumably the number of distinct dictionary values --
  /// confirm against the implementations.
  virtual int num_entries() const = 0;

  /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
  /// assumed (without any boundschecking) that the indices reference
  /// pre-existing dictionary values
  /// \param[in] indices the dictionary index values. Only Int32Array currently
  /// supported
  virtual void PutIndices(const ::arrow::Array& indices) = 0;

  /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
  /// separately. Currently throws exception if the current dictionary memo is
  /// non-empty
  /// \param[in] values the dictionary values. Only valid for certain
  /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
  virtual void PutDictionary(const ::arrow::Array& values) = 0;
};
249 
250 // ----------------------------------------------------------------------
251 // Value decoding
252 
// Untyped base for all decoders. It exposes the operations that do not depend
// on the physical value type.
class Decoder {
 public:
  virtual ~Decoder() = default;

  // Sets the data for a new page. This will be called multiple times on the same
  // decoder and should reset all internal state.
  //
  // num_values: number of values in the page
  // data: pointer to the encoded page data
  // len: length of data (presumably in bytes -- confirm against implementations)
  virtual void SetData(int num_values, const uint8_t* data, int len) = 0;

  // Returns the number of values left (for the last call to SetData()). This is
  // the number of values left in this page.
  virtual int values_left() const = 0;

  // Returns the encoding implemented by this decoder.
  virtual Encoding::type encoding() const = 0;
};
266 
267 template <typename DType>
268 class TypedDecoder : virtual public Decoder {
269  public:
270   using T = typename DType::c_type;
271 
272   /// \brief Decode values into a buffer
273   ///
274   /// Subclasses may override the more specialized Decode methods below.
275   ///
276   /// \param[in] buffer destination for decoded values
277   /// \param[in] max_values maximum number of values to decode
278   /// \return The number of values decoded. Should be identical to max_values except
279   /// at the end of the current data page.
280   virtual int Decode(T* buffer, int max_values) = 0;
281 
282   /// \brief Decode the values in this data page but leave spaces for null entries.
283   ///
284   /// \param[in] buffer destination for decoded values
285   /// \param[in] num_values size of the def_levels and buffer arrays including the number
286   /// of null slots
287   /// \param[in] null_count number of null slots
288   /// \param[in] valid_bits bitmap data indicating position of valid slots
289   /// \param[in] valid_bits_offset offset into valid_bits
290   /// \return The number of values decoded, including nulls.
291   virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
292                            const uint8_t* valid_bits, int64_t valid_bits_offset) {
293     if (null_count > 0) {
294       int values_to_read = num_values - null_count;
295       int values_read = Decode(buffer, values_to_read);
296       if (values_read != values_to_read) {
297         throw ParquetException("Number of values / definition_levels read did not match");
298       }
299 
300       return ::arrow::util::internal::SpacedExpand<T>(buffer, num_values, null_count,
301                                                       valid_bits, valid_bits_offset);
302     } else {
303       return Decode(buffer, num_values);
304     }
305   }
306 
307   /// \brief Decode into an ArrayBuilder or other accumulator
308   ///
309   /// This function assumes the definition levels were already decoded
310   /// as a validity bitmap in the given `valid_bits`.  `null_count`
311   /// is the number of 0s in `valid_bits`.
312   /// As a space optimization, it is allowed for `valid_bits` to be null
313   /// if `null_count` is zero.
314   ///
315   /// \return number of values decoded
316   virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
317                           int64_t valid_bits_offset,
318                           typename EncodingTraits<DType>::Accumulator* out) = 0;
319 
320   /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
321   ///
322   /// \return number of values decoded
323   int DecodeArrowNonNull(int num_values,
324                          typename EncodingTraits<DType>::Accumulator* out) {
325     return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
326   }
327 
328   /// \brief Decode into a DictionaryBuilder
329   ///
330   /// This function assumes the definition levels were already decoded
331   /// as a validity bitmap in the given `valid_bits`.  `null_count`
332   /// is the number of 0s in `valid_bits`.
333   /// As a space optimization, it is allowed for `valid_bits` to be null
334   /// if `null_count` is zero.
335   ///
336   /// \return number of values decoded
337   virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
338                           int64_t valid_bits_offset,
339                           typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
340 
341   /// \brief Decode into a DictionaryBuilder ignoring nulls
342   ///
343   /// \return number of values decoded
344   int DecodeArrowNonNull(int num_values,
345                          typename EncodingTraits<DType>::DictAccumulator* builder) {
346     return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
347   }
348 };
349 
/// Interface for dictionary decoders: extends the typed decoder with access
/// to the dictionary and with index-only decoding.
template <typename DType>
class DictDecoder : virtual public TypedDecoder<DType> {
 public:
  /// C type of the dictionary values.
  using T = typename DType::c_type;

  /// \brief Set the dictionary for this decoder
  /// \param[in] dictionary a decoder supplying the dictionary values
  virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;

  /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
  /// but do not append any indices
  virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;

  /// \brief Decode only dictionary indices and append to dictionary
  /// builder. The builder must have had the dictionary from this decoder
  /// inserted already.
  ///
  /// \warning Remember to reset the builder each time the dict decoder is initialized
  /// with a new dictionary page
  virtual int DecodeIndicesSpaced(int num_values, int null_count,
                                  const uint8_t* valid_bits, int64_t valid_bits_offset,
                                  ::arrow::ArrayBuilder* builder) = 0;

  /// \brief Decode only dictionary indices (no nulls)
  ///
  /// \warning Remember to reset the builder each time the dict decoder is initialized
  /// with a new dictionary page
  virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;

  /// \brief Decode only dictionary indices (no nulls). Same as above
  /// DecodeIndices but target is an array instead of a builder.
  ///
  /// \note API EXPERIMENTAL
  virtual int DecodeIndices(int num_values, int32_t* indices) = 0;

  /// \brief Get dictionary. The reader will call this API when it encounters a
  /// new dictionary.
  ///
  /// \param[out] dictionary The pointer to dictionary values. Dictionary is owned by
  /// the decoder and is destroyed when the decoder is destroyed.
  /// \param[out] dictionary_length The dictionary length.
  ///
  /// \note API EXPERIMENTAL
  virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
};
393 
// ----------------------------------------------------------------------
// TypedDecoder specializations and encoder/decoder factory functions
396 
/// Decoder interface for BOOLEAN values. In addition to the typed Decode()
/// inherited from TypedDecoder<BooleanType>, it declares an overload that
/// writes into a uint8_t buffer.
class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
 public:
  // Re-expose the inherited Decode() overload, which the declaration below
  // would otherwise hide.
  using TypedDecoder<BooleanType>::Decode;

  /// \brief Decode values into a uint8_t buffer
  /// NOTE(review): the output layout (e.g. bit-packed vs. one byte per value)
  /// is not visible from this header -- confirm against the implementations.
  virtual int Decode(uint8_t* buffer, int max_values) = 0;
};
402 
/// Decoder interface for FIXED_LEN_BYTE_ARRAY values. It currently adds no
/// methods of its own on top of TypedDecoder<FLBAType>.
class FLBADecoder : virtual public TypedDecoder<FLBAType> {
 public:
  // Names the inherited DecodeSpaced() explicitly in this class's scope.
  using TypedDecoder<FLBAType>::DecodeSpaced;

  // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
  // there is value in adding specialized read methods for
  // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
  // then perhaps not
};
412 
/// \brief Create an untyped encoder for a physical type and encoding
///
/// \param[in] type_num the Parquet physical type of the values
/// \param[in] encoding the requested encoding
/// \param[in] use_dictionary defaults to false. NOTE(review): presumably
/// requests a dictionary encoder -- confirm in the implementation
/// \param[in] descr column descriptor; defaults to null
/// \param[in] pool memory pool; defaults to the Arrow default memory pool
/// \return the encoder, owned by the caller
PARQUET_EXPORT
std::unique_ptr<Encoder> MakeEncoder(
    Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
    const ColumnDescriptor* descr = NULLPTR,
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
418 
419 template <typename DType>
420 std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
421     Encoding::type encoding, bool use_dictionary = false,
422     const ColumnDescriptor* descr = NULLPTR,
423     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
424   using OutType = typename EncodingTraits<DType>::Encoder;
425   std::unique_ptr<Encoder> base =
426       MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
427   return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
428 }
429 
/// \brief Create an untyped decoder for a physical type and encoding
///
/// \param[in] type_num the Parquet physical type of the values
/// \param[in] encoding the encoding of the data to decode
/// \param[in] descr column descriptor; defaults to null
/// \return the decoder, owned by the caller
PARQUET_EXPORT
std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
                                     const ColumnDescriptor* descr = NULLPTR);
433 
namespace detail {

/// \brief Create an untyped dictionary decoder for a physical type
///
/// Implementation detail of the typed MakeDictDecoder<DType>() template
/// declared after this namespace, which downcasts the result.
///
/// \param[in] type_num the Parquet physical type of the values
/// \param[in] descr column descriptor
/// \param[in] pool memory pool
/// \return the decoder, owned by the caller
PARQUET_EXPORT
std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
                                         const ColumnDescriptor* descr,
                                         ::arrow::MemoryPool* pool);

}  // namespace detail
442 
443 template <typename DType>
444 std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
445     const ColumnDescriptor* descr = NULLPTR,
446     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
447   using OutType = DictDecoder<DType>;
448   auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
449   return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
450 }
451 
452 template <typename DType>
453 std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
454     Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
455   using OutType = typename EncodingTraits<DType>::Decoder;
456   std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
457   return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
458 }
459 
460 }  // namespace parquet
461