1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <limits>
21 #include <memory>
22 #include <random>
23 #include <string>
24 #include <utility>
25 #include <vector>
26 
27 #include "arrow/api.h"
28 #include "arrow/testing/gtest_util.h"
29 #include "arrow/testing/random.h"
30 #include "arrow/type_traits.h"
31 #include "arrow/util/decimal.h"
32 #include "parquet/column_reader.h"
33 
34 namespace parquet {
35 
36 using internal::RecordReader;
37 
38 namespace arrow {
39 
40 using ::arrow::Array;
41 using ::arrow::ChunkedArray;
42 using ::arrow::Status;
43 
44 template <int32_t PRECISION>
45 struct DecimalWithPrecisionAndScale {
46   static_assert(PRECISION >= 1 && PRECISION <= 38, "Invalid precision value");
47 
48   using type = ::arrow::Decimal128Type;
49   static constexpr ::arrow::Type::type type_id = ::arrow::Decimal128Type::type_id;
50   static constexpr int32_t precision = PRECISION;
51   static constexpr int32_t scale = PRECISION - 1;
52 };
53 
54 template <class ArrowType>
NonNullArray(size_t size,std::shared_ptr<Array> * out)55 ::arrow::enable_if_floating_point<ArrowType, Status> NonNullArray(
56     size_t size, std::shared_ptr<Array>* out) {
57   using c_type = typename ArrowType::c_type;
58   std::vector<c_type> values;
59   ::arrow::random_real(size, 0, static_cast<c_type>(0), static_cast<c_type>(1), &values);
60   ::arrow::NumericBuilder<ArrowType> builder;
61   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
62   return builder.Finish(out);
63 }
64 
65 template <class ArrowType>
NonNullArray(size_t size,std::shared_ptr<Array> * out)66 ::arrow::enable_if_integer<ArrowType, Status> NonNullArray(size_t size,
67                                                            std::shared_ptr<Array>* out) {
68   std::vector<typename ArrowType::c_type> values;
69   ::arrow::randint(size, 0, 64, &values);
70 
71   // Passing data type so this will work with TimestampType too
72   ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
73                                              ::arrow::default_memory_pool());
74   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
75   return builder.Finish(out);
76 }
77 
78 template <class ArrowType>
NonNullArray(size_t size,std::shared_ptr<Array> * out)79 ::arrow::enable_if_date<ArrowType, Status> NonNullArray(size_t size,
80                                                         std::shared_ptr<Array>* out) {
81   std::vector<typename ArrowType::c_type> values;
82   ::arrow::randint(size, 0, 24, &values);
83   for (size_t i = 0; i < size; i++) {
84     values[i] *= 86400000;
85   }
86 
87   // Passing data type so this will work with TimestampType too
88   ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
89                                              ::arrow::default_memory_pool());
90   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
91   return builder.Finish(out);
92 }
93 
94 template <class ArrowType>
NonNullArray(size_t size,std::shared_ptr<Array> * out)95 ::arrow::enable_if_base_binary<ArrowType, Status> NonNullArray(
96     size_t size, std::shared_ptr<Array>* out) {
97   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
98   BuilderType builder;
99   for (size_t i = 0; i < size; i++) {
100     RETURN_NOT_OK(builder.Append("test-string"));
101   }
102   return builder.Finish(out);
103 }
104 
105 template <typename ArrowType>
NonNullArray(size_t size,std::shared_ptr<Array> * out)106 ::arrow::enable_if_fixed_size_binary<ArrowType, Status> NonNullArray(
107     size_t size, std::shared_ptr<Array>* out) {
108   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
109   // set byte_width to the length of "fixed": 5
110   // todo: find a way to generate test data with more diversity.
111   BuilderType builder(::arrow::fixed_size_binary(5));
112   for (size_t i = 0; i < size; i++) {
113     RETURN_NOT_OK(builder.Append("fixed"));
114   }
115   return builder.Finish(out);
116 }
117 
random_decimals(int64_t n,uint32_t seed,int32_t precision,uint8_t * out)118 static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision,
119                                    uint8_t* out) {
120   std::default_random_engine gen(seed);
121   std::uniform_int_distribution<uint32_t> d(0, std::numeric_limits<uint8_t>::max());
122   const int32_t required_bytes = ::arrow::DecimalSize(precision);
123   constexpr int32_t byte_width = 16;
124   std::fill(out, out + byte_width * n, '\0');
125 
126   for (int64_t i = 0; i < n; ++i, out += byte_width) {
127     std::generate(out, out + required_bytes,
128                   [&d, &gen] { return static_cast<uint8_t>(d(gen)); });
129 
130     // sign extend if the sign bit is set for the last byte generated
131     // 0b10000000 == 0x80 == 128
132     if ((out[required_bytes - 1] & '\x80') != 0) {
133       std::fill(out + required_bytes, out + byte_width, '\xFF');
134     }
135   }
136 }
137 
138 template <typename ArrowType, int32_t precision = ArrowType::precision>
139 ::arrow::enable_if_t<
140     std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>
NonNullArray(size_t size,std::shared_ptr<Array> * out)141 NonNullArray(size_t size, std::shared_ptr<Array>* out) {
142   constexpr int32_t kDecimalPrecision = precision;
143   constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale;
144 
145   const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale);
146   ::arrow::Decimal128Builder builder(type);
147   const int32_t byte_width =
148       static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();
149 
150   constexpr int32_t seed = 0;
151 
152   ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
153   random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data());
154 
155   RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
156   return builder.Finish(out);
157 }
158 
159 template <class ArrowType>
NonNullArray(size_t size,std::shared_ptr<Array> * out)160 ::arrow::enable_if_boolean<ArrowType, Status> NonNullArray(size_t size,
161                                                            std::shared_ptr<Array>* out) {
162   std::vector<uint8_t> values;
163   ::arrow::randint(size, 0, 1, &values);
164   ::arrow::BooleanBuilder builder;
165   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
166   return builder.Finish(out);
167 }
168 
169 // This helper function only supports (size/2) nulls.
170 template <typename ArrowType>
NullableArray(size_t size,size_t num_nulls,uint32_t seed,std::shared_ptr<Array> * out)171 ::arrow::enable_if_floating_point<ArrowType, Status> NullableArray(
172     size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
173   using c_type = typename ArrowType::c_type;
174   std::vector<c_type> values;
175   ::arrow::random_real(size, seed, static_cast<c_type>(-1e10), static_cast<c_type>(1e10),
176                        &values);
177   std::vector<uint8_t> valid_bytes(size, 1);
178 
179   for (size_t i = 0; i < num_nulls; i++) {
180     valid_bytes[i * 2] = 0;
181   }
182 
183   ::arrow::NumericBuilder<ArrowType> builder;
184   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
185   return builder.Finish(out);
186 }
187 
188 // This helper function only supports (size/2) nulls.
189 template <typename ArrowType>
NullableArray(size_t size,size_t num_nulls,uint32_t seed,std::shared_ptr<Array> * out)190 ::arrow::enable_if_integer<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
191                                                             uint32_t seed,
192                                                             std::shared_ptr<Array>* out) {
193   std::vector<typename ArrowType::c_type> values;
194 
195   // Seed is random in Arrow right now
196   (void)seed;
197   ::arrow::randint(size, 0, 64, &values);
198   std::vector<uint8_t> valid_bytes(size, 1);
199 
200   for (size_t i = 0; i < num_nulls; i++) {
201     valid_bytes[i * 2] = 0;
202   }
203 
204   // Passing data type so this will work with TimestampType too
205   ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
206                                              ::arrow::default_memory_pool());
207   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
208   return builder.Finish(out);
209 }
210 
211 template <typename ArrowType>
NullableArray(size_t size,size_t num_nulls,uint32_t seed,std::shared_ptr<Array> * out)212 ::arrow::enable_if_date<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
213                                                          uint32_t seed,
214                                                          std::shared_ptr<Array>* out) {
215   std::vector<typename ArrowType::c_type> values;
216 
217   // Seed is random in Arrow right now
218   (void)seed;
219   ::arrow::randint(size, 0, 24, &values);
220   for (size_t i = 0; i < size; i++) {
221     values[i] *= 86400000;
222   }
223   std::vector<uint8_t> valid_bytes(size, 1);
224 
225   for (size_t i = 0; i < num_nulls; i++) {
226     valid_bytes[i * 2] = 0;
227   }
228 
229   // Passing data type so this will work with TimestampType too
230   ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
231                                              ::arrow::default_memory_pool());
232   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
233   return builder.Finish(out);
234 }
235 
236 // This helper function only supports (size/2) nulls yet.
237 template <typename ArrowType>
NullableArray(size_t size,size_t num_nulls,uint32_t seed,std::shared_ptr<::arrow::Array> * out)238 ::arrow::enable_if_base_binary<ArrowType, Status> NullableArray(
239     size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
240   std::vector<uint8_t> valid_bytes(size, 1);
241 
242   for (size_t i = 0; i < num_nulls; i++) {
243     valid_bytes[i * 2] = 0;
244   }
245 
246   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
247   BuilderType builder;
248 
249   const int kBufferSize = 10;
250   uint8_t buffer[kBufferSize];
251   for (size_t i = 0; i < size; i++) {
252     if (!valid_bytes[i]) {
253       RETURN_NOT_OK(builder.AppendNull());
254     } else {
255       ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
256       RETURN_NOT_OK(builder.Append(buffer, kBufferSize));
257     }
258   }
259   return builder.Finish(out);
260 }
261 
262 // This helper function only supports (size/2) nulls yet,
263 // same as NullableArray<String|Binary>(..)
264 template <typename ArrowType>
NullableArray(size_t size,size_t num_nulls,uint32_t seed,std::shared_ptr<::arrow::Array> * out)265 ::arrow::enable_if_fixed_size_binary<ArrowType, Status> NullableArray(
266     size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
267   std::vector<uint8_t> valid_bytes(size, 1);
268 
269   for (size_t i = 0; i < num_nulls; i++) {
270     valid_bytes[i * 2] = 0;
271   }
272 
273   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
274   const int byte_width = 10;
275   BuilderType builder(::arrow::fixed_size_binary(byte_width));
276 
277   const int kBufferSize = byte_width;
278   uint8_t buffer[kBufferSize];
279   for (size_t i = 0; i < size; i++) {
280     if (!valid_bytes[i]) {
281       RETURN_NOT_OK(builder.AppendNull());
282     } else {
283       ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
284       RETURN_NOT_OK(builder.Append(buffer));
285     }
286   }
287   return builder.Finish(out);
288 }
289 
290 template <typename ArrowType, int32_t precision = ArrowType::precision>
291 ::arrow::enable_if_t<
292     std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>
NullableArray(size_t size,size_t num_nulls,uint32_t seed,std::shared_ptr<::arrow::Array> * out)293 NullableArray(size_t size, size_t num_nulls, uint32_t seed,
294               std::shared_ptr<::arrow::Array>* out) {
295   std::vector<uint8_t> valid_bytes(size, '\1');
296 
297   for (size_t i = 0; i < num_nulls; ++i) {
298     valid_bytes[i * 2] = '\0';
299   }
300 
301   constexpr int32_t kDecimalPrecision = precision;
302   constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale;
303   const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale);
304   const int32_t byte_width =
305       static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();
306 
307   ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
308 
309   random_decimals(size, seed, precision, out_buf->mutable_data());
310 
311   ::arrow::Decimal128Builder builder(type);
312   RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
313   return builder.Finish(out);
314 }
315 
316 // This helper function only supports (size/2) nulls yet.
317 template <class ArrowType>
NullableArray(size_t size,size_t num_nulls,uint32_t seed,std::shared_ptr<Array> * out)318 ::arrow::enable_if_boolean<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
319                                                             uint32_t seed,
320                                                             std::shared_ptr<Array>* out) {
321   std::vector<uint8_t> values;
322 
323   // Seed is random in Arrow right now
324   (void)seed;
325 
326   ::arrow::randint(size, 0, 1, &values);
327   std::vector<uint8_t> valid_bytes(size, 1);
328 
329   for (size_t i = 0; i < num_nulls; i++) {
330     valid_bytes[i * 2] = 0;
331   }
332 
333   ::arrow::BooleanBuilder builder;
334   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
335   return builder.Finish(out);
336 }
337 
338 /// Wrap an Array into a ListArray by splitting it up into size lists.
339 ///
340 /// This helper function only supports (size/2) nulls.
MakeListArray(const std::shared_ptr<Array> & values,int64_t size,int64_t null_count,const std::string & item_name,bool nullable_values,std::shared_ptr<::arrow::ListArray> * out)341 Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
342                      int64_t null_count, const std::string& item_name,
343                      bool nullable_values, std::shared_ptr<::arrow::ListArray>* out) {
344   // We always include an empty list
345   int64_t non_null_entries = size - null_count - 1;
346   int64_t length_per_entry = values->length() / non_null_entries;
347 
348   auto offsets = AllocateBuffer();
349   RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t)));
350   int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());
351 
352   auto null_bitmap = AllocateBuffer();
353   int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size);
354   RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
355   uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
356   memset(null_bitmap_ptr, 0, bitmap_size);
357 
358   int32_t current_offset = 0;
359   for (int64_t i = 0; i < size; i++) {
360     offsets_ptr[i] = current_offset;
361     if (!(((i % 2) == 0) && ((i / 2) < null_count))) {
362       // Non-null list (list with index 1 is always empty).
363       ::arrow::BitUtil::SetBit(null_bitmap_ptr, i);
364       if (i != 1) {
365         current_offset += static_cast<int32_t>(length_per_entry);
366       }
367     }
368   }
369   offsets_ptr[size] = static_cast<int32_t>(values->length());
370 
371   auto value_field = ::arrow::field(item_name, values->type(), nullable_values);
372   *out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets,
373                                               values, null_bitmap, null_count);
374 
375   return Status::OK();
376 }
377 
378 // Make an array containing only empty lists, with a null values array
MakeEmptyListsArray(int64_t size,std::shared_ptr<Array> * out_array)379 Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
380   // Allocate an offsets buffer containing only zeroes
381   const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
382   ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, ::arrow::AllocateBuffer(offsets_nbytes));
383   memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
384 
385   auto value_field =
386       ::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
387   auto list_type = ::arrow::list(value_field);
388 
389   std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
390                                                         nullptr /* values */};
391   auto child_data =
392       ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers));
393 
394   std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
395                                                   std::move(offsets_buffer)};
396   auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
397   array_data->child_data.push_back(child_data);
398 
399   *out_array = ::arrow::MakeArray(array_data);
400   return Status::OK();
401 }
402 
MakeSimpleTable(const std::shared_ptr<ChunkedArray> & values,bool nullable)403 std::shared_ptr<::arrow::Table> MakeSimpleTable(
404     const std::shared_ptr<ChunkedArray>& values, bool nullable) {
405   auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)});
406   return ::arrow::Table::Make(schema, {values});
407 }
408 
MakeSimpleTable(const std::shared_ptr<Array> & values,bool nullable)409 std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr<Array>& values,
410                                                 bool nullable) {
411   auto carr = std::make_shared<::arrow::ChunkedArray>(values);
412   return MakeSimpleTable(carr, nullable);
413 }
414 
415 template <typename T>
ExpectArray(T * expected,Array * result)416 void ExpectArray(T* expected, Array* result) {
417   auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
418   for (int i = 0; i < result->length(); i++) {
419     EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
420   }
421 }
422 
/// Expect that every element of `result` equals the matching element of the
/// untyped buffer `expected`.
///
/// Both `expected` and the values buffer of `result` are reinterpreted as
/// arrays of ArrowType::c_type, and one EXPECT_EQ is issued per element.
/// NOTE(review): the values buffer is indexed from 0 and nulls are not
/// consulted; presumably callers pass unsliced arrays -- confirm.
///
/// \param[in] expected buffer of at least result->length() values
/// \param[in] result array to check; cast to ::arrow::PrimitiveArray
template <typename ArrowType>
void ExpectArrayT(void* expected, Array* result) {
  ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
  for (int64_t i = 0; i < result->length(); i++) {
    // Element-wise comparison of the raw values.
    EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
              reinterpret_cast<const typename ArrowType::c_type*>(
                  p_array->values()->data())[i]);
  }
}
432 
433 template <>
434 void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
435   ::arrow::BooleanBuilder builder;
436   ARROW_EXPECT_OK(
437       builder.AppendValues(reinterpret_cast<uint8_t*>(expected), result->length()));
438 
439   std::shared_ptr<Array> expected_array;
440   ARROW_EXPECT_OK(builder.Finish(&expected_array));
441   EXPECT_TRUE(result->Equals(*expected_array));
442 }
443 
444 }  // namespace arrow
445 
446 }  // namespace parquet
447