1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "parquet/level_conversion.h"
19 
20 #include "parquet/level_comparison.h"
21 #include "parquet/test_util.h"
22 
23 #include <gmock/gmock.h>
24 #include <gtest/gtest.h>
25 
26 #include <string>
27 #include <vector>
28 
29 #include "arrow/testing/gtest_compat.h"
30 #include "arrow/util/bit_util.h"
31 #include "arrow/util/bitmap.h"
32 #include "arrow/util/ubsan.h"
33 
34 namespace parquet {
35 namespace internal {
36 
37 using ::arrow::internal::Bitmap;
38 using ::testing::ElementsAreArray;
39 
BitmapToString(const uint8_t * bitmap,int64_t bit_count)40 std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) {
41   return ::arrow::internal::Bitmap(bitmap, /*offset*/ 0, /*length=*/bit_count).ToString();
42 }
43 
BitmapToString(const std::vector<uint8_t> & bitmap,int64_t bit_count)44 std::string BitmapToString(const std::vector<uint8_t>& bitmap, int64_t bit_count) {
45   return BitmapToString(bitmap.data(), bit_count);
46 }
47 
TEST(TestColumnReader,DefLevelsToBitmap)48 TEST(TestColumnReader, DefLevelsToBitmap) {
49   // Bugs in this function were exposed in ARROW-3930
50   std::vector<int16_t> def_levels = {3, 3, 3, 2, 3, 3, 3, 3, 3};
51 
52   std::vector<uint8_t> valid_bits(2, 0);
53 
54   LevelInfo level_info;
55   level_info.def_level = 3;
56   level_info.rep_level = 1;
57 
58   ValidityBitmapInputOutput io;
59   io.values_read_upper_bound = def_levels.size();
60   io.values_read = -1;
61   io.valid_bits = valid_bits.data();
62 
63   internal::DefLevelsToBitmap(def_levels.data(), 9, level_info, &io);
64   ASSERT_EQ(9, io.values_read);
65   ASSERT_EQ(1, io.null_count);
66 
67   // Call again with 0 definition levels, make sure that valid_bits is unmodified
68   const uint8_t current_byte = valid_bits[1];
69   io.null_count = 0;
70   internal::DefLevelsToBitmap(def_levels.data(), 0, level_info, &io);
71 
72   ASSERT_EQ(0, io.values_read);
73   ASSERT_EQ(0, io.null_count);
74   ASSERT_EQ(current_byte, valid_bits[1]);
75 }
76 
TEST(TestColumnReader,DefLevelsToBitmapPowerOfTwo)77 TEST(TestColumnReader, DefLevelsToBitmapPowerOfTwo) {
78   // PARQUET-1623: Invalid memory access when decoding a valid bits vector that has a
79   // length equal to a power of two and also using a non-zero valid_bits_offset.  This
80   // should not fail when run with ASAN or valgrind.
81   std::vector<int16_t> def_levels = {3, 3, 3, 2, 3, 3, 3, 3};
82   std::vector<uint8_t> valid_bits(1, 0);
83 
84   LevelInfo level_info;
85   level_info.rep_level = 1;
86   level_info.def_level = 3;
87 
88   ValidityBitmapInputOutput io;
89   io.values_read_upper_bound = def_levels.size();
90   io.values_read = -1;
91   io.valid_bits = valid_bits.data();
92 
93   // Read the latter half of the validity bitmap
94   internal::DefLevelsToBitmap(def_levels.data() + 4, 4, level_info, &io);
95   ASSERT_EQ(4, io.values_read);
96   ASSERT_EQ(0, io.null_count);
97 }
98 
99 #if defined(ARROW_LITTLE_ENDIAN)
TEST(GreaterThanBitmap,GeneratesExpectedBitmasks)100 TEST(GreaterThanBitmap, GeneratesExpectedBitmasks) {
101   std::vector<int16_t> levels = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
102                                  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
103                                  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
104                                  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
105   EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/0, /*rhs*/ 0), 0);
106   EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ 8), 0);
107   EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ -1),
108             0xFFFFFFFFFFFFFFFF);
109   // Should be zero padded.
110   EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/47, /*rhs*/ -1),
111             0x7FFFFFFFFFFF);
112   EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ 6),
113             0x8080808080808080);
114 }
115 #endif
116 
TEST(DefLevelsToBitmap,WithRepetitionLevelFiltersOutEmptyListValues)117 TEST(DefLevelsToBitmap, WithRepetitionLevelFiltersOutEmptyListValues) {
118   std::vector<uint8_t> validity_bitmap(/*count*/ 8, 0);
119 
120   ValidityBitmapInputOutput io;
121   io.values_read_upper_bound = 64;
122   io.values_read = 1;
123   io.null_count = 5;
124   io.valid_bits = validity_bitmap.data();
125   io.valid_bits_offset = 1;
126 
127   LevelInfo level_info;
128   level_info.repeated_ancestor_def_level = 1;
129   level_info.def_level = 2;
130   level_info.rep_level = 1;
131   // All zeros should be ignored, ones should be unset in the bitmp and 2 should be set.
132   std::vector<int16_t> def_levels = {0, 0, 0, 2, 2, 1, 0, 2};
133   DefLevelsToBitmap(def_levels.data(), def_levels.size(), level_info, &io);
134 
135   EXPECT_EQ(BitmapToString(validity_bitmap, /*bit_count=*/8), "01101000");
136   for (size_t x = 1; x < validity_bitmap.size(); x++) {
137     EXPECT_EQ(validity_bitmap[x], 0) << "index: " << x;
138   }
139   EXPECT_EQ(io.null_count, /*5 + 1 =*/6);
140   EXPECT_EQ(io.values_read, 4);  // value should get overwritten.
141 }
142 
143 struct MultiLevelTestData {
144  public:
145   std::vector<int16_t> def_levels;
146   std::vector<int16_t> rep_levels;
147 };
148 
TriplyNestedList()149 MultiLevelTestData TriplyNestedList() {
150   // Triply nested list values borrow from write_path
151   // [null, [[1 , null, 3], []], []],
152   // [[[]], [[], [1, 2]], null, [[3]]],
153   // null,
154   // []
155   return MultiLevelTestData{
156       /*def_levels=*/std::vector<int16_t>{2, 7, 6, 7, 5, 3,  // first row
157                                           5, 5, 7, 7, 2, 7,  // second row
158                                           0,                 // third row
159                                           1},
160       /*rep_levels=*/std::vector<int16_t>{0, 1, 3, 3, 2, 1,  // first row
161                                           0, 1, 2, 3, 1, 1,  // second row
162                                           0, 0}};
163 }
164 
165 template <typename ConverterType>
166 class NestedListTest : public testing::Test {
167  public:
InitForLength(int length)168   void InitForLength(int length) {
169     this->validity_bits_.clear();
170     this->validity_bits_.insert(this->validity_bits_.end(), length, 0);
171     validity_io_.valid_bits = validity_bits_.data();
172     validity_io_.values_read_upper_bound = length;
173     offsets_.clear();
174     offsets_.insert(offsets_.end(), length + 1, 0);
175   }
176 
Run(const MultiLevelTestData & test_data,LevelInfo level_info)177   typename ConverterType::OffsetsType* Run(const MultiLevelTestData& test_data,
178                                            LevelInfo level_info) {
179     return this->converter_.ComputeListInfo(test_data, level_info, &validity_io_,
180                                             offsets_.data());
181   }
182 
183   ConverterType converter_;
184   ValidityBitmapInputOutput validity_io_;
185   std::vector<uint8_t> validity_bits_;
186   std::vector<typename ConverterType::OffsetsType> offsets_;
187 };
188 
189 template <typename IndexType>
190 struct RepDefLevelConverter {
191   using OffsetsType = IndexType;
ComputeListInfoparquet::internal::RepDefLevelConverter192   OffsetsType* ComputeListInfo(const MultiLevelTestData& test_data, LevelInfo level_info,
193                                ValidityBitmapInputOutput* output, IndexType* offsets) {
194     DefRepLevelsToList(test_data.def_levels.data(), test_data.rep_levels.data(),
195                        test_data.def_levels.size(), level_info, output, offsets);
196     return offsets + output->values_read;
197   }
198 };
199 
200 using ConverterTypes =
201     ::testing::Types<RepDefLevelConverter</*list_length_type=*/int32_t>,
202                      RepDefLevelConverter</*list_length_type=*/int64_t>>;
203 TYPED_TEST_SUITE(NestedListTest, ConverterTypes);
204 
TYPED_TEST(NestedListTest,OuterMostTest)205 TYPED_TEST(NestedListTest, OuterMostTest) {
206   // [null, [[1 , null, 3], []], []],
207   // [[[]], [[], [1, 2]], null, [[3]]],
208   // null,
209   // []
210   // -> 4 outer most lists (len(3), len(4), null, len(0))
211   LevelInfo level_info;
212   level_info.rep_level = 1;
213   level_info.def_level = 2;
214 
215   this->InitForLength(4);
216   typename TypeParam::OffsetsType* next_position =
217       this->Run(TriplyNestedList(), level_info);
218 
219   EXPECT_EQ(next_position, this->offsets_.data() + 4);
220   EXPECT_THAT(this->offsets_, testing::ElementsAre(0, 3, 7, 7, 7));
221 
222   EXPECT_EQ(this->validity_io_.values_read, 4);
223   EXPECT_EQ(this->validity_io_.null_count, 1);
224   EXPECT_EQ(BitmapToString(this->validity_io_.valid_bits, /*length=*/4), "1101");
225 }
226 
TYPED_TEST(NestedListTest,MiddleListTest)227 TYPED_TEST(NestedListTest, MiddleListTest) {
228   // [null, [[1 , null, 3], []], []],
229   // [[[]], [[], [1, 2]], null, [[3]]],
230   // null,
231   // []
232   // -> middle lists (null, len(2), len(0),
233   //                  len(1), len(2), null, len(1),
234   //                  N/A,
235   //                  N/A
236   LevelInfo level_info;
237   level_info.rep_level = 2;
238   level_info.def_level = 4;
239   level_info.repeated_ancestor_def_level = 2;
240 
241   this->InitForLength(7);
242   typename TypeParam::OffsetsType* next_position =
243       this->Run(TriplyNestedList(), level_info);
244 
245   EXPECT_EQ(next_position, this->offsets_.data() + 7);
246   EXPECT_THAT(this->offsets_, testing::ElementsAre(0, 0, 2, 2, 3, 5, 5, 6));
247 
248   EXPECT_EQ(this->validity_io_.values_read, 7);
249   EXPECT_EQ(this->validity_io_.null_count, 2);
250   EXPECT_EQ(BitmapToString(this->validity_io_.valid_bits, /*length=*/7), "0111101");
251 }
252 
TYPED_TEST(NestedListTest,InnerMostListTest)253 TYPED_TEST(NestedListTest, InnerMostListTest) {
254   // [null, [[1, null, 3], []], []],
255   // [[[]], [[], [1, 2]], null, [[3]]],
256   // null,
257   // []
258   // -> 6 inner lists (N/A, [len(3), len(0)], N/A
259   //                        len(0), [len(0), len(2)], N/A, len(1),
260   //                        N/A,
261   //                        N/A
262   LevelInfo level_info;
263   level_info.rep_level = 3;
264   level_info.def_level = 6;
265   level_info.repeated_ancestor_def_level = 4;
266 
267   this->InitForLength(6);
268   typename TypeParam::OffsetsType* next_position =
269       this->Run(TriplyNestedList(), level_info);
270 
271   EXPECT_EQ(next_position, this->offsets_.data() + 6);
272   EXPECT_THAT(this->offsets_, testing::ElementsAre(0, 3, 3, 3, 3, 5, 6));
273 
274   EXPECT_EQ(this->validity_io_.values_read, 6);
275   EXPECT_EQ(this->validity_io_.null_count, 0);
276   EXPECT_EQ(BitmapToString(this->validity_io_.valid_bits, /*length=*/6), "111111");
277 }
278 
TYPED_TEST(NestedListTest,SimpleLongList)279 TYPED_TEST(NestedListTest, SimpleLongList) {
280   LevelInfo level_info;
281   level_info.rep_level = 1;
282   level_info.def_level = 2;
283   level_info.repeated_ancestor_def_level = 0;
284 
285   MultiLevelTestData test_data;
286   // No empty lists.
287   test_data.def_levels = std::vector<int16_t>(65 * 9, 2);
288   for (int x = 0; x < 65; x++) {
289     test_data.rep_levels.push_back(0);
290     test_data.rep_levels.insert(test_data.rep_levels.end(), 8,
291                                 /*rep_level=*/1);
292   }
293 
294   std::vector<typename TypeParam::OffsetsType> expected_offsets(66, 0);
295   for (size_t x = 1; x < expected_offsets.size(); x++) {
296     expected_offsets[x] = static_cast<typename TypeParam::OffsetsType>(x) * 9;
297   }
298   this->InitForLength(65);
299   typename TypeParam::OffsetsType* next_position = this->Run(test_data, level_info);
300 
301   EXPECT_EQ(next_position, this->offsets_.data() + 65);
302   EXPECT_THAT(this->offsets_, testing::ElementsAreArray(expected_offsets));
303 
304   EXPECT_EQ(this->validity_io_.values_read, 65);
305   EXPECT_EQ(this->validity_io_.null_count, 0);
306   EXPECT_EQ(BitmapToString(this->validity_io_.valid_bits, /*length=*/65),
307             "11111111 "
308             "11111111 "
309             "11111111 "
310             "11111111 "
311             "11111111 "
312             "11111111 "
313             "11111111 "
314             "11111111 "
315             "1");
316 }
317 
TYPED_TEST(NestedListTest,TestOverflow)318 TYPED_TEST(NestedListTest, TestOverflow) {
319   LevelInfo level_info;
320   level_info.rep_level = 1;
321   level_info.def_level = 2;
322   level_info.repeated_ancestor_def_level = 0;
323 
324   MultiLevelTestData test_data;
325   test_data.def_levels = std::vector<int16_t>{2};
326   test_data.rep_levels = std::vector<int16_t>{0};
327 
328   this->InitForLength(2);
329   // Offsets is populated as the cumulative sum of all elements,
330   // so populating the offsets[0] with max-value impacts the
331   // other values populated.
332   this->offsets_[0] = std::numeric_limits<typename TypeParam::OffsetsType>::max();
333   this->offsets_[1] = std::numeric_limits<typename TypeParam::OffsetsType>::max();
334   ASSERT_THROW(this->Run(test_data, level_info), ParquetException);
335 
336   ASSERT_THROW(this->Run(test_data, level_info), ParquetException);
337 
338   // Same thing should happen if the list already existed.
339   test_data.rep_levels = std::vector<int16_t>{1};
340   ASSERT_THROW(this->Run(test_data, level_info), ParquetException);
341 
342   // Should be OK because it shouldn't increment.
343   test_data.def_levels = std::vector<int16_t>{0};
344   test_data.rep_levels = std::vector<int16_t>{0};
345   this->Run(test_data, level_info);
346 }
347 
TEST(TestOnlyExtractBitsSoftware,BasicTest)348 TEST(TestOnlyExtractBitsSoftware, BasicTest) {
349   auto check = [](uint64_t bitmap, uint64_t selection, uint64_t expected) -> void {
350     EXPECT_EQ(TestOnlyExtractBitsSoftware(bitmap, selection), expected);
351   };
352   check(0xFF, 0, 0);
353   check(0xFF, ~uint64_t{0}, 0xFF);
354   check(0xFF00FF, 0xAAAA, 0x000F);
355   check(0xFF0AFF, 0xAFAA, 0x00AF);
356   check(0xFFAAFF, 0xAFAA, 0x03AF);
357   check(0xFECBDA9876543210ULL, 0xF00FF00FF00FF00FULL, 0xFBD87430ULL);
358 }
359 
360 }  // namespace internal
361 }  // namespace parquet
362