1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "parquet/level_conversion.h"
19
20 #include "parquet/level_comparison.h"
21 #include "parquet/test_util.h"
22
23 #include <gmock/gmock.h>
24 #include <gtest/gtest.h>
25
26 #include <string>
27 #include <vector>
28
29 #include "arrow/testing/gtest_compat.h"
30 #include "arrow/util/bit_util.h"
31 #include "arrow/util/bitmap.h"
32 #include "arrow/util/ubsan.h"
33
34 namespace parquet {
35 namespace internal {
36
37 using ::arrow::internal::Bitmap;
38 using ::testing::ElementsAreArray;
39
BitmapToString(const uint8_t * bitmap,int64_t bit_count)40 std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) {
41 return ::arrow::internal::Bitmap(bitmap, /*offset*/ 0, /*length=*/bit_count).ToString();
42 }
43
BitmapToString(const std::vector<uint8_t> & bitmap,int64_t bit_count)44 std::string BitmapToString(const std::vector<uint8_t>& bitmap, int64_t bit_count) {
45 return BitmapToString(bitmap.data(), bit_count);
46 }
47
TEST(TestColumnReader,DefLevelsToBitmap)48 TEST(TestColumnReader, DefLevelsToBitmap) {
49 // Bugs in this function were exposed in ARROW-3930
50 std::vector<int16_t> def_levels = {3, 3, 3, 2, 3, 3, 3, 3, 3};
51
52 std::vector<uint8_t> valid_bits(2, 0);
53
54 LevelInfo level_info;
55 level_info.def_level = 3;
56 level_info.rep_level = 1;
57
58 ValidityBitmapInputOutput io;
59 io.values_read_upper_bound = def_levels.size();
60 io.values_read = -1;
61 io.valid_bits = valid_bits.data();
62
63 internal::DefLevelsToBitmap(def_levels.data(), 9, level_info, &io);
64 ASSERT_EQ(9, io.values_read);
65 ASSERT_EQ(1, io.null_count);
66
67 // Call again with 0 definition levels, make sure that valid_bits is unmodified
68 const uint8_t current_byte = valid_bits[1];
69 io.null_count = 0;
70 internal::DefLevelsToBitmap(def_levels.data(), 0, level_info, &io);
71
72 ASSERT_EQ(0, io.values_read);
73 ASSERT_EQ(0, io.null_count);
74 ASSERT_EQ(current_byte, valid_bits[1]);
75 }
76
TEST(TestColumnReader,DefLevelsToBitmapPowerOfTwo)77 TEST(TestColumnReader, DefLevelsToBitmapPowerOfTwo) {
78 // PARQUET-1623: Invalid memory access when decoding a valid bits vector that has a
79 // length equal to a power of two and also using a non-zero valid_bits_offset. This
80 // should not fail when run with ASAN or valgrind.
81 std::vector<int16_t> def_levels = {3, 3, 3, 2, 3, 3, 3, 3};
82 std::vector<uint8_t> valid_bits(1, 0);
83
84 LevelInfo level_info;
85 level_info.rep_level = 1;
86 level_info.def_level = 3;
87
88 ValidityBitmapInputOutput io;
89 io.values_read_upper_bound = def_levels.size();
90 io.values_read = -1;
91 io.valid_bits = valid_bits.data();
92
93 // Read the latter half of the validity bitmap
94 internal::DefLevelsToBitmap(def_levels.data() + 4, 4, level_info, &io);
95 ASSERT_EQ(4, io.values_read);
96 ASSERT_EQ(0, io.null_count);
97 }
98
99 #if defined(ARROW_LITTLE_ENDIAN)
TEST(GreaterThanBitmap,GeneratesExpectedBitmasks)100 TEST(GreaterThanBitmap, GeneratesExpectedBitmasks) {
101 std::vector<int16_t> levels = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
102 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
103 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
104 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
105 EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/0, /*rhs*/ 0), 0);
106 EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ 8), 0);
107 EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ -1),
108 0xFFFFFFFFFFFFFFFF);
109 // Should be zero padded.
110 EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/47, /*rhs*/ -1),
111 0x7FFFFFFFFFFF);
112 EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ 6),
113 0x8080808080808080);
114 }
115 #endif
116
TEST(DefLevelsToBitmap,WithRepetitionLevelFiltersOutEmptyListValues)117 TEST(DefLevelsToBitmap, WithRepetitionLevelFiltersOutEmptyListValues) {
118 std::vector<uint8_t> validity_bitmap(/*count*/ 8, 0);
119
120 ValidityBitmapInputOutput io;
121 io.values_read_upper_bound = 64;
122 io.values_read = 1;
123 io.null_count = 5;
124 io.valid_bits = validity_bitmap.data();
125 io.valid_bits_offset = 1;
126
127 LevelInfo level_info;
128 level_info.repeated_ancestor_def_level = 1;
129 level_info.def_level = 2;
130 level_info.rep_level = 1;
131 // All zeros should be ignored, ones should be unset in the bitmp and 2 should be set.
132 std::vector<int16_t> def_levels = {0, 0, 0, 2, 2, 1, 0, 2};
133 DefLevelsToBitmap(def_levels.data(), def_levels.size(), level_info, &io);
134
135 EXPECT_EQ(BitmapToString(validity_bitmap, /*bit_count=*/8), "01101000");
136 for (size_t x = 1; x < validity_bitmap.size(); x++) {
137 EXPECT_EQ(validity_bitmap[x], 0) << "index: " << x;
138 }
139 EXPECT_EQ(io.null_count, /*5 + 1 =*/6);
140 EXPECT_EQ(io.values_read, 4); // value should get overwritten.
141 }
142
143 struct MultiLevelTestData {
144 public:
145 std::vector<int16_t> def_levels;
146 std::vector<int16_t> rep_levels;
147 };
148
TriplyNestedList()149 MultiLevelTestData TriplyNestedList() {
150 // Triply nested list values borrow from write_path
151 // [null, [[1 , null, 3], []], []],
152 // [[[]], [[], [1, 2]], null, [[3]]],
153 // null,
154 // []
155 return MultiLevelTestData{
156 /*def_levels=*/std::vector<int16_t>{2, 7, 6, 7, 5, 3, // first row
157 5, 5, 7, 7, 2, 7, // second row
158 0, // third row
159 1},
160 /*rep_levels=*/std::vector<int16_t>{0, 1, 3, 3, 2, 1, // first row
161 0, 1, 2, 3, 1, 1, // second row
162 0, 0}};
163 }
164
165 template <typename ConverterType>
166 class NestedListTest : public testing::Test {
167 public:
InitForLength(int length)168 void InitForLength(int length) {
169 this->validity_bits_.clear();
170 this->validity_bits_.insert(this->validity_bits_.end(), length, 0);
171 validity_io_.valid_bits = validity_bits_.data();
172 validity_io_.values_read_upper_bound = length;
173 offsets_.clear();
174 offsets_.insert(offsets_.end(), length + 1, 0);
175 }
176
Run(const MultiLevelTestData & test_data,LevelInfo level_info)177 typename ConverterType::OffsetsType* Run(const MultiLevelTestData& test_data,
178 LevelInfo level_info) {
179 return this->converter_.ComputeListInfo(test_data, level_info, &validity_io_,
180 offsets_.data());
181 }
182
183 ConverterType converter_;
184 ValidityBitmapInputOutput validity_io_;
185 std::vector<uint8_t> validity_bits_;
186 std::vector<typename ConverterType::OffsetsType> offsets_;
187 };
188
189 template <typename IndexType>
190 struct RepDefLevelConverter {
191 using OffsetsType = IndexType;
ComputeListInfoparquet::internal::RepDefLevelConverter192 OffsetsType* ComputeListInfo(const MultiLevelTestData& test_data, LevelInfo level_info,
193 ValidityBitmapInputOutput* output, IndexType* offsets) {
194 DefRepLevelsToList(test_data.def_levels.data(), test_data.rep_levels.data(),
195 test_data.def_levels.size(), level_info, output, offsets);
196 return offsets + output->values_read;
197 }
198 };
199
200 using ConverterTypes =
201 ::testing::Types<RepDefLevelConverter</*list_length_type=*/int32_t>,
202 RepDefLevelConverter</*list_length_type=*/int64_t>>;
203 TYPED_TEST_SUITE(NestedListTest, ConverterTypes);
204
TYPED_TEST(NestedListTest,OuterMostTest)205 TYPED_TEST(NestedListTest, OuterMostTest) {
206 // [null, [[1 , null, 3], []], []],
207 // [[[]], [[], [1, 2]], null, [[3]]],
208 // null,
209 // []
210 // -> 4 outer most lists (len(3), len(4), null, len(0))
211 LevelInfo level_info;
212 level_info.rep_level = 1;
213 level_info.def_level = 2;
214
215 this->InitForLength(4);
216 typename TypeParam::OffsetsType* next_position =
217 this->Run(TriplyNestedList(), level_info);
218
219 EXPECT_EQ(next_position, this->offsets_.data() + 4);
220 EXPECT_THAT(this->offsets_, testing::ElementsAre(0, 3, 7, 7, 7));
221
222 EXPECT_EQ(this->validity_io_.values_read, 4);
223 EXPECT_EQ(this->validity_io_.null_count, 1);
224 EXPECT_EQ(BitmapToString(this->validity_io_.valid_bits, /*length=*/4), "1101");
225 }
226
TYPED_TEST(NestedListTest,MiddleListTest)227 TYPED_TEST(NestedListTest, MiddleListTest) {
228 // [null, [[1 , null, 3], []], []],
229 // [[[]], [[], [1, 2]], null, [[3]]],
230 // null,
231 // []
232 // -> middle lists (null, len(2), len(0),
233 // len(1), len(2), null, len(1),
234 // N/A,
235 // N/A
236 LevelInfo level_info;
237 level_info.rep_level = 2;
238 level_info.def_level = 4;
239 level_info.repeated_ancestor_def_level = 2;
240
241 this->InitForLength(7);
242 typename TypeParam::OffsetsType* next_position =
243 this->Run(TriplyNestedList(), level_info);
244
245 EXPECT_EQ(next_position, this->offsets_.data() + 7);
246 EXPECT_THAT(this->offsets_, testing::ElementsAre(0, 0, 2, 2, 3, 5, 5, 6));
247
248 EXPECT_EQ(this->validity_io_.values_read, 7);
249 EXPECT_EQ(this->validity_io_.null_count, 2);
250 EXPECT_EQ(BitmapToString(this->validity_io_.valid_bits, /*length=*/7), "0111101");
251 }
252
TYPED_TEST(NestedListTest,InnerMostListTest)253 TYPED_TEST(NestedListTest, InnerMostListTest) {
254 // [null, [[1, null, 3], []], []],
255 // [[[]], [[], [1, 2]], null, [[3]]],
256 // null,
257 // []
258 // -> 6 inner lists (N/A, [len(3), len(0)], N/A
259 // len(0), [len(0), len(2)], N/A, len(1),
260 // N/A,
261 // N/A
262 LevelInfo level_info;
263 level_info.rep_level = 3;
264 level_info.def_level = 6;
265 level_info.repeated_ancestor_def_level = 4;
266
267 this->InitForLength(6);
268 typename TypeParam::OffsetsType* next_position =
269 this->Run(TriplyNestedList(), level_info);
270
271 EXPECT_EQ(next_position, this->offsets_.data() + 6);
272 EXPECT_THAT(this->offsets_, testing::ElementsAre(0, 3, 3, 3, 3, 5, 6));
273
274 EXPECT_EQ(this->validity_io_.values_read, 6);
275 EXPECT_EQ(this->validity_io_.null_count, 0);
276 EXPECT_EQ(BitmapToString(this->validity_io_.valid_bits, /*length=*/6), "111111");
277 }
278
TYPED_TEST(NestedListTest,SimpleLongList)279 TYPED_TEST(NestedListTest, SimpleLongList) {
280 LevelInfo level_info;
281 level_info.rep_level = 1;
282 level_info.def_level = 2;
283 level_info.repeated_ancestor_def_level = 0;
284
285 MultiLevelTestData test_data;
286 // No empty lists.
287 test_data.def_levels = std::vector<int16_t>(65 * 9, 2);
288 for (int x = 0; x < 65; x++) {
289 test_data.rep_levels.push_back(0);
290 test_data.rep_levels.insert(test_data.rep_levels.end(), 8,
291 /*rep_level=*/1);
292 }
293
294 std::vector<typename TypeParam::OffsetsType> expected_offsets(66, 0);
295 for (size_t x = 1; x < expected_offsets.size(); x++) {
296 expected_offsets[x] = static_cast<typename TypeParam::OffsetsType>(x) * 9;
297 }
298 this->InitForLength(65);
299 typename TypeParam::OffsetsType* next_position = this->Run(test_data, level_info);
300
301 EXPECT_EQ(next_position, this->offsets_.data() + 65);
302 EXPECT_THAT(this->offsets_, testing::ElementsAreArray(expected_offsets));
303
304 EXPECT_EQ(this->validity_io_.values_read, 65);
305 EXPECT_EQ(this->validity_io_.null_count, 0);
306 EXPECT_EQ(BitmapToString(this->validity_io_.valid_bits, /*length=*/65),
307 "11111111 "
308 "11111111 "
309 "11111111 "
310 "11111111 "
311 "11111111 "
312 "11111111 "
313 "11111111 "
314 "11111111 "
315 "1");
316 }
317
TYPED_TEST(NestedListTest,TestOverflow)318 TYPED_TEST(NestedListTest, TestOverflow) {
319 LevelInfo level_info;
320 level_info.rep_level = 1;
321 level_info.def_level = 2;
322 level_info.repeated_ancestor_def_level = 0;
323
324 MultiLevelTestData test_data;
325 test_data.def_levels = std::vector<int16_t>{2};
326 test_data.rep_levels = std::vector<int16_t>{0};
327
328 this->InitForLength(2);
329 // Offsets is populated as the cumulative sum of all elements,
330 // so populating the offsets[0] with max-value impacts the
331 // other values populated.
332 this->offsets_[0] = std::numeric_limits<typename TypeParam::OffsetsType>::max();
333 this->offsets_[1] = std::numeric_limits<typename TypeParam::OffsetsType>::max();
334 ASSERT_THROW(this->Run(test_data, level_info), ParquetException);
335
336 ASSERT_THROW(this->Run(test_data, level_info), ParquetException);
337
338 // Same thing should happen if the list already existed.
339 test_data.rep_levels = std::vector<int16_t>{1};
340 ASSERT_THROW(this->Run(test_data, level_info), ParquetException);
341
342 // Should be OK because it shouldn't increment.
343 test_data.def_levels = std::vector<int16_t>{0};
344 test_data.rep_levels = std::vector<int16_t>{0};
345 this->Run(test_data, level_info);
346 }
347
TEST(TestOnlyExtractBitsSoftware,BasicTest)348 TEST(TestOnlyExtractBitsSoftware, BasicTest) {
349 auto check = [](uint64_t bitmap, uint64_t selection, uint64_t expected) -> void {
350 EXPECT_EQ(TestOnlyExtractBitsSoftware(bitmap, selection), expected);
351 };
352 check(0xFF, 0, 0);
353 check(0xFF, ~uint64_t{0}, 0xFF);
354 check(0xFF00FF, 0xAAAA, 0x000F);
355 check(0xFF0AFF, 0xAFAA, 0x00AF);
356 check(0xFFAAFF, 0xAFAA, 0x03AF);
357 check(0xFECBDA9876543210ULL, 0xF00FF00FF00FF00FULL, 0xFBD87430ULL);
358 }
359
360 } // namespace internal
361 } // namespace parquet
362