1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "parquet/arrow/path_internal.h"
19 
20 #include <algorithm>
21 #include <memory>
22 #include <utility>
23 #include <vector>
24 
25 #include <gmock/gmock.h>
26 #include <gtest/gtest.h>
27 
28 #include "arrow/array/concatenate.h"
29 #include "arrow/chunked_array.h"
30 #include "arrow/io/memory.h"
31 #include "arrow/result.h"
32 #include "arrow/testing/gtest_util.h"
33 #include "arrow/type_fwd.h"
34 #include "arrow/util/checked_cast.h"
35 #include "arrow/util/logging.h"
36 
37 #include "parquet/arrow/reader.h"
38 #include "parquet/arrow/schema.h"
39 #include "parquet/column_writer.h"
40 #include "parquet/file_writer.h"
41 #include "parquet/properties.h"
42 
43 using arrow::Array;
44 using arrow::ArrayFromJSON;
45 using arrow::AssertArraysEqual;
46 using arrow::ChunkedArray;
47 using arrow::DataType;
48 using arrow::field;
49 using arrow::int32;
50 using arrow::int64;
51 using arrow::list;
52 using arrow::MemoryPool;
53 using arrow::Result;
54 using arrow::Status;
55 using arrow::struct_;
56 using arrow::internal::checked_cast;
57 using arrow::internal::checked_pointer_cast;
58 using arrow::io::BufferOutputStream;
59 using arrow::io::BufferReader;
60 
61 using testing::ElementsAre;
62 using testing::ElementsAreArray;
63 using testing::Eq;
64 using testing::NotNull;
65 using testing::SizeIs;
66 
67 namespace parquet {
68 namespace arrow {
69 
70 using parquet::schema::GroupNode;
71 using parquet::schema::NodePtr;
72 using parquet::schema::PrimitiveNode;
73 
74 using ParquetType = parquet::Type::type;
75 template <ParquetType T>
76 using ParquetTraits = parquet::type_traits<T>;
77 
78 using LevelVector = std::vector<int16_t>;
79 // For readability
80 using DefLevels = LevelVector;
81 using RepLevels = LevelVector;
82 using Int32Vector = std::vector<int32_t>;
83 using Int64Vector = std::vector<int64_t>;
84 
85 // A Parquet file builder that allows writing values one leaf column at a time
86 class FileBuilder {
87  public:
Make(const NodePtr & group_node,int num_columns)88   static Result<std::shared_ptr<FileBuilder>> Make(const NodePtr& group_node,
89                                                    int num_columns) {
90     auto self = std::make_shared<FileBuilder>();
91     RETURN_NOT_OK(self->Open(group_node, num_columns));
92     return self;
93   }
94 
Finish()95   Result<std::shared_ptr<Buffer>> Finish() {
96     DCHECK_EQ(column_index_, num_columns_);
97     row_group_writer_->Close();
98     file_writer_->Close();
99     return stream_->Finish();
100   }
101 
102   // Write a leaf (primitive) column
103   template <ParquetType TYPE, typename C_TYPE = typename ParquetTraits<TYPE>::value_type>
WriteColumn(const LevelVector & def_levels,const LevelVector & rep_levels,const std::vector<C_TYPE> & values)104   Status WriteColumn(const LevelVector& def_levels, const LevelVector& rep_levels,
105                      const std::vector<C_TYPE>& values) {
106     auto column_writer = row_group_writer_->NextColumn();
107     auto column_descr = column_writer->descr();
108     const int16_t max_def_level = column_descr->max_definition_level();
109     const int16_t max_rep_level = column_descr->max_repetition_level();
110     CheckTestedLevels(def_levels, max_def_level);
111     CheckTestedLevels(rep_levels, max_rep_level);
112 
113     auto typed_writer =
114         checked_cast<TypedColumnWriter<PhysicalType<TYPE>>*>(column_writer);
115 
116     const int64_t num_values = static_cast<int64_t>(
117         (max_def_level > 0) ? def_levels.size()
118                             : (max_rep_level > 0) ? rep_levels.size() : values.size());
119     const int64_t values_written = typed_writer->WriteBatch(
120         num_values, LevelPointerOrNull(def_levels, max_def_level),
121         LevelPointerOrNull(rep_levels, max_rep_level), values.data());
122     DCHECK_EQ(values_written, static_cast<int64_t>(values.size()));  // Sanity check
123 
124     column_writer->Close();
125     ++column_index_;
126     return Status::OK();
127   }
128 
129  protected:
Open(const NodePtr & group_node,int num_columns)130   Status Open(const NodePtr& group_node, int num_columns) {
131     ARROW_ASSIGN_OR_RAISE(stream_, BufferOutputStream::Create());
132     file_writer_ =
133         ParquetFileWriter::Open(stream_, checked_pointer_cast<GroupNode>(group_node));
134     row_group_writer_ = file_writer_->AppendRowGroup();
135     num_columns_ = num_columns;
136     column_index_ = 0;
137     return Status::OK();
138   }
139 
CheckTestedLevels(const LevelVector & levels,int16_t max_level)140   void CheckTestedLevels(const LevelVector& levels, int16_t max_level) {
141     // Tests are expected to exercise all possible levels in [0, max_level]
142     if (!levels.empty()) {
143       const int16_t max_seen_level = *std::max_element(levels.begin(), levels.end());
144       DCHECK_EQ(max_seen_level, max_level);
145     }
146   }
147 
LevelPointerOrNull(const LevelVector & levels,int16_t max_level)148   const int16_t* LevelPointerOrNull(const LevelVector& levels, int16_t max_level) {
149     if (max_level > 0) {
150       DCHECK_GT(levels.size(), 0);
151       return levels.data();
152     } else {
153       DCHECK_EQ(levels.size(), 0);
154       return nullptr;
155     }
156   }
157 
158   std::shared_ptr<BufferOutputStream> stream_;
159   std::unique_ptr<ParquetFileWriter> file_writer_;
160   RowGroupWriter* row_group_writer_;
161   int num_columns_;
162   int column_index_;
163 };
164 
165 // A Parquet file tester that allows reading Arrow columns, corresponding to
166 // children of the top-level group node.
167 class FileTester {
168  public:
Make(std::shared_ptr<Buffer> buffer,MemoryPool * pool)169   static Result<std::shared_ptr<FileTester>> Make(std::shared_ptr<Buffer> buffer,
170                                                   MemoryPool* pool) {
171     auto self = std::make_shared<FileTester>();
172     RETURN_NOT_OK(self->Open(buffer, pool));
173     return self;
174   }
175 
ReadColumn(int column_index)176   Result<std::shared_ptr<Array>> ReadColumn(int column_index) {
177     std::shared_ptr<ChunkedArray> column;
178     RETURN_NOT_OK(file_reader_->ReadColumn(column_index, &column));
179     return ::arrow::Concatenate(column->chunks(), pool_);
180   }
181 
CheckColumn(int column_index,const Array & expected)182   void CheckColumn(int column_index, const Array& expected) {
183     ASSERT_OK_AND_ASSIGN(const auto actual, ReadColumn(column_index));
184     ASSERT_OK(actual->ValidateFull());
185     AssertArraysEqual(expected, *actual, /*verbose=*/true);
186   }
187 
188  protected:
Open(std::shared_ptr<Buffer> buffer,MemoryPool * pool)189   Status Open(std::shared_ptr<Buffer> buffer, MemoryPool* pool) {
190     pool_ = pool;
191     return OpenFile(std::make_shared<BufferReader>(buffer), pool_, &file_reader_);
192   }
193 
194   MemoryPool* pool_;
195   std::unique_ptr<FileReader> file_reader_;
196 };
197 
198 class TestReconstructColumn : public testing::Test {
199  public:
SetUp()200   void SetUp() override { pool_ = ::arrow::default_memory_pool(); }
201 
202   // Write the next leaf (primitive) column
203   template <ParquetType TYPE, typename C_TYPE = typename ParquetTraits<TYPE>::value_type>
WriteColumn(const LevelVector & def_levels,const LevelVector & rep_levels,const std::vector<C_TYPE> & values)204   Status WriteColumn(const LevelVector& def_levels, const LevelVector& rep_levels,
205                      const std::vector<C_TYPE>& values) {
206     if (!builder_) {
207       ARROW_ASSIGN_OR_RAISE(builder_,
208                             FileBuilder::Make(group_node_, descriptor_->num_columns()));
209     }
210     return builder_->WriteColumn<TYPE, C_TYPE>(def_levels, rep_levels, values);
211   }
212 
213   template <typename C_TYPE>
WriteInt32Column(const LevelVector & def_levels,const LevelVector & rep_levels,const std::vector<C_TYPE> & values)214   Status WriteInt32Column(const LevelVector& def_levels, const LevelVector& rep_levels,
215                           const std::vector<C_TYPE>& values) {
216     return WriteColumn<ParquetType::INT32>(def_levels, rep_levels, values);
217   }
218 
219   template <typename C_TYPE>
WriteInt64Column(const LevelVector & def_levels,const LevelVector & rep_levels,const std::vector<C_TYPE> & values)220   Status WriteInt64Column(const LevelVector& def_levels, const LevelVector& rep_levels,
221                           const std::vector<C_TYPE>& values) {
222     return WriteColumn<ParquetType::INT64>(def_levels, rep_levels, values);
223   }
224 
225   // Read a Arrow column and check its values
CheckColumn(int column_index,const Array & expected)226   void CheckColumn(int column_index, const Array& expected) {
227     if (!tester_) {
228       ASSERT_OK_AND_ASSIGN(auto buffer, builder_->Finish());
229       ASSERT_OK_AND_ASSIGN(tester_, FileTester::Make(buffer, pool_));
230     }
231     tester_->CheckColumn(column_index, expected);
232   }
233 
CheckColumn(const Array & expected)234   void CheckColumn(const Array& expected) { CheckColumn(/*column_index=*/0, expected); }
235 
236   // One-column shortcut
237   template <ParquetType TYPE, typename C_TYPE = typename ParquetTraits<TYPE>::value_type>
AssertReconstruct(const Array & expected,const LevelVector & def_levels,const LevelVector & rep_levels,const std::vector<C_TYPE> & values)238   void AssertReconstruct(const Array& expected, const LevelVector& def_levels,
239                          const LevelVector& rep_levels,
240                          const std::vector<C_TYPE>& values) {
241     ASSERT_OK((WriteColumn<TYPE, C_TYPE>(def_levels, rep_levels, values)));
242     CheckColumn(/*column_index=*/0, expected);
243   }
244 
MaybeSetParquetSchema(const NodePtr & column)245   ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
246     descriptor_.reset(new SchemaDescriptor());
247     manifest_.reset(new SchemaManifest());
248     group_node_ = GroupNode::Make("root", Repetition::REQUIRED, {column});
249     descriptor_->Init(group_node_);
250     return SchemaManifest::Make(descriptor_.get(),
251                                 std::shared_ptr<const ::arrow::KeyValueMetadata>(),
252                                 ArrowReaderProperties(), manifest_.get());
253   }
254 
SetParquetSchema(const NodePtr & column)255   void SetParquetSchema(const NodePtr& column) {
256     ASSERT_OK(MaybeSetParquetSchema(column));
257   }
258 
259  protected:
260   MemoryPool* pool_;
261   NodePtr group_node_;
262   std::unique_ptr<SchemaDescriptor> descriptor_;
263   std::unique_ptr<SchemaManifest> manifest_;
264 
265   std::shared_ptr<FileBuilder> builder_;
266   std::shared_ptr<FileTester> tester_;
267 };
268 
OneFieldStruct(const std::string & name,std::shared_ptr<DataType> type,bool nullable=true)269 static std::shared_ptr<DataType> OneFieldStruct(const std::string& name,
270                                                 std::shared_ptr<DataType> type,
271                                                 bool nullable = true) {
272   return struct_({field(name, type, nullable)});
273 }
274 
List(std::shared_ptr<DataType> type,bool nullable=true)275 static std::shared_ptr<DataType> List(std::shared_ptr<DataType> type,
276                                       bool nullable = true) {
277   // TODO should field name "element" (Parquet convention for List nodes)
278   // be changed to "item" (Arrow convention for List types)?
279   return list(field("element", type, nullable));
280 }
281 
282 //
283 // Primitive columns with no intermediate group node
284 //
285 
TEST_F(TestReconstructColumn, PrimitiveOptional) {
  // Arrow schema: int32
  const NodePtr leaf =
      PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::INT32);
  SetParquetSchema(leaf);

  // Four slots; the one with def level 0 is expected to read back as null
  const DefLevels defs{1, 0, 1, 1};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5, 6};

  const auto expected = ArrayFromJSON(int32(), "[4, null, 5, 6]");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
297 
TEST_F(TestReconstructColumn, PrimitiveRequired) {
  // Arrow schema: int32 (no nulls expected)
  const NodePtr leaf =
      PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::INT32);
  SetParquetSchema(leaf);

  // A required leaf: the test supplies values only, no levels
  const DefLevels defs{};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5, 6};

  const auto expected = ArrayFromJSON(int32(), "[4, 5, 6]");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
309 
TEST_F(TestReconstructColumn, PrimitiveRepeated) {
  // Arrow schema: list(int32 not null) not null
  const NodePtr leaf =
      PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::INT32);
  SetParquetSchema(leaf);

  // Expected to decode as an empty list, then [4, 5], then [6]
  const DefLevels defs{0, 1, 1, 1};
  const RepLevels reps{0, 0, 1, 0};
  const Int32Vector leaf_values{4, 5, 6};

  // The list's value field carries the Parquet node name
  const auto list_type = list(field("node_name", int32(), /*nullable=*/false));
  const auto expected = ArrayFromJSON(list_type, "[[], [4, 5], [6]]");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
323 
324 //
325 // Struct encodings (one field each)
326 //
327 
TEST_F(TestReconstructColumn, NestedRequiredRequired) {
  // Arrow schema: struct(a: int32 not null) not null
  const NodePtr leaf =
      PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr parent = GroupNode::Make("parent", Repetition::REQUIRED, {leaf});
  SetParquetSchema(parent);

  // Every node is required: the test supplies values only, no levels
  const DefLevels defs{};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5, 6};

  const auto struct_type = OneFieldStruct("a", int32(), /*nullable=*/false);
  const auto expected =
      ArrayFromJSON(struct_type, R"([{"a": 4}, {"a": 5}, {"a": 6}])");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
342 
TEST_F(TestReconstructColumn, NestedOptionalRequired) {
  // Arrow schema: struct(a: int32 not null)
  const NodePtr leaf =
      PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr parent = GroupNode::Make("parent", Repetition::OPTIONAL, {leaf});
  SetParquetSchema(parent);

  // The slot with def level 0 is expected to read back as a null struct
  const DefLevels defs{0, 1, 1, 1};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5, 6};

  const auto struct_type = OneFieldStruct("a", int32(), /*nullable=*/false);
  const auto expected =
      ArrayFromJSON(struct_type, R"([null, {"a": 4}, {"a": 5}, {"a": 6}])");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
357 
TEST_F(TestReconstructColumn, NestedRequiredOptional) {
  // Arrow schema: struct(a: int32) not null
  const NodePtr leaf =
      PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  const NodePtr parent = GroupNode::Make("parent", Repetition::REQUIRED, {leaf});
  SetParquetSchema(parent);

  // The slot with def level 0 is expected to read back as a null field
  // inside a non-null struct
  const DefLevels defs{0, 1, 1, 1};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5, 6};

  const auto struct_type = OneFieldStruct("a", int32());
  const auto expected =
      ArrayFromJSON(struct_type, R"([{"a": null}, {"a": 4}, {"a": 5}, {"a": 6}])");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
372 
TEST_F(TestReconstructColumn, NestedOptionalOptional) {
  // Arrow schema: struct(a: int32)
  const NodePtr leaf =
      PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  const NodePtr parent = GroupNode::Make("parent", Repetition::OPTIONAL, {leaf});
  SetParquetSchema(parent);

  // Expected decoding of def levels: 0 -> null struct, 1 -> null field,
  // 2 -> present value
  const DefLevels defs{0, 1, 2, 2};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5};

  const auto struct_type = OneFieldStruct("a", int32());
  const auto expected =
      ArrayFromJSON(struct_type, R"([null, {"a": null}, {"a": 4}, {"a": 5}])");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
387 
388 //
389 // Nested struct encodings (one field each)
390 //
391 
TEST_F(TestReconstructColumn, NestedRequiredRequiredRequired) {
  // Arrow schema: struct(a: struct(b: int32 not null) not null) not null
  const NodePtr leaf =
      PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr inner = GroupNode::Make("a", Repetition::REQUIRED, {leaf});
  const NodePtr outer = GroupNode::Make("parent", Repetition::REQUIRED, {inner});
  SetParquetSchema(outer);

  // Every node is required: the test supplies values only, no levels
  const DefLevels defs{};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5, 6};

  auto expected =
      ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32(), false), false),
                    R"([{"a": {"b": 4}},
                        {"a": {"b": 5}},
                        {"a": {"b": 6}}
                        ])");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
412 
TEST_F(TestReconstructColumn, NestedRequiredOptionalRequired) {
  // Arrow schema: struct(a: struct(b: int32 not null)) not null
  const NodePtr leaf =
      PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr inner = GroupNode::Make("a", Repetition::OPTIONAL, {leaf});
  const NodePtr outer = GroupNode::Make("parent", Repetition::REQUIRED, {inner});
  SetParquetSchema(outer);

  // The slot with def level 0 is expected to read back as a null inner struct
  const DefLevels defs{1, 0, 1, 1};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5, 6};

  auto expected = ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32(), false)),
                                R"([{"a": {"b": 4}},
                                    {"a": null},
                                    {"a": {"b": 5}},
                                    {"a": {"b": 6}}
                                    ])");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
433 
TEST_F(TestReconstructColumn, NestedOptionalRequiredOptional) {
  // Arrow schema: struct(a: struct(b: int32) not null)
  const NodePtr leaf =
      PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32);
  const NodePtr inner = GroupNode::Make("a", Repetition::REQUIRED, {leaf});
  const NodePtr outer = GroupNode::Make("parent", Repetition::OPTIONAL, {inner});
  SetParquetSchema(outer);

  // Expected decoding of def levels: 0 -> null outer struct, 1 -> null leaf,
  // 2 -> present value
  const DefLevels defs{1, 2, 0, 2, 2};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5, 6};

  auto expected = ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32()), false),
                                R"([{"a": {"b": null}},
                                    {"a": {"b": 4}},
                                    null,
                                    {"a": {"b": 5}},
                                    {"a": {"b": 6}}
                                    ])");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
455 
TEST_F(TestReconstructColumn, NestedOptionalOptionalOptional) {
  // Arrow schema: struct(a: struct(b: int32))
  const NodePtr leaf =
      PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32);
  const NodePtr inner = GroupNode::Make("a", Repetition::OPTIONAL, {leaf});
  const NodePtr outer = GroupNode::Make("parent", Repetition::OPTIONAL, {inner});
  SetParquetSchema(outer);

  // Expected decoding of def levels: 0 -> null outer struct, 1 -> null inner
  // struct, 2 -> null leaf, 3 -> present value
  const DefLevels defs{1, 2, 0, 3, 3, 3};
  const RepLevels reps{};
  const Int32Vector leaf_values{4, 5, 6};

  auto expected = ArrayFromJSON(OneFieldStruct("a", OneFieldStruct("b", int32())),
                                R"([{"a": null},
                                    {"a": {"b": null}},
                                    null,
                                    {"a": {"b": 4}},
                                    {"a": {"b": 5}},
                                    {"a": {"b": 6}}
                                    ])");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
478 
479 //
480 // Struct encodings (two fields)
481 //
482 
TEST_F(TestReconstructColumn, NestedTwoFields1) {
  // Arrow schema: struct(a: int32 not null, b: int64 not null) not null
  const NodePtr leaf_a =
      PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr leaf_b =
      PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64);
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {leaf_a, leaf_b}));

  const RepLevels no_reps{};
  // a
  ASSERT_OK(WriteInt32Column(DefLevels{}, no_reps, Int32Vector{4, 5, 6}));
  // b
  ASSERT_OK(WriteInt64Column(DefLevels{}, no_reps, Int64Vector{7, 8, 9}));

  const auto a_field = field("a", int32(), /*nullable=*/false);
  const auto b_field = field("b", int64(), /*nullable=*/false);
  auto type = struct_({a_field, b_field});
  auto expected = ArrayFromJSON(type, R"([{"a": 4, "b": 7},
                                          {"a": 5, "b": 8},
                                          {"a": 6, "b": 9}])");

  // Both leaves are expected to be reassembled into the single struct column
  CheckColumn(/*column_index=*/0, *expected);
}
501 
TEST_F(TestReconstructColumn, NestedTwoFields2) {
  // Arrow schema: struct(a: int32 not null, b: int64) not null
  const NodePtr leaf_a =
      PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr leaf_b =
      PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT64);
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {leaf_a, leaf_b}));

  const RepLevels no_reps{};
  // a
  ASSERT_OK(WriteInt32Column(DefLevels{}, no_reps, Int32Vector{4, 5, 6}));
  // b (first slot expected to be null)
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, no_reps, Int64Vector{7, 8}));

  const auto a_field = field("a", int32(), /*nullable=*/false);
  const auto b_field = field("b", int64());
  auto type = struct_({a_field, b_field});
  auto expected = ArrayFromJSON(type, R"([{"a": 4, "b": null},
                                          {"a": 5, "b": 7},
                                          {"a": 6, "b": 8}])");

  CheckColumn(/*column_index=*/0, *expected);
}
519 
TEST_F(TestReconstructColumn, NestedTwoFields3) {
  // Arrow schema: struct(a: int32 not null, b: int64 not null)
  const NodePtr leaf_a =
      PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr leaf_b =
      PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64);
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {leaf_a, leaf_b}));

  // Both leaves carry the same def levels; the first slot is expected to be
  // a null struct
  const RepLevels no_reps{};
  // a
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1}, no_reps, Int32Vector{4, 5}));
  // b
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, no_reps, Int64Vector{7, 8}));

  const auto a_field = field("a", int32(), /*nullable=*/false);
  const auto b_field = field("b", int64(), /*nullable=*/false);
  auto type = struct_({a_field, b_field});
  auto expected = ArrayFromJSON(type, R"([null,
                                         {"a": 4, "b": 7},
                                         {"a": 5, "b": 8}])");

  CheckColumn(/*column_index=*/0, *expected);
}
538 
TEST_F(TestReconstructColumn, NestedTwoFields4) {
  // Arrow schema: struct(a: int32, b: int64 not null)
  const NodePtr leaf_a =
      PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  const NodePtr leaf_b =
      PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64);
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {leaf_a, leaf_b}));

  const RepLevels no_reps{};
  // a (null struct, then null field, then value)
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2}, no_reps, Int32Vector{4}));
  // b (null struct, then two values)
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, no_reps, Int64Vector{7, 8}));

  const auto a_field = field("a", int32());
  const auto b_field = field("b", int64(), /*nullable=*/false);
  auto type = struct_({a_field, b_field});
  auto expected = ArrayFromJSON(type, R"([null,
                                         {"a": null, "b": 7},
                                         {"a": 4, "b": 8}])");

  CheckColumn(/*column_index=*/0, *expected);
}
556 
TEST_F(TestReconstructColumn, NestedTwoFields5) {
  // Arrow schema: struct(a: int32, b: int64)
  const NodePtr leaf_a =
      PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  const NodePtr leaf_b =
      PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT64);
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {leaf_a, leaf_b}));

  const RepLevels no_reps{};
  // a (null struct, then null field, then value)
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2}, no_reps, Int32Vector{4}));
  // b (null struct, then value, then null field)
  ASSERT_OK(WriteInt64Column(DefLevels{0, 2, 1}, no_reps, Int64Vector{7}));

  const auto a_field = field("a", int32());
  const auto b_field = field("b", int64());
  auto type = struct_({a_field, b_field});
  auto expected = ArrayFromJSON(type, R"([null,
                                         {"a": null, "b": 7},
                                         {"a": 4, "b": null}])");

  CheckColumn(/*column_index=*/0, *expected);
}
574 
575 //
576 // Nested struct encodings (two fields)
577 //
578 
TEST_F(TestReconstructColumn, NestedNestedTwoFields1) {
  // Arrow schema: struct(a: struct(aa: int32 not null,
  //                                ab: int64 not null) not null,
  //                      b: int32 not null) not null
  const NodePtr leaf_aa =
      PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr leaf_ab =
      PrimitiveNode::Make("ab", Repetition::REQUIRED, ParquetType::INT64);
  const NodePtr group_a =
      GroupNode::Make("a", Repetition::REQUIRED, {leaf_aa, leaf_ab});
  const NodePtr leaf_b =
      PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32);
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {group_a, leaf_b}));

  // Every node is required: values only, no levels
  const DefLevels no_defs{};
  const RepLevels no_reps{};
  // aa
  ASSERT_OK(WriteInt32Column(no_defs, no_reps, Int32Vector{4, 5, 6}));
  // ab
  ASSERT_OK(WriteInt64Column(no_defs, no_reps, Int64Vector{7, 8, 9}));
  // b
  ASSERT_OK(WriteInt32Column(no_defs, no_reps, Int32Vector{10, 11, 12}));

  const auto a_type = struct_({field("aa", int32(), /*nullable=*/false),
                               field("ab", int64(), /*nullable=*/false)});
  auto type = struct_({field("a", a_type, /*nullable=*/false),
                       field("b", int32(), /*nullable=*/false)});
  auto expected = ArrayFromJSON(type, R"([{"a": {"aa": 4, "ab": 7}, "b": 10},
                                          {"a": {"aa": 5, "ab": 8}, "b": 11},
                                          {"a": {"aa": 6, "ab": 9}, "b": 12}])");

  CheckColumn(/*column_index=*/0, *expected);
}
609 
TEST_F(TestReconstructColumn, NestedNestedTwoFields2) {
  // Arrow schema: struct(a: struct(aa: int32,
  //                                ab: int64 not null) not null,
  //                      b: int32 not null) not null
  const NodePtr leaf_aa =
      PrimitiveNode::Make("aa", Repetition::OPTIONAL, ParquetType::INT32);
  const NodePtr leaf_ab =
      PrimitiveNode::Make("ab", Repetition::REQUIRED, ParquetType::INT64);
  const NodePtr group_a =
      GroupNode::Make("a", Repetition::REQUIRED, {leaf_aa, leaf_ab});
  const NodePtr leaf_b =
      PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32);
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {group_a, leaf_b}));

  const DefLevels no_defs{};
  const RepLevels no_reps{};
  // aa (second slot expected to be null)
  ASSERT_OK(WriteInt32Column(DefLevels{1, 0, 1}, no_reps, Int32Vector{4, 5}));
  // ab
  ASSERT_OK(WriteInt64Column(no_defs, no_reps, Int64Vector{7, 8, 9}));
  // b
  ASSERT_OK(WriteInt32Column(no_defs, no_reps, Int32Vector{10, 11, 12}));

  const auto a_type =
      struct_({field("aa", int32()), field("ab", int64(), /*nullable=*/false)});
  auto type = struct_({field("a", a_type, /*nullable=*/false),
                       field("b", int32(), /*nullable=*/false)});
  auto expected = ArrayFromJSON(type, R"([{"a": {"aa": 4, "ab": 7}, "b": 10},
                                          {"a": {"aa": null, "ab": 8}, "b": 11},
                                          {"a": {"aa": 5, "ab": 9}, "b": 12}])");

  CheckColumn(/*column_index=*/0, *expected);
}
640 
TEST_F(TestReconstructColumn, NestedNestedTwoFields3) {
  // Arrow schema: struct(a: struct(aa: int32 not null,
  //                                ab: int64) not null,
  //                      b: int32) not null
  const NodePtr leaf_aa =
      PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr leaf_ab =
      PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64);
  const NodePtr group_a =
      GroupNode::Make("a", Repetition::REQUIRED, {leaf_aa, leaf_ab});
  const NodePtr leaf_b =
      PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32);
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {group_a, leaf_b}));

  const RepLevels no_reps{};
  // aa
  ASSERT_OK(WriteInt32Column(DefLevels{}, no_reps, Int32Vector{4, 5, 6}));
  // ab (first slot expected to be null)
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1}, no_reps, Int64Vector{7, 8}));
  // b (second slot expected to be null)
  ASSERT_OK(WriteInt32Column(DefLevels{1, 0, 1}, no_reps, Int32Vector{10, 11}));

  const auto a_type =
      struct_({field("aa", int32(), /*nullable=*/false), field("ab", int64())});
  auto type = struct_({field("a", a_type, /*nullable=*/false), field("b", int32())});
  auto expected = ArrayFromJSON(type, R"([{"a": {"aa": 4, "ab": null}, "b": 10},
                                          {"a": {"aa": 5, "ab": 7}, "b": null},
                                          {"a": {"aa": 6, "ab": 8}, "b": 11}])");

  CheckColumn(/*column_index=*/0, *expected);
}
671 
TEST_F(TestReconstructColumn, NestedNestedTwoFields4) {
  // Arrow schema: struct(a: struct(aa: int32 not null,
  //                                ab: int64),
  //                      b: int32 not null) not null
  const NodePtr leaf_aa =
      PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32);
  const NodePtr leaf_ab =
      PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64);
  const NodePtr group_a =
      GroupNode::Make("a", Repetition::OPTIONAL, {leaf_aa, leaf_ab});
  const NodePtr leaf_b =
      PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT32);
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {group_a, leaf_b}));

  const RepLevels no_reps{};
  // aa (first slot: struct "a" expected to be null)
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1}, no_reps, Int32Vector{4, 5}));
  // ab (null struct "a", then null field, then value)
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2}, no_reps, Int64Vector{7}));
  // b
  ASSERT_OK(WriteInt32Column(DefLevels{}, no_reps, Int32Vector{10, 11, 12}));

  const auto a_type =
      struct_({field("aa", int32(), /*nullable=*/false), field("ab", int64())});
  auto type = struct_({field("a", a_type), field("b", int32(), /*nullable=*/false)});
  auto expected = ArrayFromJSON(type, R"([{"a": null, "b": 10},
                                          {"a": {"aa": 4, "ab": null}, "b": 11},
                                          {"a": {"aa": 5, "ab": 7}, "b": 12}])");

  CheckColumn(/*column_index=*/0, *expected);
}
700 
TEST_F(TestReconstructColumn, NestedNestedTwoFields5) {
  // Reconstructed Arrow type:
  //   struct(a: struct(aa: int32 not null, ab: int64) not null,
  //          b: int32)
  auto aa_node = PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32);
  auto ab_node = PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64);
  auto a_node = GroupNode::Make("a", Repetition::REQUIRED, {aa_node, ab_node});
  auto b_node = PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32);
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {a_node, b_node}));

  // In all three leaves, def level 0 is the row where the outer struct is null.
  // Leaf "aa"
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1}, RepLevels{}, Int32Vector{4, 5}));
  // Leaf "ab"
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2}, RepLevels{}, Int64Vector{7}));
  // Leaf "b"
  ASSERT_OK(WriteInt32Column(DefLevels{0, 2, 1}, RepLevels{}, Int32Vector{10}));

  auto a_type =
      struct_({field("aa", int32(), /*nullable=*/false), field("ab", int64())});
  auto type = struct_({field("a", a_type, /*nullable=*/false), field("b", int32())});
  auto expected = ArrayFromJSON(type, R"([null,
                                          {"a": {"aa": 4, "ab": null}, "b": 10},
                                          {"a": {"aa": 5, "ab": 7}, "b": null}])");

  CheckColumn(/*column_index=*/0, *expected);
}
731 
TEST_F(TestReconstructColumn, NestedNestedTwoFields6) {
  // Reconstructed Arrow type:
  //   struct(a: struct(aa: int32 not null, ab: int64),
  //          b: int32)
  auto aa_node = PrimitiveNode::Make("aa", Repetition::REQUIRED, ParquetType::INT32);
  auto ab_node = PrimitiveNode::Make("ab", Repetition::OPTIONAL, ParquetType::INT64);
  auto a_node = GroupNode::Make("a", Repetition::OPTIONAL, {aa_node, ab_node});
  auto b_node = PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT32);
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {a_node, b_node}));

  // Four rows: null outer struct, null "a", then two rows with "a" present.
  // Leaf "aa"
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2}, RepLevels{}, Int32Vector{4, 5}));
  // Leaf "ab"
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2, 3}, RepLevels{}, Int64Vector{7}));
  // Leaf "b"
  ASSERT_OK(WriteInt32Column(DefLevels{0, 2, 1, 2}, RepLevels{}, Int32Vector{10, 11}));

  auto a_type =
      struct_({field("aa", int32(), /*nullable=*/false), field("ab", int64())});
  auto type = struct_({field("a", a_type), field("b", int32())});
  auto expected = ArrayFromJSON(type, R"([null,
                                          {"a": null, "b": 10},
                                          {"a": {"aa": 4, "ab": null}, "b": null},
                                          {"a": {"aa": 5, "ab": 7}, "b": 11}])");

  CheckColumn(/*column_index=*/0, *expected);
}
761 
762 //
763 // Three-level list encodings
764 //
765 
TEST_F(TestReconstructColumn, ThreeLevelListRequiredRequired) {
  // Reconstructed Arrow type: list(int32 not null) not null
  auto element_node =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {repeated_node},
                                   LogicalType::List()));

  // Def level 0 is the empty list; def level 1 is a present element.
  LevelVector def{0, 1, 1, 1};
  LevelVector rep{0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  // TODO: decide whether the "element" field name (Parquet convention for
  // List nodes) should become "item" (Arrow convention for List types).
  auto want = ArrayFromJSON(List(int32(), /*nullable=*/false), "[[], [4, 5], [6]]");
  AssertReconstruct<ParquetType::INT32>(*want, def, rep, data);
}
784 
TEST_F(TestReconstructColumn, ThreeLevelListOptionalRequired) {
  // Reconstructed Arrow type: list(int32 not null)
  auto element_node =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_node},
                                   LogicalType::List()));

  // Def level 0: null list, 1: empty list, 2: present element.
  LevelVector def{0, 1, 2, 2, 2};
  LevelVector rep{0, 0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto want =
      ArrayFromJSON(List(int32(), /*nullable=*/false), "[null, [], [4, 5], [6]]");
  AssertReconstruct<ParquetType::INT32>(*want, def, rep, data);
}
802 
TEST_F(TestReconstructColumn, ThreeLevelListRequiredOptional) {
  // Reconstructed Arrow type: list(int32) not null
  auto element_node =
      PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {repeated_node},
                                   LogicalType::List()));

  // Def level 0: empty list, 1: null element, 2: present element.
  LevelVector def{0, 1, 2, 2, 2};
  LevelVector rep{0, 0, 1, 0, 1};
  std::vector<int32_t> data{4, 5, 6};

  auto want = ArrayFromJSON(List(int32()), "[[], [null, 4], [5, 6]]");
  AssertReconstruct<ParquetType::INT32>(*want, def, rep, data);
}
819 
TEST_F(TestReconstructColumn, ThreeLevelListOptionalOptional) {
  // Reconstructed Arrow type: list(int32)
  auto element_node =
      PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_node},
                                   LogicalType::List()));

  // Def level 0: null list, 1: empty list, 2: null element, 3: present element.
  LevelVector def{0, 1, 2, 3, 3, 3};
  LevelVector rep{0, 0, 0, 1, 0, 1};
  std::vector<int32_t> data{4, 5, 6};

  auto want = ArrayFromJSON(List(int32()), "[null, [], [null, 4], [5, 6]]");
  AssertReconstruct<ParquetType::INT32>(*want, def, rep, data);
}
836 
837 //
838 // Legacy list encodings
839 //
840 
TEST_F(TestReconstructColumn, TwoLevelListRequired) {
  // Reconstructed Arrow type: list(int32 not null) not null
  // Legacy layout: the REPEATED primitive sits directly under the LIST group.
  auto repeated_element =
      PrimitiveNode::Make("element", Repetition::REPEATED, ParquetType::INT32);
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {repeated_element},
                                   LogicalType::List()));

  // Def level 0 is the empty list; def level 1 is a present element.
  LevelVector def{0, 1, 1, 1};
  LevelVector rep{0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  // TODO: decide whether the "element" field name (Parquet convention for
  // List nodes) should become "item" (Arrow convention for List types).
  auto want = ArrayFromJSON(List(int32(), /*nullable=*/false), "[[], [4, 5], [6]]");
  AssertReconstruct<ParquetType::INT32>(*want, def, rep, data);
}
857 
TEST_F(TestReconstructColumn, TwoLevelListOptional) {
  // Reconstructed Arrow type: list(int32 not null)
  // Legacy layout: the REPEATED primitive sits directly under the LIST group.
  auto repeated_element =
      PrimitiveNode::Make("element", Repetition::REPEATED, ParquetType::INT32);
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_element},
                                   LogicalType::List()));

  // Def level 0: null list, 1: empty list, 2: present element.
  LevelVector def{0, 1, 2, 2, 2};
  LevelVector rep{0, 0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto want =
      ArrayFromJSON(List(int32(), /*nullable=*/false), "[null, [], [4, 5], [6]]");
  AssertReconstruct<ParquetType::INT32>(*want, def, rep, data);
}
873 
874 //
875 // List-in-struct
876 //
877 
TEST_F(TestReconstructColumn, NestedList1) {
  // Column "a" is a struct whose single field "p" is a list:
  //   struct(p: list(int32 not null) not null) not null
  auto element_node =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  auto p_node =
      GroupNode::Make("p", Repetition::REQUIRED, {repeated_node}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("a", Repetition::REQUIRED, {p_node}));

  // Def level 0 is the empty list; def level 1 is a present element.
  LevelVector def{0, 1, 1, 1};
  LevelVector rep{0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto p_type = List(int32(), /*nullable=*/false);
  auto type = OneFieldStruct("p", p_type, /*nullable=*/false);
  auto expected = ArrayFromJSON(type, R"([{"p": []},
                                          {"p": [4, 5]},
                                          {"p": [6]}])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
900 
TEST_F(TestReconstructColumn, NestedList2) {
  // Column "a" is a struct whose single field "p" is a list:
  //   struct(p: list(int32 not null) not null)
  auto element_node =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  auto p_node =
      GroupNode::Make("p", Repetition::REQUIRED, {repeated_node}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("a", Repetition::OPTIONAL, {p_node}));

  // Def level 0: null struct, 1: empty list, 2: present element.
  LevelVector def{0, 1, 2, 2, 2};
  LevelVector rep{0, 0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto p_type = List(int32(), /*nullable=*/false);
  auto type = OneFieldStruct("p", p_type, /*nullable=*/false);
  auto expected = ArrayFromJSON(type, R"([null,
                                          {"p": []},
                                          {"p": [4, 5]},
                                          {"p": [6]}])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
924 
TEST_F(TestReconstructColumn, NestedList3) {
  // Column "a" is a struct whose single field "p" is a list:
  //   struct(p: list(int32 not null)) not null
  auto element_node =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  // "p" is the field name inside the struct.
  auto p_node =
      GroupNode::Make("p", Repetition::OPTIONAL, {repeated_node}, LogicalType::List());
  // "a" is the column name; the column is the struct itself.
  SetParquetSchema(GroupNode::Make("a", Repetition::REQUIRED, {p_node}));

  // Def level 0: null list, 1: empty list, 2: present element.
  LevelVector def{0, 1, 2, 2, 2};
  LevelVector rep{0, 0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto p_type = List(int32(), /*nullable=*/false);
  auto type = OneFieldStruct("p", p_type);
  auto expected = ArrayFromJSON(type, R"([{"p": null},
                                          {"p": []},
                                          {"p": [4, 5]},
                                          {"p": [6]}])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
947 
TEST_F(TestReconstructColumn, NestedList4) {
  // Column "a" is a struct whose single field "p" is a list:
  //   struct(p: list(int32 not null))
  auto element_node =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  auto p_node =
      GroupNode::Make("p", Repetition::OPTIONAL, {repeated_node}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("a", Repetition::OPTIONAL, {p_node}));

  // Def level 0: null struct, 1: null list, 2: empty list, 3: present element.
  LevelVector def{0, 1, 2, 3, 3, 3};
  LevelVector rep{0, 0, 0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto p_type = List(int32(), /*nullable=*/false);
  auto type = OneFieldStruct("p", p_type);
  auto expected = ArrayFromJSON(type, R"([null,
                                          {"p": null},
                                          {"p": []},
                                          {"p": [4, 5]},
                                          {"p": [6]}])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
971 
TEST_F(TestReconstructColumn, NestedList5) {
  // Column "a" is a struct whose single field "p" is a list:
  //   struct(p: list(int32) not null)
  auto element_node =
      PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  auto p_node =
      GroupNode::Make("p", Repetition::REQUIRED, {repeated_node}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("a", Repetition::OPTIONAL, {p_node}));

  // Def level 0: null struct, 1: empty list, 2: null element, 3: present element.
  LevelVector def{0, 1, 3, 2, 3, 3};
  LevelVector rep{0, 0, 0, 1, 0, 1};
  std::vector<int32_t> data{4, 5, 6};

  auto p_type = List(int32());
  auto type = OneFieldStruct("p", p_type, /*nullable=*/false);
  auto expected = ArrayFromJSON(type, R"([null,
                                          {"p": []},
                                          {"p": [4, null]},
                                          {"p": [5, 6]}])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
994 
TEST_F(TestReconstructColumn, NestedList6) {
  // Column "a" is a struct whose single field "p" is a list:
  //   struct(p: list(int32))
  auto element_node =
      PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  auto p_node =
      GroupNode::Make("p", Repetition::OPTIONAL, {repeated_node}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("a", Repetition::OPTIONAL, {p_node}));

  // Def level 0: null struct, 1: null list, 2: empty list, 3: null element,
  // 4: present element.
  LevelVector def{0, 1, 2, 4, 3, 4, 4};
  LevelVector rep{0, 0, 0, 0, 1, 0, 1};
  std::vector<int32_t> data{4, 5, 6};

  auto p_type = List(int32());
  auto type = OneFieldStruct("p", p_type);
  auto expected = ArrayFromJSON(type, R"([null,
                                          {"p": null},
                                          {"p": []},
                                          {"p": [4, null]},
                                          {"p": [5, 6]}])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
1018 
1019 //
1020 // Struct-in-list
1021 //
1022 
TEST_F(TestReconstructColumn, ListNested1) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32 not null) not null) not null
  auto a_node = PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  auto element_node = GroupNode::Make("element", Repetition::REQUIRED, {a_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {repeated_node},
                                   LogicalType::List()));

  // Def level 0 is the empty list; def level 1 is a present struct element.
  LevelVector def{0, 1, 1, 1};
  LevelVector rep{0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto element_type = OneFieldStruct("a", int32(), /*nullable=*/false);
  auto type = List(element_type, /*nullable=*/false);
  auto expected = ArrayFromJSON(type,
                                R"([[],
                                    [{"a": 4}, {"a": 5}],
                                    [{"a": 6}]])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
1045 
TEST_F(TestReconstructColumn, ListNested2) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32 not null) not null)
  auto a_node = PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  auto element_node = GroupNode::Make("element", Repetition::REQUIRED, {a_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_node},
                                   LogicalType::List()));

  // Def level 0: null list, 1: empty list, 2: present struct element.
  LevelVector def{0, 1, 2, 2, 2};
  LevelVector rep{0, 0, 0, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto element_type = OneFieldStruct("a", int32(), /*nullable=*/false);
  auto type = List(element_type, /*nullable=*/false);
  auto expected = ArrayFromJSON(type,
                                R"([null,
                                    [],
                                    [{"a": 4}, {"a": 5}],
                                    [{"a": 6}]])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
1069 
TEST_F(TestReconstructColumn, ListNested3) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32 not null)) not null
  auto a_node = PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  auto element_node = GroupNode::Make("element", Repetition::OPTIONAL, {a_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {repeated_node},
                                   LogicalType::List()));

  // Def level 0: empty list, 1: null struct element, 2: present struct element.
  LevelVector def{0, 1, 2, 2, 2};
  LevelVector rep{0, 0, 1, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto element_type = OneFieldStruct("a", int32(), /*nullable=*/false);
  auto type = List(element_type);
  auto expected = ArrayFromJSON(type,
                                R"([[],
                                    [null, {"a": 4}, {"a": 5}],
                                    [{"a": 6}]])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
1091 
TEST_F(TestReconstructColumn, ListNested4) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32 not null))
  auto a_node = PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  auto element_node = GroupNode::Make("element", Repetition::OPTIONAL, {a_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_node},
                                   LogicalType::List()));

  // Def level 0: null list, 1: empty list, 2: null struct element,
  // 3: present struct element.
  LevelVector def{0, 1, 2, 3, 3, 3};
  LevelVector rep{0, 0, 0, 1, 1, 0};
  std::vector<int32_t> data{4, 5, 6};

  auto element_type = OneFieldStruct("a", int32(), /*nullable=*/false);
  auto type = List(element_type);
  auto expected = ArrayFromJSON(type,
                                R"([null,
                                    [],
                                    [null, {"a": 4}, {"a": 5}],
                                    [{"a": 6}]])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
1114 
TEST_F(TestReconstructColumn, ListNested5) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32) not null)
  auto a_node = PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  auto element_node = GroupNode::Make("element", Repetition::REQUIRED, {a_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_node},
                                   LogicalType::List()));

  // Def level 0: null list, 1: empty list, 2: null "a", 3: present "a".
  LevelVector def{0, 1, 2, 3, 3, 3};
  LevelVector rep{0, 0, 0, 1, 0, 1};
  std::vector<int32_t> data{4, 5, 6};

  auto element_type = OneFieldStruct("a", int32());
  auto type = List(element_type, /*nullable=*/false);
  auto expected = ArrayFromJSON(type,
                                R"([null,
                                    [],
                                    [{"a": null}, {"a": 4}],
                                    [{"a": 5}, {"a": 6}]])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
1138 
TEST_F(TestReconstructColumn, ListNested6) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32))
  auto a_node = PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  auto element_node = GroupNode::Make("element", Repetition::OPTIONAL, {a_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_node},
                                   LogicalType::List()));

  // Def level 0: null list, 1: empty list, 2: null struct element,
  // 3: null "a", 4: present "a".
  LevelVector def{0, 1, 2, 3, 4, 4, 4};
  LevelVector rep{0, 0, 0, 1, 1, 0, 1};
  std::vector<int32_t> data{4, 5, 6};

  auto element_type = OneFieldStruct("a", int32());
  auto type = List(element_type);
  auto expected = ArrayFromJSON(type,
                                R"([null,
                                    [],
                                    [null, {"a": null}, {"a": 4}],
                                    [{"a": 5}, {"a": 6}]])");
  AssertReconstruct<ParquetType::INT32>(*expected, def, rep, data);
}
1161 
1162 //
1163 // Struct (two fields)-in-list
1164 //
1165 
TEST_F(TestReconstructColumn, ListNestedTwoFields1) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32 not null,
  //               b: int64 not null) not null) not null
  auto a_node = PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  auto b_node = PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64);
  auto element_node = GroupNode::Make("element", Repetition::REQUIRED, {a_node, b_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {repeated_node},
                                   LogicalType::List()));

  // Both leaves are written with the same def/rep levels.
  // Leaf "a"
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0},
                             Int32Vector{4, 5, 6}));
  // Leaf "b"
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0},
                             Int64Vector{7, 8, 9}));

  auto element_type = struct_({field("a", int32(), /*nullable=*/false),
                               field("b", int64(), /*nullable=*/false)});
  auto type = List(element_type, /*nullable=*/false);
  auto expected = ArrayFromJSON(type,
                                R"([[],
                                    [{"a": 4, "b": 7}, {"a": 5, "b": 8}],
                                    [{"a": 6, "b": 9}]])");
  CheckColumn(/*column_index=*/0, *expected);
}
1195 
TEST_F(TestReconstructColumn, ListNestedTwoFields2) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32,
  //               b: int64 not null) not null) not null
  auto a_node = PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  auto b_node = PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64);
  auto element_node = GroupNode::Make("element", Repetition::REQUIRED, {a_node, b_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {repeated_node},
                                   LogicalType::List()));

  // Leaf "a": def level 1 is a null "a" inside a present element.
  ASSERT_OK(
      WriteInt32Column(DefLevels{0, 2, 1, 2}, RepLevels{0, 0, 1, 0}, Int32Vector{4, 5}));
  // Leaf "b"
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0},
                             Int64Vector{7, 8, 9}));

  auto element_type =
      struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)});
  auto type = List(element_type, /*nullable=*/false);
  auto expected = ArrayFromJSON(type,
                                R"([[],
                                    [{"a": 4, "b": 7}, {"a": null, "b": 8}],
                                    [{"a": 5, "b": 9}]])");
  CheckColumn(/*column_index=*/0, *expected);
}
1225 
TEST_F(TestReconstructColumn, ListNestedTwoFields3) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32 not null,
  //               b: int64 not null)) not null
  auto a_node = PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT32);
  auto b_node = PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64);
  auto element_node = GroupNode::Make("element", Repetition::OPTIONAL, {a_node, b_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {repeated_node},
                                   LogicalType::List()));

  // Both leaves share levels; def level 1 is a null struct element.
  // Leaf "a"
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 1, 1, 0},
                             Int32Vector{4, 5, 6}));
  // Leaf "b"
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 1, 1, 0},
                             Int64Vector{7, 8, 9}));

  auto element_type = struct_({field("a", int32(), /*nullable=*/false),
                               field("b", int64(), /*nullable=*/false)});
  auto type = List(element_type);
  auto expected = ArrayFromJSON(type,
                                R"([[],
                                    [null, {"a": 4, "b": 7}, {"a": 5, "b": 8}],
                                    [{"a": 6, "b": 9}]])");
  CheckColumn(/*column_index=*/0, *expected);
}
1254 
TEST_F(TestReconstructColumn, ListNestedTwoFields4) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32,
  //               b: int64 not null) not null)
  auto a_node = PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  auto b_node = PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64);
  auto element_node = GroupNode::Make("element", Repetition::REQUIRED, {a_node, b_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_node},
                                   LogicalType::List()));

  // Leaf "a": def level 2 is a null "a" inside a present element.
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 3, 2, 3}, RepLevels{0, 0, 0, 1, 0},
                             Int32Vector{4, 5}));
  // Leaf "b"
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 0, 1, 0},
                             Int64Vector{7, 8, 9}));

  auto element_type =
      struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)});
  auto type = List(element_type, /*nullable=*/false);
  auto expected = ArrayFromJSON(type,
                                R"([null,
                                    [],
                                    [{"a": 4, "b": 7}, {"a": null, "b": 8}],
                                    [{"a": 5, "b": 9}]])");
  CheckColumn(/*column_index=*/0, *expected);
}
1285 
TEST_F(TestReconstructColumn, ListNestedTwoFields5) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32,
  //               b: int64 not null))
  auto a_node = PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  auto b_node = PrimitiveNode::Make("b", Repetition::REQUIRED, ParquetType::INT64);
  auto element_node = GroupNode::Make("element", Repetition::OPTIONAL, {a_node, b_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_node},
                                   LogicalType::List()));

  // Leaf "a": def level 2 is a null struct element, 3 is a null "a".
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 4, 2, 3}, RepLevels{0, 0, 0, 1, 0},
                             Int32Vector{4}));
  // Leaf "b": def level 2 is a null struct element, 3 is a present "b".
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 3, 2, 3}, RepLevels{0, 0, 0, 1, 0},
                             Int64Vector{7, 8}));

  auto element_type =
      struct_({field("a", int32()), field("b", int64(), /*nullable=*/false)});
  auto type = List(element_type);
  auto expected = ArrayFromJSON(type,
                                R"([null,
                                    [],
                                    [{"a": 4, "b": 7}, null],
                                    [{"a": null, "b": 8}]])");
  CheckColumn(/*column_index=*/0, *expected);
}
1315 
TEST_F(TestReconstructColumn, ListNestedTwoFields6) {
  // Reconstructed Arrow type:
  //   list(struct(a: int32,
  //               b: int64))
  auto a_node = PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT32);
  auto b_node = PrimitiveNode::Make("b", Repetition::OPTIONAL, ParquetType::INT64);
  auto element_node = GroupNode::Make("element", Repetition::OPTIONAL, {a_node, b_node});
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {repeated_node},
                                   LogicalType::List()));

  // In both leaves, def level 2 is a null struct element, 3 is a null field
  // and 4 is a present value.
  // Leaf "a"
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 4, 2, 3}, RepLevels{0, 0, 0, 1, 0},
                             Int32Vector{4}));
  // Leaf "b"
  ASSERT_OK(WriteInt64Column(DefLevels{0, 1, 3, 2, 4}, RepLevels{0, 0, 0, 1, 0},
                             Int64Vector{7}));

  auto element_type = struct_({field("a", int32()), field("b", int64())});
  auto type = List(element_type);
  auto expected = ArrayFromJSON(type,
                                R"([null,
                                    [],
                                    [{"a": 4, "b": null}, null],
                                    [{"a": null, "b": 7}]])");
  CheckColumn(/*column_index=*/0, *expected);
}
1344 
1345 //
1346 // List-in-struct (two fields)
1347 //
1348 
TEST_F(TestReconstructColumn, NestedTwoFieldsList1) {
  // Reconstructed Arrow type:
  //   struct(a: int64 not null,
  //          b: list(int32 not null) not null) not null
  auto a_node = PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT64);
  auto element_node =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  auto repeated_node = GroupNode::Make("list", Repetition::REPEATED, {element_node});
  auto b_node =
      GroupNode::Make("b", Repetition::REQUIRED, {repeated_node}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {a_node, b_node}));

  // Leaf "a": written without levels, one value per row.
  ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{4, 5, 6}));
  // Leaf "b": def level 0 is the empty list of the first row.
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 1, 1}, RepLevels{0, 0, 1, 0},
                             Int32Vector{7, 8, 9}));

  auto b_type = List(int32(), /*nullable=*/false);
  auto type = struct_({field("a", int64(), /*nullable=*/false),
                       field("b", b_type, /*nullable=*/false)});
  auto expected = ArrayFromJSON(type,
                                R"([{"a": 4, "b": []},
                                    {"a": 5, "b": [7, 8]},
                                    {"a": 6, "b": [9]}])");
  CheckColumn(/*column_index=*/0, *expected);
}
1378 
TEST_F(TestReconstructColumn, NestedTwoFieldsList2) {
  // Arrow schema: struct(a: int64 not null,
  //                      b: list(int32 not null)
  //                     ) not null
  //
  // Group `b` is OPTIONAL, so its column distinguishes a null list (def 0), an
  // empty list (def 1) and an actual element (def 2).
  const auto a_node =
      PrimitiveNode::Make("a", Repetition::REQUIRED, ParquetType::INT64);
  const auto b_element =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  const auto b_repeated = GroupNode::Make("list", Repetition::REPEATED, {b_element});
  const auto b_node =
      GroupNode::Make("b", Repetition::OPTIONAL, {b_repeated}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {a_node, b_node}));

  // Column `a`: one value per row, written with empty level vectors.
  ASSERT_OK(WriteInt64Column(DefLevels{}, RepLevels{}, Int64Vector{3, 4, 5, 6}));
  // Column `b`, read as (def, rep) pairs:
  //   (0, 0) -> row 0: null list
  //   (1, 0) -> row 1: empty list
  //   (2, 0) -> row 2: 7 opens the list
  //   (2, 1) -> row 2: 8 continues it
  //   (2, 0) -> row 3: 9 opens the list
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 0, 1, 0},
                             Int32Vector{7, 8, 9}));

  const auto a_field = field("a", int64(), /*nullable=*/false);
  const auto b_field = field("b", List(int32(), /*nullable=*/false));
  const auto struct_type = struct_({a_field, b_field});
  auto expected = ArrayFromJSON(struct_type,
                                R"([{"a": 3, "b": null},
                                    {"a": 4, "b": []},
                                    {"a": 5, "b": [7, 8]},
                                    {"a": 6, "b": [9]}])");
  CheckColumn(/*column_index=*/0, *expected);
}
1408 
TEST_F(TestReconstructColumn, NestedTwoFieldsList3) {
  // Arrow schema: struct(a: int64,
  //                      b: list(int32 not null)
  //                     ) not null
  //
  // Both children are OPTIONAL inside a REQUIRED parent: `a` uses def 0/1 for
  // null/present, `b` uses def 0/1/2 for null list/empty list/element.
  const auto a_node =
      PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT64);
  const auto b_element =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  const auto b_repeated = GroupNode::Make("list", Repetition::REPEATED, {b_element});
  const auto b_node =
      GroupNode::Make("b", Repetition::OPTIONAL, {b_repeated}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {a_node, b_node}));

  // Column `a`: the third row (def 0) is null, the others take 4, 5, 6 in order.
  ASSERT_OK(WriteInt64Column(DefLevels{1, 1, 0, 1}, RepLevels{}, Int64Vector{4, 5, 6}));
  // Column `b`, read as (def, rep) pairs:
  //   (0, 0) -> row 0: null list
  //   (1, 0) -> row 1: empty list
  //   (2, 0) -> row 2: 7 opens the list
  //   (2, 1) -> row 2: 8 continues it
  //   (2, 0) -> row 3: 9 opens the list
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 2, 2}, RepLevels{0, 0, 0, 1, 0},
                             Int32Vector{7, 8, 9}));

  const auto a_field = field("a", int64());
  const auto b_field = field("b", List(int32(), /*nullable=*/false));
  const auto struct_type = struct_({a_field, b_field});
  auto expected = ArrayFromJSON(struct_type,
                                R"([{"a": 4, "b": null},
                                    {"a": 5, "b": []},
                                    {"a": null, "b": [7, 8]},
                                    {"a": 6, "b": [9]}])");
  CheckColumn(/*column_index=*/0, *expected);
}
1438 
TEST_F(TestReconstructColumn, NestedTwoFieldsList4) {
  // Arrow schema: struct(a: int64,
  //                      b: list(int32 not null)
  //                     )
  //
  // The parent struct itself is OPTIONAL, so def level 0 in either column means
  // "the whole struct is null" and every other level shifts up by one.
  const auto a_node =
      PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT64);
  const auto b_element =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  const auto b_repeated = GroupNode::Make("list", Repetition::REPEATED, {b_element});
  const auto b_node =
      GroupNode::Make("b", Repetition::OPTIONAL, {b_repeated}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {a_node, b_node}));

  // Column `a`: def 0 -> null struct, def 1 -> null `a`, def 2 -> next value.
  ASSERT_OK(
      WriteInt64Column(DefLevels{0, 2, 2, 1, 2}, RepLevels{}, Int64Vector{4, 5, 6}));
  // Column `b`, read as (def, rep) pairs:
  //   (0, 0) -> row 0: null struct
  //   (1, 0) -> row 1: null list
  //   (2, 0) -> row 2: empty list
  //   (3, 0) -> row 3: 7 opens the list
  //   (3, 1) -> row 3: 8 continues it
  //   (3, 0) -> row 4: 9 opens the list
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 3, 3, 3}, RepLevels{0, 0, 0, 0, 1, 0},
                             Int32Vector{7, 8, 9}));

  const auto a_field = field("a", int64());
  const auto b_field = field("b", List(int32(), /*nullable=*/false));
  const auto struct_type = struct_({a_field, b_field});
  auto expected = ArrayFromJSON(struct_type,
                                R"([null,
                                    {"a": 4, "b": null},
                                    {"a": 5, "b": []},
                                    {"a": null, "b": [7, 8]},
                                    {"a": 6, "b": [9]}])");
  CheckColumn(/*column_index=*/0, *expected);
}
1470 
TEST_F(TestReconstructColumn, NestedTwoFieldsList5) {
  // Arrow schema: struct(a: int64, b: list(int32))
  //
  // Everything that can be OPTIONAL is OPTIONAL, including the list elements, so
  // column `b` needs def 3 for a null element and def 4 for an actual value.
  const auto a_node =
      PrimitiveNode::Make("a", Repetition::OPTIONAL, ParquetType::INT64);
  const auto b_element =
      PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32);
  const auto b_repeated = GroupNode::Make("list", Repetition::REPEATED, {b_element});
  const auto b_node =
      GroupNode::Make("b", Repetition::OPTIONAL, {b_repeated}, LogicalType::List());
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {a_node, b_node}));

  // Column `a`: def 0 -> null struct, def 1 -> null `a`, def 2 -> next value.
  ASSERT_OK(
      WriteInt64Column(DefLevels{0, 2, 2, 1, 2}, RepLevels{}, Int64Vector{4, 5, 6}));
  // Column `b`, read as (def, rep) pairs:
  //   (0, 0) -> row 0: null struct
  //   (1, 0) -> row 1: null list
  //   (2, 0) -> row 2: empty list
  //   (4, 0) -> row 3: 7 opens the list
  //   (3, 1) -> row 3: a null element continues it
  //   (4, 0) -> row 4: 8 opens the list
  ASSERT_OK(WriteInt32Column(DefLevels{0, 1, 2, 4, 3, 4}, RepLevels{0, 0, 0, 0, 1, 0},
                             Int32Vector{7, 8}));

  const auto a_field = field("a", int64());
  const auto b_field = field("b", List(int32()));
  const auto struct_type = struct_({a_field, b_field});
  auto expected = ArrayFromJSON(struct_type,
                                R"([null,
                                    {"a": 4, "b": null},
                                    {"a": 5, "b": []},
                                    {"a": null, "b": [7, null]},
                                    {"a": 6, "b": [8]}])");
  CheckColumn(/*column_index=*/0, *expected);
}
1499 
1500 //
1501 // List-in-list
1502 //
1503 
TEST_F(TestReconstructColumn, ListList1) {
  // Arrow schema: list(list(int32 not null) not null) not null
  //
  // No node is OPTIONAL; only the two REPEATED groups contribute levels.
  const auto leaf =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  const auto inner_repeated = GroupNode::Make("list", Repetition::REPEATED, {leaf});
  const auto inner_list = GroupNode::Make("element", Repetition::REQUIRED,
                                          {inner_repeated}, LogicalType::List());
  const auto outer_repeated =
      GroupNode::Make("list", Repetition::REPEATED, {inner_list});
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {outer_repeated},
                                   LogicalType::List()));

  // Read as (def, rep) pairs:
  //   (0, 0) -> row 0: empty outer list
  //   (1, 0) -> row 1: empty inner list
  //   (2, 1) -> row 1: new inner list holding 4
  //   (2, 0) -> row 2: new inner list holding 5
  //   (2, 2) -> row 2: 6 appended to that inner list
  LevelVector defs{0, 1, 2, 2, 2};
  LevelVector reps{0, 0, 1, 0, 2};
  std::vector<int32_t> leaf_values{4, 5, 6};

  const auto inner_type = List(int32(), /*nullable=*/false);
  const auto list_type = List(inner_type, /*nullable=*/false);
  const auto expected = ArrayFromJSON(list_type, "[[], [[], [4]], [[5, 6]]]");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
1525 
TEST_F(TestReconstructColumn, ListList2) {
  // Arrow schema: list(list(int32 not null) not null)
  //
  // Only the outermost group is OPTIONAL, so def level 0 marks a null outer list.
  const auto leaf =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  const auto inner_repeated = GroupNode::Make("list", Repetition::REPEATED, {leaf});
  const auto inner_list = GroupNode::Make("element", Repetition::REQUIRED,
                                          {inner_repeated}, LogicalType::List());
  const auto outer_repeated =
      GroupNode::Make("list", Repetition::REPEATED, {inner_list});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {outer_repeated},
                                   LogicalType::List()));

  // Read as (def, rep) pairs:
  //   (0, 0) -> row 0: null outer list
  //   (1, 0) -> row 1: empty outer list
  //   (2, 0) -> row 2: empty inner list
  //   (3, 1) -> row 2: new inner list holding 4
  //   (3, 0) -> row 3: new inner list holding 5
  //   (3, 2) -> row 3: 6 appended to that inner list
  LevelVector defs{0, 1, 2, 3, 3, 3};
  LevelVector reps{0, 0, 0, 1, 0, 2};
  std::vector<int32_t> leaf_values{4, 5, 6};

  const auto inner_type = List(int32(), /*nullable=*/false);
  const auto list_type = List(inner_type, /*nullable=*/false);
  const auto expected = ArrayFromJSON(list_type, "[null, [], [[], [4]], [[5, 6]]]");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
1547 
TEST_F(TestReconstructColumn, ListList3) {
  // Arrow schema: list(list(int32 not null)) not null
  //
  // Only the inner list group is OPTIONAL, so def level 1 marks a null inner list
  // and def level 2 an empty one.
  const auto leaf =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  const auto inner_repeated = GroupNode::Make("list", Repetition::REPEATED, {leaf});
  const auto inner_list = GroupNode::Make("element", Repetition::OPTIONAL,
                                          {inner_repeated}, LogicalType::List());
  const auto outer_repeated =
      GroupNode::Make("list", Repetition::REPEATED, {inner_list});
  SetParquetSchema(GroupNode::Make("parent", Repetition::REQUIRED, {outer_repeated},
                                   LogicalType::List()));

  // Read as (def, rep) pairs:
  //   (0, 0) -> row 0: empty outer list
  //   (1, 0) -> row 1: null inner list
  //   (2, 1) -> row 1: empty inner list
  //   (3, 0) -> row 2: new inner list holding 4
  //   (3, 1) -> row 2: new inner list holding 5
  //   (3, 2) -> row 2: 6 appended to that inner list
  LevelVector defs{0, 1, 2, 3, 3, 3};
  LevelVector reps{0, 0, 1, 0, 1, 2};
  std::vector<int32_t> leaf_values{4, 5, 6};

  const auto inner_type = List(int32(), /*nullable=*/false);
  const auto list_type = List(inner_type);
  const auto expected = ArrayFromJSON(list_type, "[[], [null, []], [[4], [5, 6]]]");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
1569 
TEST_F(TestReconstructColumn, ListList4) {
  // Arrow schema: list(list(int32 not null))
  //
  // Both list groups are OPTIONAL while the leaf is REQUIRED, so values sit at
  // def level 4.
  const auto leaf =
      PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32);
  const auto inner_repeated = GroupNode::Make("list", Repetition::REPEATED, {leaf});
  const auto inner_list = GroupNode::Make("element", Repetition::OPTIONAL,
                                          {inner_repeated}, LogicalType::List());
  const auto outer_repeated =
      GroupNode::Make("list", Repetition::REPEATED, {inner_list});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {outer_repeated},
                                   LogicalType::List()));

  // Read as (def, rep) pairs:
  //   (0, 0) -> row 0: null outer list
  //   (1, 0) -> row 1: empty outer list
  //   (2, 0) -> row 2: null inner list
  //   (3, 1) -> row 2: empty inner list
  //   (4, 1) -> row 2: new inner list holding 4
  //   (4, 0) -> row 3: new inner list holding 5
  //   (4, 2) -> row 3: 6 appended to that inner list
  LevelVector defs{0, 1, 2, 3, 4, 4, 4};
  LevelVector reps{0, 0, 0, 1, 1, 0, 2};
  std::vector<int32_t> leaf_values{4, 5, 6};

  const auto inner_type = List(int32(), /*nullable=*/false);
  const auto list_type = List(inner_type);
  const auto expected =
      ArrayFromJSON(list_type, "[null, [], [null, [], [4]], [[5, 6]]]");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
1591 
TEST_F(TestReconstructColumn, ListList5) {
  // Arrow schema: list(list(int32) not null)
  //
  // The outer list and the leaf are OPTIONAL, the inner list is REQUIRED: def
  // level 3 is a null leaf and def level 4 an actual value.
  const auto leaf =
      PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32);
  const auto inner_repeated = GroupNode::Make("list", Repetition::REPEATED, {leaf});
  const auto inner_list = GroupNode::Make("element", Repetition::REQUIRED,
                                          {inner_repeated}, LogicalType::List());
  const auto outer_repeated =
      GroupNode::Make("list", Repetition::REPEATED, {inner_list});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {outer_repeated},
                                   LogicalType::List()));

  // Read as (def, rep) pairs:
  //   (0, 0) -> row 0: null outer list
  //   (1, 0) -> row 1: empty outer list
  //   (2, 0) -> row 2: empty inner list
  //   (4, 1) -> row 2: new inner list holding 4
  //   (4, 0) -> row 3: new inner list holding 5
  //   (3, 1) -> row 3: new inner list opening with a null element
  //   (4, 2) -> row 3: 6 appended to that inner list
  LevelVector defs{0, 1, 2, 4, 4, 3, 4};
  LevelVector reps{0, 0, 0, 1, 0, 1, 2};
  std::vector<int32_t> leaf_values{4, 5, 6};

  const auto inner_type = List(int32());
  const auto list_type = List(inner_type, /*nullable=*/false);
  const auto expected =
      ArrayFromJSON(list_type, "[null, [], [[], [4]], [[5], [null, 6]]]");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
1613 
TEST_F(TestReconstructColumn, ListList6) {
  // Arrow schema: list(list(int32))
  //
  // Every nullable position is OPTIONAL, so values sit at def level 5 and a null
  // leaf at def level 4.
  const auto leaf =
      PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::INT32);
  const auto inner_repeated = GroupNode::Make("list", Repetition::REPEATED, {leaf});
  const auto inner_list = GroupNode::Make("element", Repetition::OPTIONAL,
                                          {inner_repeated}, LogicalType::List());
  const auto outer_repeated =
      GroupNode::Make("list", Repetition::REPEATED, {inner_list});
  SetParquetSchema(GroupNode::Make("parent", Repetition::OPTIONAL, {outer_repeated},
                                   LogicalType::List()));

  // Read as (def, rep) pairs:
  //   (0, 0) -> row 0: null outer list
  //   (1, 0) -> row 1: empty outer list
  //   (2, 0) -> row 2: null inner list
  //   (3, 1) -> row 2: empty inner list
  //   (4, 1) -> row 2: new inner list opening with a null element
  //   (5, 2) -> row 2: 4 appended to that inner list
  //   (5, 0) -> row 3: new inner list holding 5
  //   (5, 2) -> row 3: 6 appended to that inner list
  LevelVector defs{0, 1, 2, 3, 4, 5, 5, 5};
  LevelVector reps{0, 0, 0, 1, 1, 2, 0, 2};
  std::vector<int32_t> leaf_values{4, 5, 6};

  const auto inner_type = List(int32());
  const auto list_type = List(inner_type);
  const auto expected =
      ArrayFromJSON(list_type, "[null, [], [null, [], [null, 4]], [[5, 6]]]");
  AssertReconstruct<ParquetType::INT32>(*expected, defs, reps, leaf_values);
}
1635 
1636 // TODO legacy-list-in-struct etc.?
1637 
1638 }  // namespace arrow
1639 }  // namespace parquet
1640