1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "gtest/gtest.h"
19
20 #include "arrow/table.h"
21 #include "arrow/testing/gtest_util.h"
22
23 #include "parquet/api/reader.h"
24 #include "parquet/api/writer.h"
25
26 #include "parquet/arrow/schema.h"
27 #include "parquet/arrow/writer.h"
28 #include "parquet/file_writer.h"
29 #include "parquet/test_util.h"
30
31 using arrow::ArrayFromJSON;
32 using arrow::Buffer;
33 using arrow::default_memory_pool;
34 using arrow::ResizableBuffer;
35 using arrow::Table;
36
37 using arrow::io::BufferReader;
38
39 namespace parquet {
40 namespace arrow {
41
42 struct StatisticsTestParam {
43 std::shared_ptr<::arrow::Table> table;
44 int expected_null_count;
45 // This is the non-null count and not the num_values in the page headers.
46 int expected_value_count;
47 std::string expected_min;
48 std::string expected_max;
49 };
50
51 // Define a custom print since the default Googletest print trips Valgrind
PrintTo(const StatisticsTestParam & param,std::ostream * os)52 void PrintTo(const StatisticsTestParam& param, std::ostream* os) {
53 (*os) << "StatisticsTestParam{"
54 << "table.schema=" << param.table->schema()->ToString()
55 << ", expected_null_count=" << param.expected_null_count
56 << ", expected_value_count=" << param.expected_value_count
57 << ", expected_min=" << param.expected_min
58 << ", expected_max=" << param.expected_max << "}";
59 }
60
61 class ParameterizedStatisticsTest : public ::testing::TestWithParam<StatisticsTestParam> {
62 };
63
// Returns the JSON text of a list array made of 2000 empty lists followed by a
// single list holding the values 1 through 8 and one trailing null.
std::string GetManyEmptyLists() {
  constexpr int kEmptyListCount = 2000;

  std::string json(1, '[');
  int remaining = kEmptyListCount;
  while (remaining-- > 0) {
    json.append("[],");
  }
  json.append("[1,2,3,4,5,6,7,8,null]]");
  return json;
}
72
73 // PARQUET-2067: Tests that nulls from parent fields are included in null statistics.
TEST_P(ParameterizedStatisticsTest,NoNullCountWrittenForRepeatedFields)74 TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) {
75 std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
76 auto out_stream = std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
77 std::unique_ptr<FileWriter> writer;
78 ASSERT_OK(FileWriter::Open(*GetParam().table->schema(), default_memory_pool(),
79 out_stream, default_writer_properties(),
80 default_arrow_writer_properties(), &writer));
81 ASSERT_OK(writer->WriteTable(*GetParam().table, std::numeric_limits<int64_t>::max()));
82 ASSERT_OK(writer->Close());
83 ASSERT_OK(out_stream->Close());
84
85 auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(serialized_data);
86 auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
87 std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
88 std::shared_ptr<Statistics> stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
89 EXPECT_EQ(stats->null_count(), GetParam().expected_null_count);
90 EXPECT_EQ(stats->num_values(), GetParam().expected_value_count);
91 ASSERT_TRUE(stats->HasMinMax());
92 EXPECT_EQ(stats->EncodeMin(), GetParam().expected_min);
93 EXPECT_EQ(stats->EncodeMax(), GetParam().expected_max);
94 }
95
96 INSTANTIATE_TEST_SUITE_P(
97 StatsTests, ParameterizedStatisticsTest,
98 ::testing::Values(
99 StatisticsTestParam{
100 /*table=*/Table::Make(::arrow::schema({::arrow::field("a", ::arrow::utf8())}),
101 {ArrayFromJSON(::arrow::utf8(),
102 R"(["1", null, "3"])")}),
103 /*expected_null_count=*/1, /* empty list counts as null as well */
104 /*expected_value_count=*/2,
105 /*expected_min=*/"1",
106 /*expected_max=*/"3"},
107 StatisticsTestParam{
108 /*table=*/Table::Make(
109 ::arrow::schema({::arrow::field("a", list(::arrow::utf8()))}),
110 {ArrayFromJSON(list(::arrow::utf8()),
111 R"([["1"], [], null, ["1", null, "3"]])")}),
112 /*expected_null_count=*/3, /* empty list counts as null as well */
113 /*expected_value_count=*/3,
114 /*expected_min=*/"1",
115 /*expected_max=*/"3"},
116 StatisticsTestParam{
117 /*table=*/Table::Make(
118 ::arrow::schema({::arrow::field("a", ::arrow::int64())}),
119 {ArrayFromJSON(::arrow::int64(), R"([1, null, 3, null])")}),
120 /*expected_null_count=*/2, /* empty list counts as null as well */
121 /*expected_value_count=*/2,
122 /*expected_min=*/std::string("\x1\0\0\0\0\0\0\0", 8),
123 /*expected_max=*/std::string("\x3\0\0\0\0\0\0\0", 8)},
124 StatisticsTestParam{
125 /*table=*/Table::Make(
126 ::arrow::schema({::arrow::field("a", list(::arrow::utf8()))}),
127 {ArrayFromJSON(list(::arrow::utf8()), R"([["1"], [], ["1", "3"]])")}),
128 /*expected_null_count=*/1, /* empty list counts as null as well */
129 /*expected_value_count=*/3,
130 /*expected_min=*/"1",
131 /*expected_max=*/"3"},
132 StatisticsTestParam{
133 /*table=*/Table::Make(
134 ::arrow::schema({::arrow::field("a", list(::arrow::int64()))}),
135 {ArrayFromJSON(list(::arrow::int64()),
136 R"([[1], [], null, [1, null, 3]])")}),
137 /*expected_null_count=*/3, /* empty list counts as null as well */
138 /*expected_value_count=*/3,
139 /*expected_min=*/std::string("\x1\0\0\0\0\0\0\0", 8),
140 /*expected_max=*/std::string("\x3\0\0\0\0\0\0\0", 8)},
141 StatisticsTestParam{
142 /*table=*/Table::Make(
143 ::arrow::schema({::arrow::field("a", list(::arrow::int64()), false)}),
144 {ArrayFromJSON(list(::arrow::int64()), GetManyEmptyLists())}),
145 /*expected_null_count=*/2001, /* empty list counts as null as well */
146 /*expected_value_count=*/8,
147 /*expected_min=*/std::string("\x1\0\0\0\0\0\0\0", 8),
148 /*expected_max=*/std::string("\x8\0\0\0\0\0\0\0", 8)},
149 StatisticsTestParam{
150 /*table=*/Table::Make(
151 ::arrow::schema({::arrow::field("a", list(dictionary(::arrow::int32(),
152 ::arrow::utf8())))}),
153 {ArrayFromJSON(list(dictionary(::arrow::int32(), ::arrow::utf8())),
154 R"([null, ["z", null, "z"], null, null, null])")}),
155 /*expected_null_count=*/5,
156 /*expected_value_count=*/2,
157 /*expected_min=*/"z",
158 /*expected_max=*/"z"}));
159
160 } // namespace arrow
161 } // namespace parquet
162