1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "gtest/gtest.h"
19 
20 #include "arrow/table.h"
21 #include "arrow/testing/gtest_util.h"
22 
23 #include "parquet/api/reader.h"
24 #include "parquet/api/writer.h"
25 
26 #include "parquet/arrow/schema.h"
27 #include "parquet/arrow/writer.h"
28 #include "parquet/file_writer.h"
29 #include "parquet/test_util.h"
30 
31 using arrow::ArrayFromJSON;
32 using arrow::Buffer;
33 using arrow::default_memory_pool;
34 using arrow::ResizableBuffer;
35 using arrow::Table;
36 
37 using arrow::io::BufferReader;
38 
39 namespace parquet {
40 namespace arrow {
41 
// One parameterized test case: a table to write to Parquet together with the
// statistics the test expects to read back for the first column chunk of the
// first row group.
//
// Kept as a plain aggregate; the cases below brace-initialize it positionally,
// so member order matters.
struct StatisticsTestParam {
  // Table that the test writes out; its schema is also used to open the writer.
  std::shared_ptr<::arrow::Table> table;
  // Compared against Statistics::null_count().
  int expected_null_count;
  // This is the non-null count and not the num_values in the page headers.
  // Compared against Statistics::num_values().
  int expected_value_count;
  // Compared against Statistics::EncodeMin(), i.e. the encoded bytes, not a
  // printable rendering of the value.
  std::string expected_min;
  // Compared against Statistics::EncodeMax(); same representation as expected_min.
  std::string expected_max;
};
50 
51 // Define a custom print since the default Googletest print trips Valgrind
PrintTo(const StatisticsTestParam & param,std::ostream * os)52 void PrintTo(const StatisticsTestParam& param, std::ostream* os) {
53   (*os) << "StatisticsTestParam{"
54         << "table.schema=" << param.table->schema()->ToString()
55         << ", expected_null_count=" << param.expected_null_count
56         << ", expected_value_count=" << param.expected_value_count
57         << ", expected_min=" << param.expected_min
58         << ", expected_max=" << param.expected_max << "}";
59 }
60 
// Value-parameterized fixture: each instantiated test receives one
// StatisticsTestParam through GetParam(). The fixture itself adds no state and no
// set-up/tear-down logic.
class ParameterizedStatisticsTest : public ::testing::TestWithParam<StatisticsTestParam> {
};
63 
// Builds the JSON text of a list-of-lists array: 2000 empty lists followed by one
// final list holding the integers 1 through 8 and a single null.
//
// Returns the text "[[],[], ... ,[1,2,3,4,5,6,7,8,null]]" (no whitespace).
std::string GetManyEmptyLists() {
  constexpr int kNumEmptyLists = 2000;

  std::string json(1, '[');
  int remaining = kNumEmptyLists;
  while (remaining-- > 0) {
    json.append("[],");
  }
  // The only non-empty list; it also closes the outer array.
  json.append("[1,2,3,4,5,6,7,8,null]]");
  return json;
}
72 
73 // PARQUET-2067: Tests that nulls from parent fields are included in null statistics.
TEST_P(ParameterizedStatisticsTest,NoNullCountWrittenForRepeatedFields)74 TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) {
75   std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
76   auto out_stream = std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
77   std::unique_ptr<FileWriter> writer;
78   ASSERT_OK(FileWriter::Open(*GetParam().table->schema(), default_memory_pool(),
79                              out_stream, default_writer_properties(),
80                              default_arrow_writer_properties(), &writer));
81   ASSERT_OK(writer->WriteTable(*GetParam().table, std::numeric_limits<int64_t>::max()));
82   ASSERT_OK(writer->Close());
83   ASSERT_OK(out_stream->Close());
84 
85   auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(serialized_data);
86   auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
87   std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
88   std::shared_ptr<Statistics> stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
89   EXPECT_EQ(stats->null_count(), GetParam().expected_null_count);
90   EXPECT_EQ(stats->num_values(), GetParam().expected_value_count);
91   ASSERT_TRUE(stats->HasMinMax());
92   EXPECT_EQ(stats->EncodeMin(), GetParam().expected_min);
93   EXPECT_EQ(stats->EncodeMax(), GetParam().expected_max);
94 }
95 
// Cases for ParameterizedStatisticsTest. Each one is a single-column table "a"
// plus the statistics expected for that column. The comment above each
// expected_null_count breaks the number down by source, as implied by the input.
// For int64 data the expected min/max are spelled as 8 raw bytes because they are
// compared with EncodeMin()/EncodeMax().
INSTANTIATE_TEST_SUITE_P(
    StatsTests, ParameterizedStatisticsTest,
    ::testing::Values(
        // Flat utf8 column (no lists involved).
        StatisticsTestParam{
            /*table=*/Table::Make(::arrow::schema({::arrow::field("a", ::arrow::utf8())}),
                                  {ArrayFromJSON(::arrow::utf8(),
                                                 R"(["1", null, "3"])")}),
            // 1 null element.
            /*expected_null_count=*/1,
            /*expected_value_count=*/2,
            /*expected_min=*/"1",
            /*expected_max=*/"3"},
        // list<utf8> with an empty list, a null list and a null element.
        StatisticsTestParam{
            /*table=*/Table::Make(
                ::arrow::schema({::arrow::field("a", list(::arrow::utf8()))}),
                {ArrayFromJSON(list(::arrow::utf8()),
                               R"([["1"], [], null, ["1", null, "3"]])")}),
            // 1 empty list + 1 null list + 1 null element; the empty list counts
            // as a null as well.
            /*expected_null_count=*/3,
            /*expected_value_count=*/3,
            /*expected_min=*/"1",
            /*expected_max=*/"3"},
        // Flat int64 column (no lists involved).
        StatisticsTestParam{
            /*table=*/Table::Make(
                ::arrow::schema({::arrow::field("a", ::arrow::int64())}),
                {ArrayFromJSON(::arrow::int64(), R"([1, null, 3, null])")}),
            // 2 null elements.
            /*expected_null_count=*/2,
            /*expected_value_count=*/2,
            /*expected_min=*/std::string("\x1\0\0\0\0\0\0\0", 8),
            /*expected_max=*/std::string("\x3\0\0\0\0\0\0\0", 8)},
        // list<utf8> whose only "null" is an empty list.
        StatisticsTestParam{
            /*table=*/Table::Make(
                ::arrow::schema({::arrow::field("a", list(::arrow::utf8()))}),
                {ArrayFromJSON(list(::arrow::utf8()), R"([["1"], [], ["1", "3"]])")}),
            // 1 empty list; it counts as a null.
            /*expected_null_count=*/1,
            /*expected_value_count=*/3,
            /*expected_min=*/"1",
            /*expected_max=*/"3"},
        // list<int64> with an empty list, a null list and a null element.
        StatisticsTestParam{
            /*table=*/Table::Make(
                ::arrow::schema({::arrow::field("a", list(::arrow::int64()))}),
                {ArrayFromJSON(list(::arrow::int64()),
                               R"([[1], [], null, [1, null, 3]])")}),
            // 1 empty list + 1 null list + 1 null element.
            /*expected_null_count=*/3,
            /*expected_value_count=*/3,
            /*expected_min=*/std::string("\x1\0\0\0\0\0\0\0", 8),
            /*expected_max=*/std::string("\x3\0\0\0\0\0\0\0", 8)},
        // Field declared with nullable=false (third argument to ::arrow::field);
        // data is 2000 empty lists followed by [1,2,3,4,5,6,7,8,null].
        StatisticsTestParam{
            /*table=*/Table::Make(
                ::arrow::schema({::arrow::field("a", list(::arrow::int64()), false)}),
                {ArrayFromJSON(list(::arrow::int64()), GetManyEmptyLists())}),
            // 2000 empty lists + 1 null element.
            /*expected_null_count=*/2001,
            /*expected_value_count=*/8,
            /*expected_min=*/std::string("\x1\0\0\0\0\0\0\0", 8),
            /*expected_max=*/std::string("\x8\0\0\0\0\0\0\0", 8)},
        // list<dictionary<int32, utf8>>; mostly null lists.
        StatisticsTestParam{
            /*table=*/Table::Make(
                ::arrow::schema({::arrow::field("a", list(dictionary(::arrow::int32(),
                                                                     ::arrow::utf8())))}),
                {ArrayFromJSON(list(dictionary(::arrow::int32(), ::arrow::utf8())),
                               R"([null, ["z", null, "z"], null, null, null])")}),
            // 4 null lists + 1 null element.
            /*expected_null_count=*/5,
            /*expected_value_count=*/2,
            /*expected_min=*/"z",
            /*expected_max=*/"z"}));
159 
160 }  // namespace arrow
161 }  // namespace parquet
162