1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "benchmark/benchmark.h"
19
20 #include <array>
21 #include <iostream>
22 #include <random>
23
24 #include "parquet/arrow/reader.h"
25 #include "parquet/arrow/writer.h"
26 #include "parquet/column_reader.h"
27 #include "parquet/column_writer.h"
28 #include "parquet/file_reader.h"
29 #include "parquet/file_writer.h"
30 #include "parquet/platform.h"
31
32 #include "arrow/array.h"
33 #include "arrow/array/builder_primitive.h"
34 #include "arrow/io/memory.h"
35 #include "arrow/table.h"
36 #include "arrow/testing/gtest_util.h"
37 #include "arrow/testing/random.h"
38 #include "arrow/util/async_generator.h"
39 #include "arrow/util/bitmap_ops.h"
40 #include "arrow/util/logging.h"
41
42 using arrow::Array;
43 using arrow::ArrayVector;
44 using arrow::BooleanBuilder;
45 using arrow::FieldVector;
46 using arrow::NumericBuilder;
47
// Evaluate the expression `s` exactly once as an ::arrow::Status. If the status is
// not OK, print "Exiting: <status>" to stdout and terminate the process with
// EXIT_FAILURE. The do { ... } while (0) wrapper makes the macro usable as a single
// statement (e.g. inside an unbraced if/else).
#define EXIT_NOT_OK(s)                                        \
  do {                                                        \
    ::arrow::Status _s = (s);                                 \
    if (ARROW_PREDICT_FALSE(!_s.ok())) {                      \
      std::cout << "Exiting: " << _s.ToString() << std::endl; \
      exit(EXIT_FAILURE);                                     \
    }                                                         \
  } while (0)
56
57 namespace parquet {
58
59 using arrow::FileReader;
60 using arrow::WriteTable;
61 using schema::PrimitiveNode;
62
63 namespace benchmark {
64
// Default number of values per benchmarked column (10 * 2^20). The nested-column
// benchmarks in this file use a tenth of it.
// This should result in multiple pages for most primitive types.
constexpr int64_t BENCHMARK_SIZE = 10 * 1024 * 1024;
67
68 template <typename ParquetType>
69 struct benchmark_traits {};
70
71 template <>
72 struct benchmark_traits<Int32Type> {
73 using arrow_type = ::arrow::Int32Type;
74 };
75
76 template <>
77 struct benchmark_traits<Int64Type> {
78 using arrow_type = ::arrow::Int64Type;
79 };
80
81 template <>
82 struct benchmark_traits<DoubleType> {
83 using arrow_type = ::arrow::DoubleType;
84 };
85
86 template <>
87 struct benchmark_traits<BooleanType> {
88 using arrow_type = ::arrow::BooleanType;
89 };
90
91 template <typename ParquetType>
92 using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;
93
94 template <typename ParquetType>
MakeSchema(Repetition::type repetition)95 std::shared_ptr<ColumnDescriptor> MakeSchema(Repetition::type repetition) {
96 auto node = PrimitiveNode::Make("int64", repetition, ParquetType::type_num);
97 return std::make_shared<ColumnDescriptor>(node, repetition != Repetition::REQUIRED,
98 repetition == Repetition::REPEATED);
99 }
100
101 template <bool nullable, typename ParquetType>
SetBytesProcessed(::benchmark::State & state,int64_t num_values=BENCHMARK_SIZE)102 void SetBytesProcessed(::benchmark::State& state, int64_t num_values = BENCHMARK_SIZE) {
103 const int64_t items_processed = state.iterations() * num_values;
104 const int64_t bytes_processed = items_processed * sizeof(typename ParquetType::c_type);
105
106 state.SetItemsProcessed(bytes_processed);
107 state.SetBytesProcessed(bytes_processed);
108 }
109
// Sentinel "percentage". Passed to RandomVector it requests a strictly alternating
// 0, 1, 0, 1, ... pattern; passed as a null percentage for a non-nullable column it
// means "not applicable".
constexpr int64_t kAlternatingOrNa = -1;

// Generate `vector_size` values of type T.
//
// \param true_percentage  kAlternatingOrNa for the alternating pattern
//                         T(0), T(1), T(0), ... (sample_values is ignored), otherwise
//                         the percentage of elements drawn as sample_values[1]; the
//                         remaining elements are sample_values[0]. Must be in
//                         [0, 100] when not kAlternatingOrNa, because it is used as a
//                         Bernoulli probability.
// \param vector_size      number of elements to generate
// \param sample_values    the two candidate values
// \param seed             seed of the random engine (unused for the alternating
//                         pattern)
// \return the generated vector; identical arguments yield identical output within
//         one standard library implementation
template <typename T>
std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
                            const std::array<T, 2>& sample_values, int seed = 500) {
  std::vector<T> values(vector_size, {});
  if (true_percentage == kAlternatingOrNa) {
    // Count with the same width as vector_size: an `int` counter would overflow
    // (undefined behavior) for vectors longer than INT_MAX.
    int64_t n = 0;
    std::generate(values.begin(), values.end(),
                  [&n] { return static_cast<T>(n++ % 2); });
  } else {
    std::default_random_engine rng(seed);
    const double true_probability = static_cast<double>(true_percentage) / 100.0;
    std::bernoulli_distribution dist(true_probability);
    // A `true` draw selects index 1 of sample_values, `false` selects index 0.
    std::generate(values.begin(), values.end(), [&] { return sample_values[dist(rng)]; });
  }
  return values;
}
127
128 template <typename ParquetType>
TableFromVector(const std::vector<typename ParquetType::c_type> & vec,bool nullable,int64_t null_percentage=kAlternatingOrNa)129 std::shared_ptr<::arrow::Table> TableFromVector(
130 const std::vector<typename ParquetType::c_type>& vec, bool nullable,
131 int64_t null_percentage = kAlternatingOrNa) {
132 if (!nullable) {
133 ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa);
134 }
135 std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
136 NumericBuilder<ArrowType<ParquetType>> builder;
137 if (nullable) {
138 // Note true values select index 1 of sample_values
139 auto valid_bytes = RandomVector<uint8_t>(/*true_percentage=*/null_percentage,
140 vec.size(), /*sample_values=*/{1, 0});
141 EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), valid_bytes.data()));
142 } else {
143 EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), nullptr));
144 }
145 std::shared_ptr<::arrow::Array> array;
146 EXIT_NOT_OK(builder.Finish(&array));
147
148 auto field = ::arrow::field("column", type, nullable);
149 auto schema = ::arrow::schema({field});
150 return ::arrow::Table::Make(schema, {array});
151 }
152
153 template <>
TableFromVector(const std::vector<bool> & vec,bool nullable,int64_t null_percentage)154 std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
155 bool nullable,
156 int64_t null_percentage) {
157 BooleanBuilder builder;
158 if (nullable) {
159 auto valid_bytes = RandomVector<bool>(/*true_percentage=*/null_percentage, vec.size(),
160 {true, false});
161 EXIT_NOT_OK(builder.AppendValues(vec, valid_bytes));
162 } else {
163 EXIT_NOT_OK(builder.AppendValues(vec));
164 }
165 std::shared_ptr<::arrow::Array> array;
166 EXIT_NOT_OK(builder.Finish(&array));
167
168 auto field = ::arrow::field("column", ::arrow::boolean(), nullable);
169 auto schema = std::make_shared<::arrow::Schema>(
170 std::vector<std::shared_ptr<::arrow::Field>>({field}));
171 return ::arrow::Table::Make(schema, {array});
172 }
173
174 template <bool nullable, typename ParquetType>
BM_WriteColumn(::benchmark::State & state)175 static void BM_WriteColumn(::benchmark::State& state) {
176 using T = typename ParquetType::c_type;
177 std::vector<T> values(BENCHMARK_SIZE, static_cast<T>(128));
178 std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
179
180 while (state.KeepRunning()) {
181 auto output = CreateOutputStream();
182 EXIT_NOT_OK(
183 WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
184 }
185 SetBytesProcessed<nullable, ParquetType>(state);
186 }
187
// Register BM_WriteColumn for every supported Parquet type, first as a non-nullable
// column (first template argument `false`) and then as a nullable one (`true`).
BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int32Type);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, Int32Type);

BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int64Type);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, Int64Type);

BENCHMARK_TEMPLATE2(BM_WriteColumn, false, DoubleType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType);

BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);
199
// The pair of candidate values a generated column is filled from: {127, 128} for
// any T constructible from those literals.
template <typename T>
struct Examples {
  static constexpr std::array<T, 2> values() { return std::array<T, 2>{{127, 128}}; }
};

// Boolean columns choose between the only two possible values, false first.
template <>
struct Examples<bool> {
  static constexpr std::array<bool, 2> values() {
    return std::array<bool, 2>{{false, true}};
  }
};
209
BenchmarkReadTable(::benchmark::State & state,const::arrow::Table & table,int64_t num_values=-1,int64_t bytes_per_value=-1)210 static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
211 int64_t num_values = -1, int64_t bytes_per_value = -1) {
212 auto output = CreateOutputStream();
213 EXIT_NOT_OK(
214 WriteTable(table, ::arrow::default_memory_pool(), output, table.num_rows()));
215 PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
216
217 while (state.KeepRunning()) {
218 auto reader =
219 ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
220 std::unique_ptr<FileReader> arrow_reader;
221 EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
222 &arrow_reader));
223 std::shared_ptr<::arrow::Table> table;
224 EXIT_NOT_OK(arrow_reader->ReadTable(&table));
225 }
226
227 if (num_values == -1) {
228 num_values = table.num_rows();
229 }
230 state.SetItemsProcessed(num_values * state.iterations());
231 if (bytes_per_value != -1) {
232 state.SetBytesProcessed(num_values * state.iterations() * bytes_per_value);
233 }
234 }
235
BenchmarkReadArray(::benchmark::State & state,const std::shared_ptr<Array> & array,bool nullable,int64_t num_values=-1,int64_t bytes_per_value=-1)236 static void BenchmarkReadArray(::benchmark::State& state,
237 const std::shared_ptr<Array>& array, bool nullable,
238 int64_t num_values = -1, int64_t bytes_per_value = -1) {
239 auto schema = ::arrow::schema({field("s", array->type(), nullable)});
240 auto table = ::arrow::Table::Make(schema, {array}, array->length());
241
242 EXIT_NOT_OK(table->Validate());
243
244 BenchmarkReadTable(state, *table, num_values, bytes_per_value);
245 }
246
247 //
248 // Benchmark reading a primitive column
249 //
250
251 template <bool nullable, typename ParquetType>
BM_ReadColumn(::benchmark::State & state)252 static void BM_ReadColumn(::benchmark::State& state) {
253 using T = typename ParquetType::c_type;
254
255 auto values = RandomVector<T>(/*percentage=*/state.range(1), BENCHMARK_SIZE,
256 Examples<T>::values());
257
258 std::shared_ptr<::arrow::Table> table =
259 TableFromVector<ParquetType>(values, nullable, state.range(0));
260
261 BenchmarkReadTable(state, *table, table->num_rows(),
262 sizeof(typename ParquetType::c_type));
263 }
264
// Every BM_ReadColumn registration takes two arguments, in this order:
//   {null_percentage, first_value_percentage}
// They cover different data distributions.
// - null_percentage governs the distribution, and therefore the runs, of null values.
//   kAlternatingOrNa means "not applicable" for non-nullable columns and an
//   alternating valid/null pattern for nullable ones.
// - first_value_percentage governs the distribution of values (each value is selected
//   from 1 of 2 candidates), so when it is 0 or 100 RLE is triggered all the time.
//   With a value in the range (0, 100) there will be some percentage of RLE encoded
//   values and some percentage of literal encoded values (RLE is much less likely with
//   percentages close to 50).
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});

BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/10})
    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/5})
    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/0})
    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type)
    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
// Int64 gets the densest coverage of null/value distributions.
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type)
    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
    ->Args({/*null_percentage=*/5, /*first_value_percentage=*/5})
    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/5})
    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/10})
    ->Args({/*null_percentage=*/30, /*first_value_percentage=*/10})
    ->Args({/*null_percentage=*/35, /*first_value_percentage=*/10})
    ->Args({/*null_percentage=*/45, /*first_value_percentage=*/25})
    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/1})
    ->Args({/*null_percentage=*/75, /*first_value_percentage=*/1})
    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType)
    ->Args({kAlternatingOrNa, 0})
    ->Args({kAlternatingOrNa, 20});
// Less coverage because int64_t should be a pretty good representation for
// nullability and repeating values.
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType)
    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25});

// NOTE(review): the second non-nullable BooleanType case passes null_percentage=1
// instead of kAlternatingOrNa. The BooleanType specialization of TableFromVector
// ignores null_percentage when the column is not nullable, so this only affects the
// benchmark's name; confirm whether kAlternatingOrNa was intended.
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType)
    ->Args({kAlternatingOrNa, 0})
    ->Args({1, 20});
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
    ->Args({kAlternatingOrNa, 1})
    ->Args({5, 10});
321
322 //
323 // Benchmark reading a nested column
324 //
325
326 const std::vector<int64_t> kNestedNullPercents = {0, 1, 50, 99};
327
328 // XXX We can use ArgsProduct() starting from Benchmark 1.5.2
NestedReadArguments(::benchmark::internal::Benchmark * b)329 static void NestedReadArguments(::benchmark::internal::Benchmark* b) {
330 for (const auto null_percentage : kNestedNullPercents) {
331 b->Arg(null_percentage);
332 }
333 }
334
MakeStructArray(::arrow::random::RandomArrayGenerator * rng,const ArrayVector & children,double null_probability,bool propagate_validity=false)335 static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
336 const ArrayVector& children,
337 double null_probability,
338 bool propagate_validity = false) {
339 ARROW_CHECK_GT(children.size(), 0);
340 const int64_t length = children[0]->length();
341
342 std::shared_ptr<::arrow::Buffer> null_bitmap;
343 if (null_probability > 0.0) {
344 null_bitmap = rng->NullBitmap(length, null_probability);
345 if (propagate_validity) {
346 // HACK: the Parquet writer currently doesn't allow non-empty list
347 // entries where a parent node is null (for instance, a struct-of-list
348 // where the outer struct is marked null but the inner list value is
349 // non-empty).
350 for (const auto& child : children) {
351 null_bitmap = *::arrow::internal::BitmapOr(
352 ::arrow::default_memory_pool(), null_bitmap->data(), 0,
353 child->null_bitmap_data(), 0, length, 0);
354 }
355 }
356 }
357 FieldVector fields(children.size());
358 char field_name = 'a';
359 for (size_t i = 0; i < children.size(); ++i) {
360 fields[i] = field(std::string{field_name++}, children[i]->type(),
361 /*nullable=*/null_probability > 0.0);
362 }
363 return *::arrow::StructArray::Make(children, std::move(fields), null_bitmap);
364 }
365
366 // Make a (int32, int64) struct array
MakeStructArray(::arrow::random::RandomArrayGenerator * rng,int64_t size,double null_probability)367 static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
368 int64_t size, double null_probability) {
369 auto values1 = rng->Int32(size, -5, 5, null_probability);
370 auto values2 = rng->Int64(size, -12345678912345LL, 12345678912345LL, null_probability);
371 return MakeStructArray(rng, {values1, values2}, null_probability);
372 }
373
BM_ReadStructColumn(::benchmark::State & state)374 static void BM_ReadStructColumn(::benchmark::State& state) {
375 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
376 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
377 const bool nullable = (null_probability != 0.0);
378
379 ARROW_CHECK_GE(null_probability, 0.0);
380
381 const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
382
383 ::arrow::random::RandomArrayGenerator rng(42);
384 auto array = MakeStructArray(&rng, kNumValues, null_probability);
385
386 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
387 }
388
389 BENCHMARK(BM_ReadStructColumn)->Apply(NestedReadArguments);
390
BM_ReadStructOfStructColumn(::benchmark::State & state)391 static void BM_ReadStructOfStructColumn(::benchmark::State& state) {
392 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
393 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
394 const bool nullable = (null_probability != 0.0);
395
396 ARROW_CHECK_GE(null_probability, 0.0);
397
398 const int64_t kBytesPerValue = 2 * (sizeof(int32_t) + sizeof(int64_t));
399
400 ::arrow::random::RandomArrayGenerator rng(42);
401 auto values1 = MakeStructArray(&rng, kNumValues, null_probability);
402 auto values2 = MakeStructArray(&rng, kNumValues, null_probability);
403 auto array = MakeStructArray(&rng, {values1, values2}, null_probability);
404
405 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
406 }
407
408 BENCHMARK(BM_ReadStructOfStructColumn)->Apply(NestedReadArguments);
409
BM_ReadStructOfListColumn(::benchmark::State & state)410 static void BM_ReadStructOfListColumn(::benchmark::State& state) {
411 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
412 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
413 const bool nullable = (null_probability != 0.0);
414
415 ARROW_CHECK_GE(null_probability, 0.0);
416
417 ::arrow::random::RandomArrayGenerator rng(42);
418
419 const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
420
421 auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
422 auto values2 =
423 rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
424 auto list1 = rng.List(*values1, kNumValues / 10, null_probability);
425 auto list2 = rng.List(*values2, kNumValues / 10, null_probability);
426 auto array = MakeStructArray(&rng, {list1, list2}, null_probability,
427 /*propagate_validity =*/true);
428
429 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
430 }
431
432 BENCHMARK(BM_ReadStructOfListColumn)->Apply(NestedReadArguments);
433
BM_ReadListColumn(::benchmark::State & state)434 static void BM_ReadListColumn(::benchmark::State& state) {
435 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
436 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
437 const bool nullable = (null_probability != 0.0);
438
439 ARROW_CHECK_GE(null_probability, 0.0);
440
441 ::arrow::random::RandomArrayGenerator rng(42);
442
443 auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
444 const int64_t kBytesPerValue = sizeof(int64_t);
445
446 auto array = rng.List(*values, kNumValues / 10, null_probability);
447
448 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
449 }
450
451 BENCHMARK(BM_ReadListColumn)->Apply(NestedReadArguments);
452
BM_ReadListOfStructColumn(::benchmark::State & state)453 static void BM_ReadListOfStructColumn(::benchmark::State& state) {
454 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
455 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
456 const bool nullable = (null_probability != 0.0);
457
458 ARROW_CHECK_GE(null_probability, 0.0);
459
460 ::arrow::random::RandomArrayGenerator rng(42);
461
462 auto values = MakeStructArray(&rng, kNumValues, null_probability);
463 const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
464
465 auto array = rng.List(*values, kNumValues / 10, null_probability);
466
467 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
468 }
469
470 BENCHMARK(BM_ReadListOfStructColumn)->Apply(NestedReadArguments);
471
BM_ReadListOfListColumn(::benchmark::State & state)472 static void BM_ReadListOfListColumn(::benchmark::State& state) {
473 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
474 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
475 const bool nullable = (null_probability != 0.0);
476
477 ARROW_CHECK_GE(null_probability, 0.0);
478
479 ::arrow::random::RandomArrayGenerator rng(42);
480
481 auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
482 const int64_t kBytesPerValue = sizeof(int64_t);
483
484 auto inner = rng.List(*values, kNumValues / 10, null_probability);
485 auto array = rng.List(*inner, kNumValues / 100, null_probability);
486
487 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
488 }
489
490 BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments);
491
492 //
493 // Benchmark different ways of reading select row groups
494 //
495
BM_ReadIndividualRowGroups(::benchmark::State & state)496 static void BM_ReadIndividualRowGroups(::benchmark::State& state) {
497 std::vector<int64_t> values(BENCHMARK_SIZE, 128);
498 std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
499 auto output = CreateOutputStream();
500 // This writes 10 RowGroups
501 EXIT_NOT_OK(
502 WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE / 10));
503
504 PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
505
506 while (state.KeepRunning()) {
507 auto reader =
508 ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
509 std::unique_ptr<FileReader> arrow_reader;
510 EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
511 &arrow_reader));
512
513 std::vector<std::shared_ptr<::arrow::Table>> tables;
514 for (int i = 0; i < arrow_reader->num_row_groups(); i++) {
515 // Only read the even numbered RowGroups
516 if ((i % 2) == 0) {
517 std::shared_ptr<::arrow::Table> table;
518 EXIT_NOT_OK(arrow_reader->RowGroup(i)->ReadTable(&table));
519 tables.push_back(table);
520 }
521 }
522
523 std::shared_ptr<::arrow::Table> final_table;
524 PARQUET_ASSIGN_OR_THROW(final_table, ConcatenateTables(tables));
525 }
526 SetBytesProcessed<true, Int64Type>(state);
527 }
528
529 BENCHMARK(BM_ReadIndividualRowGroups);
530
BM_ReadMultipleRowGroups(::benchmark::State & state)531 static void BM_ReadMultipleRowGroups(::benchmark::State& state) {
532 std::vector<int64_t> values(BENCHMARK_SIZE, 128);
533 std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
534 auto output = CreateOutputStream();
535 // This writes 10 RowGroups
536 EXIT_NOT_OK(
537 WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE / 10));
538 PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
539 std::vector<int> rgs{0, 2, 4, 6, 8};
540
541 while (state.KeepRunning()) {
542 auto reader =
543 ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
544 std::unique_ptr<FileReader> arrow_reader;
545 EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
546 &arrow_reader));
547 std::shared_ptr<::arrow::Table> table;
548 EXIT_NOT_OK(arrow_reader->ReadRowGroups(rgs, &table));
549 }
550 SetBytesProcessed<true, Int64Type>(state);
551 }
552
553 BENCHMARK(BM_ReadMultipleRowGroups);
554
BM_ReadMultipleRowGroupsGenerator(::benchmark::State & state)555 static void BM_ReadMultipleRowGroupsGenerator(::benchmark::State& state) {
556 std::vector<int64_t> values(BENCHMARK_SIZE, 128);
557 std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
558 auto output = CreateOutputStream();
559 // This writes 10 RowGroups
560 EXIT_NOT_OK(
561 WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE / 10));
562 PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
563 std::vector<int> rgs{0, 2, 4, 6, 8};
564
565 while (state.KeepRunning()) {
566 auto reader =
567 ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
568 std::unique_ptr<FileReader> unique_reader;
569 EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
570 &unique_reader));
571 std::shared_ptr<FileReader> arrow_reader = std::move(unique_reader);
572 ASSIGN_OR_ABORT(auto generator,
573 arrow_reader->GetRecordBatchGenerator(arrow_reader, rgs, {0}));
574 auto fut = ::arrow::CollectAsyncGenerator(generator);
575 ASSIGN_OR_ABORT(auto batches, fut.result());
576 ASSIGN_OR_ABORT(auto actual, ::arrow::Table::FromRecordBatches(std::move(batches)));
577 }
578 SetBytesProcessed<true, Int64Type>(state);
579 }
580
581 BENCHMARK(BM_ReadMultipleRowGroupsGenerator);
582
583 } // namespace benchmark
584
585 } // namespace parquet
586