1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "benchmark/benchmark.h"
19 
20 #include <array>
21 #include <iostream>
22 #include <random>
23 
24 #include "parquet/arrow/reader.h"
25 #include "parquet/arrow/writer.h"
26 #include "parquet/column_reader.h"
27 #include "parquet/column_writer.h"
28 #include "parquet/file_reader.h"
29 #include "parquet/file_writer.h"
30 #include "parquet/platform.h"
31 
32 #include "arrow/array.h"
33 #include "arrow/array/builder_primitive.h"
34 #include "arrow/io/memory.h"
35 #include "arrow/table.h"
36 #include "arrow/testing/gtest_util.h"
37 #include "arrow/testing/random.h"
38 #include "arrow/util/async_generator.h"
39 #include "arrow/util/bitmap_ops.h"
40 #include "arrow/util/logging.h"
41 
42 using arrow::Array;
43 using arrow::ArrayVector;
44 using arrow::BooleanBuilder;
45 using arrow::FieldVector;
46 using arrow::NumericBuilder;
47 
// Evaluates `s` exactly once into an ::arrow::Status.  When that status is not
// OK, prints "Exiting: <status text>" to stdout and terminates the process
// with EXIT_FAILURE; otherwise does nothing.
#define EXIT_NOT_OK(s)                                                  \
  do {                                                                  \
    ::arrow::Status _exit_status = (s);                                 \
    if (ARROW_PREDICT_FALSE(!_exit_status.ok())) {                      \
      std::cout << "Exiting: " << _exit_status.ToString() << std::endl; \
      exit(EXIT_FAILURE);                                               \
    }                                                                   \
  } while (false)
56 
57 namespace parquet {
58 
59 using arrow::FileReader;
60 using arrow::WriteTable;
61 using schema::PrimitiveNode;
62 
63 namespace benchmark {
64 
// Number of values in each benchmarked column (10 * 2^20).  This should result
// in multiple pages for most primitive types.
constexpr int64_t BENCHMARK_SIZE = int64_t{10} * 1024 * 1024;
67 
68 template <typename ParquetType>
69 struct benchmark_traits {};
70 
71 template <>
72 struct benchmark_traits<Int32Type> {
73   using arrow_type = ::arrow::Int32Type;
74 };
75 
76 template <>
77 struct benchmark_traits<Int64Type> {
78   using arrow_type = ::arrow::Int64Type;
79 };
80 
81 template <>
82 struct benchmark_traits<DoubleType> {
83   using arrow_type = ::arrow::DoubleType;
84 };
85 
86 template <>
87 struct benchmark_traits<BooleanType> {
88   using arrow_type = ::arrow::BooleanType;
89 };
90 
91 template <typename ParquetType>
92 using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;
93 
94 template <typename ParquetType>
MakeSchema(Repetition::type repetition)95 std::shared_ptr<ColumnDescriptor> MakeSchema(Repetition::type repetition) {
96   auto node = PrimitiveNode::Make("int64", repetition, ParquetType::type_num);
97   return std::make_shared<ColumnDescriptor>(node, repetition != Repetition::REQUIRED,
98                                             repetition == Repetition::REPEATED);
99 }
100 
101 template <bool nullable, typename ParquetType>
SetBytesProcessed(::benchmark::State & state,int64_t num_values=BENCHMARK_SIZE)102 void SetBytesProcessed(::benchmark::State& state, int64_t num_values = BENCHMARK_SIZE) {
103   const int64_t items_processed = state.iterations() * num_values;
104   const int64_t bytes_processed = items_processed * sizeof(typename ParquetType::c_type);
105 
106   state.SetItemsProcessed(bytes_processed);
107   state.SetBytesProcessed(bytes_processed);
108 }
109 
// Sentinel percentage.  Passed to RandomVector it requests a deterministic
// alternating pattern instead of a random draw; it is also the value expected
// for the null percentage of non-nullable columns.
constexpr int64_t kAlternatingOrNa = -1;

// Returns a vector holding `vector_size` elements.
//
// - When `true_percentage` is kAlternatingOrNa the elements are 0, 1, 0, 1, ...
//   converted to T; `sample_values` and `seed` are not consulted.
// - Otherwise each element is drawn independently: with probability
//   `true_percentage` / 100 it is `sample_values[1]`, else `sample_values[0]`.
//   The draw uses std::default_random_engine seeded with `seed`, so repeated
//   calls with equal arguments yield equal vectors.
template <typename T>
std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
                            const std::array<T, 2>& sample_values, int seed = 500) {
  std::vector<T> values(vector_size);

  if (true_percentage != kAlternatingOrNa) {
    std::default_random_engine engine(seed);
    std::bernoulli_distribution pick_second(static_cast<double>(true_percentage) /
                                            100.0);
    // `auto&&` also binds to the proxy references of std::vector<bool>.
    for (auto&& slot : values) {
      slot = sample_values[pick_second(engine)];
    }
    return values;
  }

  int counter = 0;
  for (auto&& slot : values) {
    slot = counter++ % 2;
  }
  return values;
}
127 
128 template <typename ParquetType>
TableFromVector(const std::vector<typename ParquetType::c_type> & vec,bool nullable,int64_t null_percentage=kAlternatingOrNa)129 std::shared_ptr<::arrow::Table> TableFromVector(
130     const std::vector<typename ParquetType::c_type>& vec, bool nullable,
131     int64_t null_percentage = kAlternatingOrNa) {
132   if (!nullable) {
133     ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa);
134   }
135   std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
136   NumericBuilder<ArrowType<ParquetType>> builder;
137   if (nullable) {
138     // Note true values select index 1 of sample_values
139     auto valid_bytes = RandomVector<uint8_t>(/*true_percentage=*/null_percentage,
140                                              vec.size(), /*sample_values=*/{1, 0});
141     EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), valid_bytes.data()));
142   } else {
143     EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), nullptr));
144   }
145   std::shared_ptr<::arrow::Array> array;
146   EXIT_NOT_OK(builder.Finish(&array));
147 
148   auto field = ::arrow::field("column", type, nullable);
149   auto schema = ::arrow::schema({field});
150   return ::arrow::Table::Make(schema, {array});
151 }
152 
153 template <>
TableFromVector(const std::vector<bool> & vec,bool nullable,int64_t null_percentage)154 std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
155                                                              bool nullable,
156                                                              int64_t null_percentage) {
157   BooleanBuilder builder;
158   if (nullable) {
159     auto valid_bytes = RandomVector<bool>(/*true_percentage=*/null_percentage, vec.size(),
160                                           {true, false});
161     EXIT_NOT_OK(builder.AppendValues(vec, valid_bytes));
162   } else {
163     EXIT_NOT_OK(builder.AppendValues(vec));
164   }
165   std::shared_ptr<::arrow::Array> array;
166   EXIT_NOT_OK(builder.Finish(&array));
167 
168   auto field = ::arrow::field("column", ::arrow::boolean(), nullable);
169   auto schema = std::make_shared<::arrow::Schema>(
170       std::vector<std::shared_ptr<::arrow::Field>>({field}));
171   return ::arrow::Table::Make(schema, {array});
172 }
173 
174 template <bool nullable, typename ParquetType>
BM_WriteColumn(::benchmark::State & state)175 static void BM_WriteColumn(::benchmark::State& state) {
176   using T = typename ParquetType::c_type;
177   std::vector<T> values(BENCHMARK_SIZE, static_cast<T>(128));
178   std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
179 
180   while (state.KeepRunning()) {
181     auto output = CreateOutputStream();
182     EXIT_NOT_OK(
183         WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
184   }
185   SetBytesProcessed<nullable, ParquetType>(state);
186 }
187 
// Register BM_WriteColumn for each benchmarked physical type, once as a
// non-nullable column (false) and once as a nullable column (true).
BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int32Type);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, Int32Type);

BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int64Type);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, Int64Type);

BENCHMARK_TEMPLATE2(BM_WriteColumn, false, DoubleType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType);

BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);
199 
// The pair of sample values handed to RandomVector when generating column data
// of C type T: {127, 128} for every type except bool.
template <typename T>
struct Examples {
  static constexpr std::array<T, 2> values() {
    return std::array<T, 2>{{127, 128}};
  }
};

// bool only has two values, so those are the samples.
template <>
struct Examples<bool> {
  static constexpr std::array<bool, 2> values() {
    return std::array<bool, 2>{{false, true}};
  }
};
209 
BenchmarkReadTable(::benchmark::State & state,const::arrow::Table & table,int64_t num_values=-1,int64_t bytes_per_value=-1)210 static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
211                                int64_t num_values = -1, int64_t bytes_per_value = -1) {
212   auto output = CreateOutputStream();
213   EXIT_NOT_OK(
214       WriteTable(table, ::arrow::default_memory_pool(), output, table.num_rows()));
215   PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
216 
217   while (state.KeepRunning()) {
218     auto reader =
219         ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
220     std::unique_ptr<FileReader> arrow_reader;
221     EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
222                                  &arrow_reader));
223     std::shared_ptr<::arrow::Table> table;
224     EXIT_NOT_OK(arrow_reader->ReadTable(&table));
225   }
226 
227   if (num_values == -1) {
228     num_values = table.num_rows();
229   }
230   state.SetItemsProcessed(num_values * state.iterations());
231   if (bytes_per_value != -1) {
232     state.SetBytesProcessed(num_values * state.iterations() * bytes_per_value);
233   }
234 }
235 
BenchmarkReadArray(::benchmark::State & state,const std::shared_ptr<Array> & array,bool nullable,int64_t num_values=-1,int64_t bytes_per_value=-1)236 static void BenchmarkReadArray(::benchmark::State& state,
237                                const std::shared_ptr<Array>& array, bool nullable,
238                                int64_t num_values = -1, int64_t bytes_per_value = -1) {
239   auto schema = ::arrow::schema({field("s", array->type(), nullable)});
240   auto table = ::arrow::Table::Make(schema, {array}, array->length());
241 
242   EXIT_NOT_OK(table->Validate());
243 
244   BenchmarkReadTable(state, *table, num_values, bytes_per_value);
245 }
246 
247 //
248 // Benchmark reading a primitive column
249 //
250 
251 template <bool nullable, typename ParquetType>
BM_ReadColumn(::benchmark::State & state)252 static void BM_ReadColumn(::benchmark::State& state) {
253   using T = typename ParquetType::c_type;
254 
255   auto values = RandomVector<T>(/*percentage=*/state.range(1), BENCHMARK_SIZE,
256                                 Examples<T>::values());
257 
258   std::shared_ptr<::arrow::Table> table =
259       TableFromVector<ParquetType>(values, nullable, state.range(0));
260 
261   BenchmarkReadTable(state, *table, table->num_rows(),
262                      sizeof(typename ParquetType::c_type));
263 }
264 
// There are two parameters here that cover different data distributions.
// null_percentage governs the distribution, and therefore the runs, of null values.
// first_value_percentage governs the distribution of values (each value is one of 2
// candidates), so when it is 0 or 100, RLE is triggered all the time.  When it is in
// the range (0, 100), there will be some percentage of RLE-encoded values and some
// percentage of literal-encoded values (RLE is much less likely with percentages
// close to 50).
// Each Args() call is {null_percentage, first_value_percentage}; BM_ReadColumn
// reads them back as state.range(0) and state.range(1).  Non-nullable columns
// pass kAlternatingOrNa as the null percentage.
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});

BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/10})
    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/5})
    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/0})
    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type)
    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
// Int64 carries the densest sweep of null / value percentages.
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type)
    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
    ->Args({/*null_percentage=*/5, /*first_value_percentage=*/5})
    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/5})
    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/10})
    ->Args({/*null_percentage=*/30, /*first_value_percentage=*/10})
    ->Args({/*null_percentage=*/35, /*first_value_percentage=*/10})
    ->Args({/*null_percentage=*/45, /*first_value_percentage=*/25})
    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/1})
    ->Args({/*null_percentage=*/75, /*first_value_percentage=*/1})
    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType)
    ->Args({kAlternatingOrNa, 0})
    ->Args({kAlternatingOrNa, 20});
// Less coverage because int64_t should be a pretty good representation for
// nullability and repeating values.
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType)
    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25});

// NOTE(review): the second non-nullable Boolean case passes a null percentage
// of 1.  The BooleanType specialization of TableFromVector ignores the null
// percentage for non-nullable columns, so the value only shows up in the
// benchmark name -- confirm whether kAlternatingOrNa was intended.
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType)
    ->Args({kAlternatingOrNa, 0})
    ->Args({1, 20});
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
    ->Args({kAlternatingOrNa, 1})
    ->Args({5, 10});
321 
322 //
323 // Benchmark reading a nested column
324 //
325 
326 const std::vector<int64_t> kNestedNullPercents = {0, 1, 50, 99};
327 
328 // XXX We can use ArgsProduct() starting from Benchmark 1.5.2
NestedReadArguments(::benchmark::internal::Benchmark * b)329 static void NestedReadArguments(::benchmark::internal::Benchmark* b) {
330   for (const auto null_percentage : kNestedNullPercents) {
331     b->Arg(null_percentage);
332   }
333 }
334 
MakeStructArray(::arrow::random::RandomArrayGenerator * rng,const ArrayVector & children,double null_probability,bool propagate_validity=false)335 static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
336                                               const ArrayVector& children,
337                                               double null_probability,
338                                               bool propagate_validity = false) {
339   ARROW_CHECK_GT(children.size(), 0);
340   const int64_t length = children[0]->length();
341 
342   std::shared_ptr<::arrow::Buffer> null_bitmap;
343   if (null_probability > 0.0) {
344     null_bitmap = rng->NullBitmap(length, null_probability);
345     if (propagate_validity) {
346       // HACK: the Parquet writer currently doesn't allow non-empty list
347       // entries where a parent node is null (for instance, a struct-of-list
348       // where the outer struct is marked null but the inner list value is
349       // non-empty).
350       for (const auto& child : children) {
351         null_bitmap = *::arrow::internal::BitmapOr(
352             ::arrow::default_memory_pool(), null_bitmap->data(), 0,
353             child->null_bitmap_data(), 0, length, 0);
354       }
355     }
356   }
357   FieldVector fields(children.size());
358   char field_name = 'a';
359   for (size_t i = 0; i < children.size(); ++i) {
360     fields[i] = field(std::string{field_name++}, children[i]->type(),
361                       /*nullable=*/null_probability > 0.0);
362   }
363   return *::arrow::StructArray::Make(children, std::move(fields), null_bitmap);
364 }
365 
366 // Make a (int32, int64) struct array
MakeStructArray(::arrow::random::RandomArrayGenerator * rng,int64_t size,double null_probability)367 static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
368                                               int64_t size, double null_probability) {
369   auto values1 = rng->Int32(size, -5, 5, null_probability);
370   auto values2 = rng->Int64(size, -12345678912345LL, 12345678912345LL, null_probability);
371   return MakeStructArray(rng, {values1, values2}, null_probability);
372 }
373 
BM_ReadStructColumn(::benchmark::State & state)374 static void BM_ReadStructColumn(::benchmark::State& state) {
375   constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
376   const double null_probability = static_cast<double>(state.range(0)) / 100.0;
377   const bool nullable = (null_probability != 0.0);
378 
379   ARROW_CHECK_GE(null_probability, 0.0);
380 
381   const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
382 
383   ::arrow::random::RandomArrayGenerator rng(42);
384   auto array = MakeStructArray(&rng, kNumValues, null_probability);
385 
386   BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
387 }
388 
389 BENCHMARK(BM_ReadStructColumn)->Apply(NestedReadArguments);
390 
BM_ReadStructOfStructColumn(::benchmark::State & state)391 static void BM_ReadStructOfStructColumn(::benchmark::State& state) {
392   constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
393   const double null_probability = static_cast<double>(state.range(0)) / 100.0;
394   const bool nullable = (null_probability != 0.0);
395 
396   ARROW_CHECK_GE(null_probability, 0.0);
397 
398   const int64_t kBytesPerValue = 2 * (sizeof(int32_t) + sizeof(int64_t));
399 
400   ::arrow::random::RandomArrayGenerator rng(42);
401   auto values1 = MakeStructArray(&rng, kNumValues, null_probability);
402   auto values2 = MakeStructArray(&rng, kNumValues, null_probability);
403   auto array = MakeStructArray(&rng, {values1, values2}, null_probability);
404 
405   BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
406 }
407 
408 BENCHMARK(BM_ReadStructOfStructColumn)->Apply(NestedReadArguments);
409 
BM_ReadStructOfListColumn(::benchmark::State & state)410 static void BM_ReadStructOfListColumn(::benchmark::State& state) {
411   constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
412   const double null_probability = static_cast<double>(state.range(0)) / 100.0;
413   const bool nullable = (null_probability != 0.0);
414 
415   ARROW_CHECK_GE(null_probability, 0.0);
416 
417   ::arrow::random::RandomArrayGenerator rng(42);
418 
419   const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
420 
421   auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
422   auto values2 =
423       rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
424   auto list1 = rng.List(*values1, kNumValues / 10, null_probability);
425   auto list2 = rng.List(*values2, kNumValues / 10, null_probability);
426   auto array = MakeStructArray(&rng, {list1, list2}, null_probability,
427                                /*propagate_validity =*/true);
428 
429   BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
430 }
431 
432 BENCHMARK(BM_ReadStructOfListColumn)->Apply(NestedReadArguments);
433 
BM_ReadListColumn(::benchmark::State & state)434 static void BM_ReadListColumn(::benchmark::State& state) {
435   constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
436   const double null_probability = static_cast<double>(state.range(0)) / 100.0;
437   const bool nullable = (null_probability != 0.0);
438 
439   ARROW_CHECK_GE(null_probability, 0.0);
440 
441   ::arrow::random::RandomArrayGenerator rng(42);
442 
443   auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
444   const int64_t kBytesPerValue = sizeof(int64_t);
445 
446   auto array = rng.List(*values, kNumValues / 10, null_probability);
447 
448   BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
449 }
450 
451 BENCHMARK(BM_ReadListColumn)->Apply(NestedReadArguments);
452 
BM_ReadListOfStructColumn(::benchmark::State & state)453 static void BM_ReadListOfStructColumn(::benchmark::State& state) {
454   constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
455   const double null_probability = static_cast<double>(state.range(0)) / 100.0;
456   const bool nullable = (null_probability != 0.0);
457 
458   ARROW_CHECK_GE(null_probability, 0.0);
459 
460   ::arrow::random::RandomArrayGenerator rng(42);
461 
462   auto values = MakeStructArray(&rng, kNumValues, null_probability);
463   const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
464 
465   auto array = rng.List(*values, kNumValues / 10, null_probability);
466 
467   BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
468 }
469 
470 BENCHMARK(BM_ReadListOfStructColumn)->Apply(NestedReadArguments);
471 
BM_ReadListOfListColumn(::benchmark::State & state)472 static void BM_ReadListOfListColumn(::benchmark::State& state) {
473   constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
474   const double null_probability = static_cast<double>(state.range(0)) / 100.0;
475   const bool nullable = (null_probability != 0.0);
476 
477   ARROW_CHECK_GE(null_probability, 0.0);
478 
479   ::arrow::random::RandomArrayGenerator rng(42);
480 
481   auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
482   const int64_t kBytesPerValue = sizeof(int64_t);
483 
484   auto inner = rng.List(*values, kNumValues / 10, null_probability);
485   auto array = rng.List(*inner, kNumValues / 100, null_probability);
486 
487   BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
488 }
489 
490 BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments);
491 
492 //
493 // Benchmark different ways of reading select row groups
494 //
495 
BM_ReadIndividualRowGroups(::benchmark::State & state)496 static void BM_ReadIndividualRowGroups(::benchmark::State& state) {
497   std::vector<int64_t> values(BENCHMARK_SIZE, 128);
498   std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
499   auto output = CreateOutputStream();
500   // This writes 10 RowGroups
501   EXIT_NOT_OK(
502       WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE / 10));
503 
504   PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
505 
506   while (state.KeepRunning()) {
507     auto reader =
508         ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
509     std::unique_ptr<FileReader> arrow_reader;
510     EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
511                                  &arrow_reader));
512 
513     std::vector<std::shared_ptr<::arrow::Table>> tables;
514     for (int i = 0; i < arrow_reader->num_row_groups(); i++) {
515       // Only read the even numbered RowGroups
516       if ((i % 2) == 0) {
517         std::shared_ptr<::arrow::Table> table;
518         EXIT_NOT_OK(arrow_reader->RowGroup(i)->ReadTable(&table));
519         tables.push_back(table);
520       }
521     }
522 
523     std::shared_ptr<::arrow::Table> final_table;
524     PARQUET_ASSIGN_OR_THROW(final_table, ConcatenateTables(tables));
525   }
526   SetBytesProcessed<true, Int64Type>(state);
527 }
528 
529 BENCHMARK(BM_ReadIndividualRowGroups);
530 
BM_ReadMultipleRowGroups(::benchmark::State & state)531 static void BM_ReadMultipleRowGroups(::benchmark::State& state) {
532   std::vector<int64_t> values(BENCHMARK_SIZE, 128);
533   std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
534   auto output = CreateOutputStream();
535   // This writes 10 RowGroups
536   EXIT_NOT_OK(
537       WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE / 10));
538   PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
539   std::vector<int> rgs{0, 2, 4, 6, 8};
540 
541   while (state.KeepRunning()) {
542     auto reader =
543         ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
544     std::unique_ptr<FileReader> arrow_reader;
545     EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
546                                  &arrow_reader));
547     std::shared_ptr<::arrow::Table> table;
548     EXIT_NOT_OK(arrow_reader->ReadRowGroups(rgs, &table));
549   }
550   SetBytesProcessed<true, Int64Type>(state);
551 }
552 
553 BENCHMARK(BM_ReadMultipleRowGroups);
554 
BM_ReadMultipleRowGroupsGenerator(::benchmark::State & state)555 static void BM_ReadMultipleRowGroupsGenerator(::benchmark::State& state) {
556   std::vector<int64_t> values(BENCHMARK_SIZE, 128);
557   std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
558   auto output = CreateOutputStream();
559   // This writes 10 RowGroups
560   EXIT_NOT_OK(
561       WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE / 10));
562   PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
563   std::vector<int> rgs{0, 2, 4, 6, 8};
564 
565   while (state.KeepRunning()) {
566     auto reader =
567         ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
568     std::unique_ptr<FileReader> unique_reader;
569     EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
570                                  &unique_reader));
571     std::shared_ptr<FileReader> arrow_reader = std::move(unique_reader);
572     ASSIGN_OR_ABORT(auto generator,
573                     arrow_reader->GetRecordBatchGenerator(arrow_reader, rgs, {0}));
574     auto fut = ::arrow::CollectAsyncGenerator(generator);
575     ASSIGN_OR_ABORT(auto batches, fut.result());
576     ASSIGN_OR_ABORT(auto actual, ::arrow::Table::FromRecordBatches(std::move(batches)));
577   }
578   SetBytesProcessed<true, Int64Type>(state);
579 }
580 
581 BENCHMARK(BM_ReadMultipleRowGroupsGenerator);
582 
583 }  // namespace benchmark
584 
585 }  // namespace parquet
586