1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "benchmark/benchmark.h"
19
20 #include <array>
21 #include <iostream>
22 #include <random>
23
24 #include "parquet/arrow/reader.h"
25 #include "parquet/arrow/writer.h"
26 #include "parquet/column_reader.h"
27 #include "parquet/column_writer.h"
28 #include "parquet/file_reader.h"
29 #include "parquet/file_writer.h"
30 #include "parquet/platform.h"
31
32 #include "arrow/array.h"
33 #include "arrow/array/builder_primitive.h"
34 #include "arrow/io/memory.h"
35 #include "arrow/table.h"
36 #include "arrow/testing/random.h"
37 #include "arrow/util/bitmap_ops.h"
38 #include "arrow/util/logging.h"
39
40 using arrow::Array;
41 using arrow::ArrayVector;
42 using arrow::BooleanBuilder;
43 using arrow::FieldVector;
44 using arrow::NumericBuilder;
45
// Evaluates the Status-returning expression `s` exactly once. If the resulting
// status is not OK, prints "Exiting: <status text>" to stdout and terminates
// the process with EXIT_FAILURE. The do/while(0) wrapper makes the macro usable
// as a single statement (e.g. inside an unbraced if/else).
#define EXIT_NOT_OK(s)                                                \
  do {                                                                \
    const ::arrow::Status exit_not_ok_status_ = (s);                  \
    if (ARROW_PREDICT_FALSE(!exit_not_ok_status_.ok())) {             \
      std::cout << "Exiting: " << exit_not_ok_status_.ToString()      \
                << std::endl;                                         \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (0)
54
55 namespace parquet {
56
57 using arrow::FileReader;
58 using arrow::WriteTable;
59 using schema::PrimitiveNode;
60
61 namespace benchmark {
62
// Default number of values per benchmarked column (10 Mi). The size is meant
// to be large enough that most primitive types end up spanning multiple pages.
constexpr int64_t BENCHMARK_SIZE = int64_t{10} * 1024 * 1024;
65
66 template <typename ParquetType>
67 struct benchmark_traits {};
68
69 template <>
70 struct benchmark_traits<Int32Type> {
71 using arrow_type = ::arrow::Int32Type;
72 };
73
74 template <>
75 struct benchmark_traits<Int64Type> {
76 using arrow_type = ::arrow::Int64Type;
77 };
78
79 template <>
80 struct benchmark_traits<DoubleType> {
81 using arrow_type = ::arrow::DoubleType;
82 };
83
84 template <>
85 struct benchmark_traits<BooleanType> {
86 using arrow_type = ::arrow::BooleanType;
87 };
88
89 template <typename ParquetType>
90 using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;
91
92 template <typename ParquetType>
MakeSchema(Repetition::type repetition)93 std::shared_ptr<ColumnDescriptor> MakeSchema(Repetition::type repetition) {
94 auto node = PrimitiveNode::Make("int64", repetition, ParquetType::type_num);
95 return std::make_shared<ColumnDescriptor>(node, repetition != Repetition::REQUIRED,
96 repetition == Repetition::REPEATED);
97 }
98
99 template <bool nullable, typename ParquetType>
SetBytesProcessed(::benchmark::State & state,int64_t num_values=BENCHMARK_SIZE)100 void SetBytesProcessed(::benchmark::State& state, int64_t num_values = BENCHMARK_SIZE) {
101 const int64_t items_processed = state.iterations() * num_values;
102 const int64_t bytes_processed = items_processed * sizeof(typename ParquetType::c_type);
103
104 state.SetItemsProcessed(bytes_processed);
105 state.SetBytesProcessed(bytes_processed);
106 }
107
// Sentinel percentage. Passed to RandomVector() it selects the deterministic
// alternating pattern instead of random sampling.
constexpr int64_t kAlternatingOrNa = -1;

// Produces `vector_size` elements of type T.
//
// - If `true_percentage` equals kAlternatingOrNa, the result is the sequence
//   0, 1, 0, 1, ... converted to T; `sample_values` and `seed` are not used.
// - Otherwise each element is drawn independently: with probability
//   `true_percentage` / 100 it is `sample_values[1]`, else `sample_values[0]`.
//   The engine is seeded with `seed`, so equal arguments give equal output.
template <typename T>
std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
                            const std::array<T, 2>& sample_values, int seed = 500) {
  std::vector<T> result(vector_size, T{});

  if (true_percentage != kAlternatingOrNa) {
    std::default_random_engine engine(seed);
    std::bernoulli_distribution pick_second(static_cast<double>(true_percentage) /
                                            100.0);
    for (auto it = result.begin(); it != result.end(); ++it) {
      // A `true` draw indexes element 1, a `false` draw element 0.
      *it = sample_values[pick_second(engine)];
    }
    return result;
  }

  int counter = 0;
  for (auto it = result.begin(); it != result.end(); ++it) {
    *it = static_cast<T>(counter++ % 2);
  }
  return result;
}
125
126 template <typename ParquetType>
TableFromVector(const std::vector<typename ParquetType::c_type> & vec,bool nullable,int64_t null_percentage=kAlternatingOrNa)127 std::shared_ptr<::arrow::Table> TableFromVector(
128 const std::vector<typename ParquetType::c_type>& vec, bool nullable,
129 int64_t null_percentage = kAlternatingOrNa) {
130 if (!nullable) {
131 ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa);
132 }
133 std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
134 NumericBuilder<ArrowType<ParquetType>> builder;
135 if (nullable) {
136 // Note true values select index 1 of sample_values
137 auto valid_bytes = RandomVector<uint8_t>(/*true_percentage=*/null_percentage,
138 vec.size(), /*sample_values=*/{1, 0});
139 EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), valid_bytes.data()));
140 } else {
141 EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), nullptr));
142 }
143 std::shared_ptr<::arrow::Array> array;
144 EXIT_NOT_OK(builder.Finish(&array));
145
146 auto field = ::arrow::field("column", type, nullable);
147 auto schema = ::arrow::schema({field});
148 return ::arrow::Table::Make(schema, {array});
149 }
150
151 template <>
TableFromVector(const std::vector<bool> & vec,bool nullable,int64_t null_percentage)152 std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
153 bool nullable,
154 int64_t null_percentage) {
155 BooleanBuilder builder;
156 if (nullable) {
157 auto valid_bytes = RandomVector<bool>(/*true_percentage=*/null_percentage, vec.size(),
158 {true, false});
159 EXIT_NOT_OK(builder.AppendValues(vec, valid_bytes));
160 } else {
161 EXIT_NOT_OK(builder.AppendValues(vec));
162 }
163 std::shared_ptr<::arrow::Array> array;
164 EXIT_NOT_OK(builder.Finish(&array));
165
166 auto field = ::arrow::field("column", ::arrow::boolean(), nullable);
167 auto schema = std::make_shared<::arrow::Schema>(
168 std::vector<std::shared_ptr<::arrow::Field>>({field}));
169 return ::arrow::Table::Make(schema, {array});
170 }
171
172 template <bool nullable, typename ParquetType>
BM_WriteColumn(::benchmark::State & state)173 static void BM_WriteColumn(::benchmark::State& state) {
174 using T = typename ParquetType::c_type;
175 std::vector<T> values(BENCHMARK_SIZE, static_cast<T>(128));
176 std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
177
178 while (state.KeepRunning()) {
179 auto output = CreateOutputStream();
180 EXIT_NOT_OK(
181 WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
182 }
183 SetBytesProcessed<nullable, ParquetType>(state);
184 }
185
186 BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int32Type);
187 BENCHMARK_TEMPLATE2(BM_WriteColumn, true, Int32Type);
188
189 BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int64Type);
190 BENCHMARK_TEMPLATE2(BM_WriteColumn, true, Int64Type);
191
192 BENCHMARK_TEMPLATE2(BM_WriteColumn, false, DoubleType);
193 BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType);
194
195 BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType);
196 BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);
197
// Supplies the pair of sample values a benchmark column is drawn from.
// Generic types use {127, 128}.
template <typename T>
struct Examples {
  static constexpr std::array<T, 2> values() {
    return std::array<T, 2>{{127, 128}};
  }
};

// Booleans use {false, true}.
template <>
struct Examples<bool> {
  static constexpr std::array<bool, 2> values() {
    return std::array<bool, 2>{{false, true}};
  }
};
207
BenchmarkReadTable(::benchmark::State & state,const::arrow::Table & table,int64_t num_values=-1,int64_t bytes_per_value=-1)208 static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
209 int64_t num_values = -1, int64_t bytes_per_value = -1) {
210 auto output = CreateOutputStream();
211 EXIT_NOT_OK(
212 WriteTable(table, ::arrow::default_memory_pool(), output, table.num_rows()));
213 PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
214
215 while (state.KeepRunning()) {
216 auto reader =
217 ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
218 std::unique_ptr<FileReader> arrow_reader;
219 EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
220 &arrow_reader));
221 std::shared_ptr<::arrow::Table> table;
222 EXIT_NOT_OK(arrow_reader->ReadTable(&table));
223 }
224
225 if (num_values == -1) {
226 num_values = table.num_rows();
227 }
228 state.SetItemsProcessed(num_values * state.iterations());
229 if (bytes_per_value != -1) {
230 state.SetBytesProcessed(num_values * state.iterations() * bytes_per_value);
231 }
232 }
233
BenchmarkReadArray(::benchmark::State & state,const std::shared_ptr<Array> & array,bool nullable,int64_t num_values=-1,int64_t bytes_per_value=-1)234 static void BenchmarkReadArray(::benchmark::State& state,
235 const std::shared_ptr<Array>& array, bool nullable,
236 int64_t num_values = -1, int64_t bytes_per_value = -1) {
237 auto schema = ::arrow::schema({field("s", array->type(), nullable)});
238 auto table = ::arrow::Table::Make(schema, {array}, array->length());
239
240 EXIT_NOT_OK(table->Validate());
241
242 BenchmarkReadTable(state, *table, num_values, bytes_per_value);
243 }
244
245 //
246 // Benchmark reading a primitive column
247 //
248
249 template <bool nullable, typename ParquetType>
BM_ReadColumn(::benchmark::State & state)250 static void BM_ReadColumn(::benchmark::State& state) {
251 using T = typename ParquetType::c_type;
252
253 auto values = RandomVector<T>(/*percentage=*/state.range(1), BENCHMARK_SIZE,
254 Examples<T>::values());
255
256 std::shared_ptr<::arrow::Table> table =
257 TableFromVector<ParquetType>(values, nullable, state.range(0));
258
259 BenchmarkReadTable(state, *table, table->num_rows(),
260 sizeof(typename ParquetType::c_type));
261 }
262
263 // There are two parameters here that cover different data distributions.
264 // null_percentage governs distribution and therefore runs of null values.
265 // first_value_percentage governs distribution of values (we select from 1 of 2)
266 // so when 0 or 100 RLE is triggered all the time. When a value in the range (0, 100)
267 // there will be some percentage of RLE encoded values and some percentage of literal
268 // encoded values (RLE is much less likely with percentages close to 50).
269 BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
270 ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
271 ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
272 ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
273
274 BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
275 ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
276 ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
277 ->Args({/*null_percentage=*/10, /*first_value_percentage=*/10})
278 ->Args({/*null_percentage=*/25, /*first_value_percentage=*/5})
279 ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
280 ->Args({/*null_percentage=*/50, /*first_value_percentage=*/0})
281 ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
282 ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});
283
284 BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type)
285 ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
286 ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
287 ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
288 BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type)
289 ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
290 ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
291 ->Args({/*null_percentage=*/5, /*first_value_percentage=*/5})
292 ->Args({/*null_percentage=*/10, /*first_value_percentage=*/5})
293 ->Args({/*null_percentage=*/25, /*first_value_percentage=*/10})
294 ->Args({/*null_percentage=*/30, /*first_value_percentage=*/10})
295 ->Args({/*null_percentage=*/35, /*first_value_percentage=*/10})
296 ->Args({/*null_percentage=*/45, /*first_value_percentage=*/25})
297 ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
298 ->Args({/*null_percentage=*/50, /*first_value_percentage=*/1})
299 ->Args({/*null_percentage=*/75, /*first_value_percentage=*/1})
300 ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
301 ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});
302
303 BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType)
304 ->Args({kAlternatingOrNa, 0})
305 ->Args({kAlternatingOrNa, 20});
306 // Less coverage because int64_t should be pretty good representation for nullability and
307 // repeating values.
308 BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType)
309 ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
310 ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
311 ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25});
312
313 BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType)
314 ->Args({kAlternatingOrNa, 0})
315 ->Args({1, 20});
316 BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
317 ->Args({kAlternatingOrNa, 1})
318 ->Args({5, 10});
319
320 //
321 // Benchmark reading a nested column
322 //
323
324 const std::vector<int64_t> kNestedNullPercents = {0, 1, 50, 99};
325
326 // XXX We can use ArgsProduct() starting from Benchmark 1.5.2
NestedReadArguments(::benchmark::internal::Benchmark * b)327 static void NestedReadArguments(::benchmark::internal::Benchmark* b) {
328 for (const auto null_percentage : kNestedNullPercents) {
329 b->Arg(null_percentage);
330 }
331 }
332
MakeStructArray(::arrow::random::RandomArrayGenerator * rng,const ArrayVector & children,double null_probability,bool propagate_validity=false)333 static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
334 const ArrayVector& children,
335 double null_probability,
336 bool propagate_validity = false) {
337 ARROW_CHECK_GT(children.size(), 0);
338 const int64_t length = children[0]->length();
339
340 std::shared_ptr<::arrow::Buffer> null_bitmap;
341 if (null_probability > 0.0) {
342 null_bitmap = rng->NullBitmap(length, null_probability);
343 if (propagate_validity) {
344 // HACK: the Parquet writer currently doesn't allow non-empty list
345 // entries where a parent node is null (for instance, a struct-of-list
346 // where the outer struct is marked null but the inner list value is
347 // non-empty).
348 for (const auto& child : children) {
349 null_bitmap = *::arrow::internal::BitmapOr(
350 ::arrow::default_memory_pool(), null_bitmap->data(), 0,
351 child->null_bitmap_data(), 0, length, 0);
352 }
353 }
354 }
355 FieldVector fields(children.size());
356 char field_name = 'a';
357 for (size_t i = 0; i < children.size(); ++i) {
358 fields[i] = field(std::string{field_name++}, children[i]->type(),
359 /*nullable=*/null_probability > 0.0);
360 }
361 return *::arrow::StructArray::Make(children, std::move(fields), null_bitmap);
362 }
363
364 // Make a (int32, int64) struct array
MakeStructArray(::arrow::random::RandomArrayGenerator * rng,int64_t size,double null_probability)365 static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
366 int64_t size, double null_probability) {
367 auto values1 = rng->Int32(size, -5, 5, null_probability);
368 auto values2 = rng->Int64(size, -12345678912345LL, 12345678912345LL, null_probability);
369 return MakeStructArray(rng, {values1, values2}, null_probability);
370 }
371
BM_ReadStructColumn(::benchmark::State & state)372 static void BM_ReadStructColumn(::benchmark::State& state) {
373 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
374 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
375 const bool nullable = (null_probability != 0.0);
376
377 ARROW_CHECK_GE(null_probability, 0.0);
378
379 const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
380
381 ::arrow::random::RandomArrayGenerator rng(42);
382 auto array = MakeStructArray(&rng, kNumValues, null_probability);
383
384 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
385 }
386
387 BENCHMARK(BM_ReadStructColumn)->Apply(NestedReadArguments);
388
BM_ReadStructOfStructColumn(::benchmark::State & state)389 static void BM_ReadStructOfStructColumn(::benchmark::State& state) {
390 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
391 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
392 const bool nullable = (null_probability != 0.0);
393
394 ARROW_CHECK_GE(null_probability, 0.0);
395
396 const int64_t kBytesPerValue = 2 * (sizeof(int32_t) + sizeof(int64_t));
397
398 ::arrow::random::RandomArrayGenerator rng(42);
399 auto values1 = MakeStructArray(&rng, kNumValues, null_probability);
400 auto values2 = MakeStructArray(&rng, kNumValues, null_probability);
401 auto array = MakeStructArray(&rng, {values1, values2}, null_probability);
402
403 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
404 }
405
406 BENCHMARK(BM_ReadStructOfStructColumn)->Apply(NestedReadArguments);
407
BM_ReadStructOfListColumn(::benchmark::State & state)408 static void BM_ReadStructOfListColumn(::benchmark::State& state) {
409 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
410 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
411 const bool nullable = (null_probability != 0.0);
412
413 ARROW_CHECK_GE(null_probability, 0.0);
414
415 ::arrow::random::RandomArrayGenerator rng(42);
416
417 const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
418
419 auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
420 auto values2 =
421 rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
422 auto list1 = rng.List(*values1, kNumValues / 10, null_probability);
423 auto list2 = rng.List(*values2, kNumValues / 10, null_probability);
424 auto array = MakeStructArray(&rng, {list1, list2}, null_probability,
425 /*propagate_validity =*/true);
426
427 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
428 }
429
430 BENCHMARK(BM_ReadStructOfListColumn)->Apply(NestedReadArguments);
431
BM_ReadListColumn(::benchmark::State & state)432 static void BM_ReadListColumn(::benchmark::State& state) {
433 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
434 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
435 const bool nullable = (null_probability != 0.0);
436
437 ARROW_CHECK_GE(null_probability, 0.0);
438
439 ::arrow::random::RandomArrayGenerator rng(42);
440
441 auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
442 const int64_t kBytesPerValue = sizeof(int64_t);
443
444 auto array = rng.List(*values, kNumValues / 10, null_probability);
445
446 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
447 }
448
449 BENCHMARK(BM_ReadListColumn)->Apply(NestedReadArguments);
450
BM_ReadListOfStructColumn(::benchmark::State & state)451 static void BM_ReadListOfStructColumn(::benchmark::State& state) {
452 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
453 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
454 const bool nullable = (null_probability != 0.0);
455
456 ARROW_CHECK_GE(null_probability, 0.0);
457
458 ::arrow::random::RandomArrayGenerator rng(42);
459
460 auto values = MakeStructArray(&rng, kNumValues, null_probability);
461 const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
462
463 auto array = rng.List(*values, kNumValues / 10, null_probability);
464
465 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
466 }
467
468 BENCHMARK(BM_ReadListOfStructColumn)->Apply(NestedReadArguments);
469
BM_ReadListOfListColumn(::benchmark::State & state)470 static void BM_ReadListOfListColumn(::benchmark::State& state) {
471 constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
472 const double null_probability = static_cast<double>(state.range(0)) / 100.0;
473 const bool nullable = (null_probability != 0.0);
474
475 ARROW_CHECK_GE(null_probability, 0.0);
476
477 ::arrow::random::RandomArrayGenerator rng(42);
478
479 auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
480 const int64_t kBytesPerValue = sizeof(int64_t);
481
482 auto inner = rng.List(*values, kNumValues / 10, null_probability);
483 auto array = rng.List(*inner, kNumValues / 100, null_probability);
484
485 BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
486 }
487
488 BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments);
489
490 //
491 // Benchmark different ways of reading select row groups
492 //
493
BM_ReadIndividualRowGroups(::benchmark::State & state)494 static void BM_ReadIndividualRowGroups(::benchmark::State& state) {
495 std::vector<int64_t> values(BENCHMARK_SIZE, 128);
496 std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
497 auto output = CreateOutputStream();
498 // This writes 10 RowGroups
499 EXIT_NOT_OK(
500 WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE / 10));
501
502 PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
503
504 while (state.KeepRunning()) {
505 auto reader =
506 ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
507 std::unique_ptr<FileReader> arrow_reader;
508 EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
509 &arrow_reader));
510
511 std::vector<std::shared_ptr<::arrow::Table>> tables;
512 for (int i = 0; i < arrow_reader->num_row_groups(); i++) {
513 // Only read the even numbered RowGroups
514 if ((i % 2) == 0) {
515 std::shared_ptr<::arrow::Table> table;
516 EXIT_NOT_OK(arrow_reader->RowGroup(i)->ReadTable(&table));
517 tables.push_back(table);
518 }
519 }
520
521 std::shared_ptr<::arrow::Table> final_table;
522 PARQUET_ASSIGN_OR_THROW(final_table, ConcatenateTables(tables));
523 }
524 SetBytesProcessed<true, Int64Type>(state);
525 }
526
527 BENCHMARK(BM_ReadIndividualRowGroups);
528
BM_ReadMultipleRowGroups(::benchmark::State & state)529 static void BM_ReadMultipleRowGroups(::benchmark::State& state) {
530 std::vector<int64_t> values(BENCHMARK_SIZE, 128);
531 std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
532 auto output = CreateOutputStream();
533 // This writes 10 RowGroups
534 EXIT_NOT_OK(
535 WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE / 10));
536 PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
537
538 while (state.KeepRunning()) {
539 auto reader =
540 ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
541 std::unique_ptr<FileReader> arrow_reader;
542 EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
543 &arrow_reader));
544
545 std::vector<std::shared_ptr<::arrow::Table>> tables;
546 std::vector<int> rgs;
547 for (int i = 0; i < arrow_reader->num_row_groups(); i++) {
548 // Only read the even numbered RowGroups
549 if ((i % 2) == 0) {
550 rgs.push_back(i);
551 }
552 }
553
554 std::shared_ptr<::arrow::Table> table;
555 EXIT_NOT_OK(arrow_reader->ReadRowGroups(rgs, &table));
556 }
557 SetBytesProcessed<true, Int64Type>(state);
558 }
559
560 BENCHMARK(BM_ReadMultipleRowGroups);
561
562 } // namespace benchmark
563
564 } // namespace parquet
565