1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
// This module implements helper utilities for Parquet tests: locating test data
// files through the PARQUET_TEST_DATA environment variable and filling
// caller-provided buffers with seeded pseudo-random values.
21
22 #include "parquet/test_util.h"
23
24 #include <algorithm>
25 #include <chrono>
26 #include <limits>
27 #include <memory>
28 #include <random>
29 #include <string>
30 #include <utility>
31 #include <vector>
32
33 #include "parquet/column_page.h"
34 #include "parquet/column_reader.h"
35 #include "parquet/column_writer.h"
36 #include "parquet/encoding.h"
37 #include "parquet/platform.h"
38
39 namespace parquet {
40 namespace test {
41
get_data_dir()42 const char* get_data_dir() {
43 const auto result = std::getenv("PARQUET_TEST_DATA");
44 if (!result || !result[0]) {
45 throw ParquetTestException(
46 "Please point the PARQUET_TEST_DATA environment "
47 "variable to the test data directory");
48 }
49 return result;
50 }
51
get_bad_data_dir()52 std::string get_bad_data_dir() {
53 // PARQUET_TEST_DATA should point to ARROW_HOME/cpp/submodules/parquet-testing/data
54 // so need to reach one folder up to access the "bad_data" folder.
55 std::string data_dir(get_data_dir());
56 std::stringstream ss;
57 ss << data_dir << "/../bad_data";
58 return ss.str();
59 }
60
get_data_file(const std::string & filename,bool is_good)61 std::string get_data_file(const std::string& filename, bool is_good) {
62 std::stringstream ss;
63
64 if (is_good) {
65 ss << get_data_dir();
66 } else {
67 ss << get_bad_data_dir();
68 }
69
70 ss << "/" << filename;
71 return ss.str();
72 }
73
// Resizes *out to n elements and overwrites every element with a pseudo-random
// byte drawn uniformly from [0, 255].
//
// n:    number of bytes to produce; passed directly to out->resize().
// seed: seed for the std::default_random_engine, so equal seeds give equal output.
// out:  destination vector; must not be null.
void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out) {
  std::default_random_engine engine(seed);
  // The distribution is over int and the result is narrowed afterwards, since
  // std::uniform_int_distribution is not specified for 8-bit integer types.
  std::uniform_int_distribution<int> byte_dist(0, 255);

  out->resize(n);
  std::generate(out->begin(), out->end(),
                [&]() { return static_cast<uint8_t>(byte_dist(engine)); });
}
83
// Writes n pseudo-random booleans to out[0..n), each drawn from a Bernoulli
// distribution that yields true with probability p.
//
// n:    number of values to write; nothing is written when n <= 0.
// p:    probability of true, forwarded to std::bernoulli_distribution.
// seed: seed for the std::default_random_engine, so equal seeds give equal output.
// out:  caller-provided array with room for at least n elements (not checked).
void random_bools(int n, double p, uint32_t seed, bool* out) {
  std::default_random_engine engine(seed);
  std::bernoulli_distribution coin(p);
  std::generate_n(out, n, [&]() { return coin(engine); });
}
91
random_Int96_numbers(int n,uint32_t seed,int32_t min_value,int32_t max_value,Int96 * out)92 void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value,
93 Int96* out) {
94 std::default_random_engine gen(seed);
95 std::uniform_int_distribution<int32_t> d(min_value, max_value);
96 for (int i = 0; i < n; ++i) {
97 out[i].value[0] = d(gen);
98 out[i].value[1] = d(gen);
99 out[i].value[2] = d(gen);
100 }
101 }
102
random_fixed_byte_array(int n,uint32_t seed,uint8_t * buf,int len,FLBA * out)103 void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out) {
104 std::default_random_engine gen(seed);
105 std::uniform_int_distribution<int> d(0, 255);
106 for (int i = 0; i < n; ++i) {
107 out[i].ptr = buf;
108 for (int j = 0; j < len; ++j) {
109 buf[j] = static_cast<uint8_t>(d(gen));
110 }
111 buf += len;
112 }
113 }
114
random_byte_array(int n,uint32_t seed,uint8_t * buf,ByteArray * out,int min_size,int max_size)115 void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size,
116 int max_size) {
117 std::default_random_engine gen(seed);
118 std::uniform_int_distribution<int> d1(min_size, max_size);
119 std::uniform_int_distribution<int> d2(0, 255);
120 for (int i = 0; i < n; ++i) {
121 int len = d1(gen);
122 out[i].len = len;
123 out[i].ptr = buf;
124 for (int j = 0; j < len; ++j) {
125 buf[j] = static_cast<uint8_t>(d2(gen));
126 }
127 buf += len;
128 }
129 }
130
random_byte_array(int n,uint32_t seed,uint8_t * buf,ByteArray * out,int max_size)131 void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size) {
132 random_byte_array(n, seed, buf, out, 0, max_size);
133 }
134
135 } // namespace test
136 } // namespace parquet
137