1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
// This module implements helpers shared by the Parquet tests: locating test
// data files through the PARQUET_TEST_DATA environment variable and filling
// buffers with seeded pseudo-random values.
21 
22 #include "parquet/test_util.h"
23 
24 #include <algorithm>
25 #include <chrono>
26 #include <limits>
27 #include <memory>
28 #include <random>
29 #include <string>
30 #include <utility>
31 #include <vector>
32 
33 #include "parquet/column_page.h"
34 #include "parquet/column_reader.h"
35 #include "parquet/column_writer.h"
36 #include "parquet/encoding.h"
37 #include "parquet/platform.h"
38 
39 namespace parquet {
40 namespace test {
41 
get_data_dir()42 const char* get_data_dir() {
43   const auto result = std::getenv("PARQUET_TEST_DATA");
44   if (!result || !result[0]) {
45     throw ParquetTestException(
46         "Please point the PARQUET_TEST_DATA environment "
47         "variable to the test data directory");
48   }
49   return result;
50 }
51 
get_bad_data_dir()52 std::string get_bad_data_dir() {
53   // PARQUET_TEST_DATA should point to ARROW_HOME/cpp/submodules/parquet-testing/data
54   // so need to reach one folder up to access the "bad_data" folder.
55   std::string data_dir(get_data_dir());
56   std::stringstream ss;
57   ss << data_dir << "/../bad_data";
58   return ss.str();
59 }
60 
get_data_file(const std::string & filename,bool is_good)61 std::string get_data_file(const std::string& filename, bool is_good) {
62   std::stringstream ss;
63 
64   if (is_good) {
65     ss << get_data_dir();
66   } else {
67     ss << get_bad_data_dir();
68   }
69 
70   ss << "/" << filename;
71   return ss.str();
72 }
73 
// Resizes `*out` to `n` elements and fills it with pseudo-random bytes.
//
// Each byte is drawn uniformly from [0, 255] using a
// std::default_random_engine seeded with `seed`, so the same seed yields the
// same contents on a given standard library implementation.
void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out) {
  std::default_random_engine engine(seed);
  std::uniform_int_distribution<int> byte_dist(0, 255);

  out->resize(n);
  // Elements are generated front to back, one draw per element.
  std::generate(out->begin(), out->end(), [&engine, &byte_dist]() {
    return static_cast<uint8_t>(byte_dist(engine));
  });
}
83 
// Writes `n` pseudo-random booleans to out[0..n-1].
//
// Each value comes from a std::bernoulli_distribution constructed with `p`,
// driven by a std::default_random_engine seeded with `seed`. Nothing is
// written when `n` is zero or negative.
void random_bools(int n, double p, uint32_t seed, bool* out) {
  std::default_random_engine engine(seed);
  std::bernoulli_distribution coin(p);
  std::generate_n(out, n, [&engine, &coin]() { return coin(engine); });
}
91 
random_Int96_numbers(int n,uint32_t seed,int32_t min_value,int32_t max_value,Int96 * out)92 void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value,
93                           Int96* out) {
94   std::default_random_engine gen(seed);
95   std::uniform_int_distribution<int32_t> d(min_value, max_value);
96   for (int i = 0; i < n; ++i) {
97     out[i].value[0] = d(gen);
98     out[i].value[1] = d(gen);
99     out[i].value[2] = d(gen);
100   }
101 }
102 
random_fixed_byte_array(int n,uint32_t seed,uint8_t * buf,int len,FLBA * out)103 void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out) {
104   std::default_random_engine gen(seed);
105   std::uniform_int_distribution<int> d(0, 255);
106   for (int i = 0; i < n; ++i) {
107     out[i].ptr = buf;
108     for (int j = 0; j < len; ++j) {
109       buf[j] = static_cast<uint8_t>(d(gen));
110     }
111     buf += len;
112   }
113 }
114 
random_byte_array(int n,uint32_t seed,uint8_t * buf,ByteArray * out,int min_size,int max_size)115 void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size,
116                        int max_size) {
117   std::default_random_engine gen(seed);
118   std::uniform_int_distribution<int> d1(min_size, max_size);
119   std::uniform_int_distribution<int> d2(0, 255);
120   for (int i = 0; i < n; ++i) {
121     int len = d1(gen);
122     out[i].len = len;
123     out[i].ptr = buf;
124     for (int j = 0; j < len; ++j) {
125       buf[j] = static_cast<uint8_t>(d2(gen));
126     }
127     buf += len;
128   }
129 }
130 
random_byte_array(int n,uint32_t seed,uint8_t * buf,ByteArray * out,int max_size)131 void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size) {
132   random_byte_array(n, seed, buf, out, 0, max_size);
133 }
134 
135 }  // namespace test
136 }  // namespace parquet
137