1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "parquet/printer.h"
19
20 #include <cstdint>
21 #include <cstdio>
22 #include <memory>
23 #include <ostream>
24 #include <string>
25 #include <vector>
26
27 #include "arrow/util/key_value_metadata.h"
28
29 #include "parquet/column_scanner.h"
30 #include "parquet/exception.h"
31 #include "parquet/file_reader.h"
32 #include "parquet/metadata.h"
33 #include "parquet/schema.h"
34 #include "parquet/statistics.h"
35 #include "parquet/types.h"
36
37 namespace parquet {
38
39 class ColumnReader;
40
41 // ----------------------------------------------------------------------
42 // ParquetFilePrinter::DebugPrint
43
// Fixed display width (in characters) for each column cell in the value
// table printed by DebugPrint; the value is an arbitrary default.
45 #define COL_WIDTH 30
46
// Writes a human-readable dump of the Parquet file to `stream`.
//
// Output sections, in order: file-level header (name, version, creator, row
// count), optional key/value metadata, schema info for the selected columns,
// and per-row-group column-chunk metadata. When `print_values` is true the
// actual column values are printed as well.
//
// Parameters:
//   stream                   - destination for all output.
//   selected_columns         - leaf-column indices to print; an EMPTY list
//                              means "all columns" (taken by value because it
//                              is filled in locally in that case).
//   print_values             - if true, also dump the column data itself.
//   format_dump              - if true, values are printed one column at a
//                              time (machine-friendly dump) instead of the
//                              side-by-side fixed-width table.
//   print_key_value_metadata - if true, echo the file's key/value metadata.
//   filename                 - display name echoed in the header (not used to
//                              open anything; the reader is already open).
//
// Throws ParquetException if any explicitly selected column index is out of
// range for this file's schema.
void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
                                    bool print_values, bool format_dump,
                                    bool print_key_value_metadata, const char* filename) {
  // Borrowed pointer: the metadata object stays alive as long as fileReader.
  const FileMetaData* file_metadata = fileReader->metadata().get();

  stream << "File Name: " << filename << "\n";
  stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
  stream << "Created By: " << file_metadata->created_by() << "\n";
  stream << "Total rows: " << file_metadata->num_rows() << "\n";

  if (print_key_value_metadata && file_metadata->key_value_metadata()) {
    auto key_value_metadata = file_metadata->key_value_metadata();
    int64_t size_of_key_value_metadata = key_value_metadata->size();
    stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
    for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
      stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
             << key_value_metadata->value(i) << "\n";
    }
  }

  stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
  stream << "Number of Real Columns: "
         << file_metadata->schema()->group_node()->field_count() << "\n";

  // An empty selection means "all leaf columns"; otherwise validate the
  // caller-provided indices up front so we fail before printing any rows.
  if (selected_columns.size() == 0) {
    for (int i = 0; i < file_metadata->num_columns(); i++) {
      selected_columns.push_back(i);
    }
  } else {
    for (auto i : selected_columns) {
      if (i < 0 || i >= file_metadata->num_columns()) {
        throw ParquetException("Selected column is out of range");
      }
    }
  }

  stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
  stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
  // Schema line per selected column: dotted path, physical type, and (when
  // present) the converted type plus decimal precision/scale.
  for (auto i : selected_columns) {
    const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
    stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
           << TypeToString(descr->physical_type());
    if (descr->converted_type() != ConvertedType::NONE) {
      stream << "/" << ConvertedTypeToString(descr->converted_type());
    }
    if (descr->converted_type() == ConvertedType::DECIMAL) {
      stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
    }
    stream << ")" << std::endl;
  }

  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
    stream << "--- Row Group: " << r << " ---\n";

    auto group_reader = fileReader->RowGroup(r);
    std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);

    stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
    stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";

    // Print column metadata
    for (auto i : selected_columns) {
      auto column_chunk = group_metadata->ColumnChunk(i);
      std::shared_ptr<Statistics> stats = column_chunk->statistics();

      const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
      stream << "Column " << i << std::endl << "  Values: " << column_chunk->num_values();
      if (column_chunk->is_stats_set()) {
        // NOTE(review): null_count()/distinct_count() are printed whenever the
        // chunk reports stats at all; presumably they are meaningful in that
        // case — confirm against Statistics' Has* accessors if available.
        std::string min = stats->EncodeMin(), max = stats->EncodeMax();
        stream << ", Null Values: " << stats->null_count()
               << ", Distinct Values: " << stats->distinct_count() << std::endl
               << "  Max: " << FormatStatValue(descr->physical_type(), max)
               << ", Min: " << FormatStatValue(descr->physical_type(), min);
      } else {
        stream << "  Statistics Not Set";
      }
      stream << std::endl
             << "  Compression: " << Codec::GetCodecAsString(column_chunk->compression())
             << ", Encodings:";
      for (auto encoding : column_chunk->encodings()) {
        stream << " " << EncodingToString(encoding);
      }
      stream << std::endl
             << "  Uncompressed Size: " << column_chunk->total_uncompressed_size()
             << ", Compressed Size: " << column_chunk->total_compressed_size()
             << std::endl;
    }

    if (!print_values) {
      continue;
    }
    stream << "--- Values ---\n";

    // Scratch buffer for one fixed-width table cell (+1 for the NUL that
    // snprintf always writes).
    static constexpr int bufsize = COL_WIDTH + 1;
    char buffer[bufsize];

    // Create readers for selected columns and print contents
    std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
    int j = 0;
    for (auto i : selected_columns) {
      std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
      // This is OK in this method as long as the RowGroupReader does not get
      // deleted
      auto& scanner = scanners[j++] = Scanner::Make(col_reader);

      if (format_dump) {
        // Dump mode: exhaust this column entirely (one value per line) before
        // moving on to the next selected column.
        stream << "Column " << i << std::endl;
        while (scanner->HasNext()) {
          scanner->PrintNext(stream, 0, true);
          stream << "\n";
        }
        continue;
      }

      // Table mode: emit the column name left-padded to the fixed cell width
      // as part of the header row.
      snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
               file_metadata->schema()->Column(i)->name().c_str());
      stream << buffer << '|';
    }
    if (format_dump) {
      continue;
    }
    stream << "\n";

    // Table mode body: one pass per row, pulling the next value from every
    // scanner that still has data. NOTE(review): a scanner that runs out
    // before the others simply stops contributing cells, so trailing rows can
    // lose alignment when columns have different value counts — presumably
    // acceptable for a debug dump.
    bool hasRow;
    do {
      hasRow = false;
      for (auto scanner : scanners) {
        if (scanner->HasNext()) {
          hasRow = true;
          scanner->PrintNext(stream, COL_WIDTH);
          stream << '|';
        }
      }
      stream << "\n";
    } while (hasRow);
  }
}
184
// Writes the file's metadata to `stream` as a JSON document: file-level
// fields, a "Columns" array describing the selected schema columns, and a
// "RowGroups" array with per-chunk statistics, compression, encodings, and
// sizes. Column values themselves are never printed.
//
// Parameters:
//   stream           - destination for the JSON text.
//   selected_columns - leaf-column indices to include; an EMPTY list means
//                      "all columns" (taken by value because it is filled in
//                      locally in that case).
//   filename         - display name echoed in the "FileName" field.
//
// Throws ParquetException if any explicitly selected column index is out of
// range for this file's schema.
//
// NOTE(review): string values (filename, created_by, column names, encoded
// stat values) are emitted without JSON escaping, so embedded quotes or
// backslashes would produce invalid JSON — confirm whether inputs are
// guaranteed safe.
void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
                                   const char* filename) {
  // Borrowed pointer: the metadata object stays alive as long as fileReader.
  const FileMetaData* file_metadata = fileReader->metadata().get();
  stream << "{\n";
  stream << " \"FileName\": \"" << filename << "\",\n";
  stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version())
         << "\",\n";
  stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
  stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
  stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
  stream << " \"NumberOfRealColumns\": \""
         << file_metadata->schema()->group_node()->field_count() << "\",\n";
  stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";

  // An empty selection means "all leaf columns"; otherwise validate the
  // caller-provided indices before emitting any per-column output.
  if (selected_columns.size() == 0) {
    for (int i = 0; i < file_metadata->num_columns(); i++) {
      selected_columns.push_back(i);
    }
  } else {
    for (auto i : selected_columns) {
      if (i < 0 || i >= file_metadata->num_columns()) {
        throw ParquetException("Selected column is out of range");
      }
    }
  }

  // "Columns" array: one object per selected column. The counter `c` exists
  // only to suppress the trailing comma after the last element.
  stream << " \"Columns\": [\n";
  int c = 0;
  for (auto i : selected_columns) {
    const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
    stream << "   { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\","
           << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
           << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
           << "\","
           << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
    c++;
    if (c != static_cast<int>(selected_columns.size())) {
      stream << ",\n";
    }
  }

  // "RowGroups" array: one object per row group, each with a "ColumnChunks"
  // array covering the selected columns.
  stream << "\n ],\n \"RowGroups\": [\n";
  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
    stream << "  {\n  \"Id\": \"" << r << "\", ";

    auto group_reader = fileReader->RowGroup(r);
    std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);

    stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
    stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";

    // Print column metadata
    stream << "   \"ColumnChunks\": [\n";
    int c1 = 0;  // trailing-comma suppression, as with `c` above
    for (auto i : selected_columns) {
      auto column_chunk = group_metadata->ColumnChunk(i);
      std::shared_ptr<Statistics> stats = column_chunk->statistics();

      const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
      stream << "      {\"Id\": \"" << i << "\", \"Values\": \""
             << column_chunk->num_values() << "\", "
             << "\"StatsSet\": ";
      if (column_chunk->is_stats_set()) {
        // NOTE(review): as in DebugPrint, NumNulls/DistinctValues are printed
        // whenever stats are set at all — confirm they are always populated.
        stream << "\"True\", \"Stats\": {";
        std::string min = stats->EncodeMin(), max = stats->EncodeMax();
        stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
               << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
               << "\" },";
      } else {
        stream << "\"False\",";
      }
      // All encodings are joined into one space-separated string value
      // (leaving a trailing space before the closing quote).
      stream << "\n         \"Compression\": \""
             << Codec::GetCodecAsString(column_chunk->compression())
             << "\", \"Encodings\": \"";
      for (auto encoding : column_chunk->encodings()) {
        stream << EncodingToString(encoding) << " ";
      }
      stream << "\", "
             << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
             << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();

      // end of a ColumnChunk
      stream << "\" }";
      c1++;
      if (c1 != static_cast<int>(selected_columns.size())) {
        stream << ",\n";
      }
    }

    stream << "\n   ]\n  }";
    if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
      stream << ",\n";
    }
  }
  stream << "\n ]\n}\n";
}
283
284 } // namespace parquet
285