// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17 
18 #include "parquet/printer.h"
19 
20 #include <cstdint>
21 #include <cstdio>
22 #include <memory>
23 #include <ostream>
24 #include <string>
25 #include <vector>
26 
27 #include "arrow/util/key_value_metadata.h"
28 #include "arrow/util/string.h"
29 
30 #include "parquet/column_scanner.h"
31 #include "parquet/exception.h"
32 #include "parquet/file_reader.h"
33 #include "parquet/metadata.h"
34 #include "parquet/schema.h"
35 #include "parquet/statistics.h"
36 #include "parquet/types.h"
37 
38 namespace parquet {
39 
40 class ColumnReader;
41 
42 // ----------------------------------------------------------------------
43 // ParquetFilePrinter::DebugPrint
44 
45 // the fixed initial size is just for an example
46 #define COL_WIDTH 30
47 
DebugPrint(std::ostream & stream,std::list<int> selected_columns,bool print_values,bool format_dump,bool print_key_value_metadata,const char * filename)48 void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
49                                     bool print_values, bool format_dump,
50                                     bool print_key_value_metadata, const char* filename) {
51   const FileMetaData* file_metadata = fileReader->metadata().get();
52 
53   stream << "File Name: " << filename << "\n";
54   stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
55   stream << "Created By: " << file_metadata->created_by() << "\n";
56   stream << "Total rows: " << file_metadata->num_rows() << "\n";
57 
58   if (print_key_value_metadata && file_metadata->key_value_metadata()) {
59     auto key_value_metadata = file_metadata->key_value_metadata();
60     int64_t size_of_key_value_metadata = key_value_metadata->size();
61     stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
62     for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
63       stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
64              << key_value_metadata->value(i) << "\n";
65     }
66   }
67 
68   stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
69   stream << "Number of Real Columns: "
70          << file_metadata->schema()->group_node()->field_count() << "\n";
71 
72   if (selected_columns.size() == 0) {
73     for (int i = 0; i < file_metadata->num_columns(); i++) {
74       selected_columns.push_back(i);
75     }
76   } else {
77     for (auto i : selected_columns) {
78       if (i < 0 || i >= file_metadata->num_columns()) {
79         throw ParquetException("Selected column is out of range");
80       }
81     }
82   }
83 
84   stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
85   stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
86   for (auto i : selected_columns) {
87     const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
88     stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
89            << TypeToString(descr->physical_type());
90     const auto& logical_type = descr->logical_type();
91     if (!logical_type->is_none()) {
92       stream << " / " << logical_type->ToString();
93     }
94     if (descr->converted_type() != ConvertedType::NONE) {
95       stream << " / " << ConvertedTypeToString(descr->converted_type());
96       if (descr->converted_type() == ConvertedType::DECIMAL) {
97         stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
98       }
99     }
100     stream << ")" << std::endl;
101   }
102 
103   for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
104     stream << "--- Row Group: " << r << " ---\n";
105 
106     auto group_reader = fileReader->RowGroup(r);
107     std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
108 
109     stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
110     stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size()
111            << " ---\n";
112     stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
113 
114     // Print column metadata
115     for (auto i : selected_columns) {
116       auto column_chunk = group_metadata->ColumnChunk(i);
117       std::shared_ptr<Statistics> stats = column_chunk->statistics();
118 
119       const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
120       stream << "Column " << i << std::endl << "  Values: " << column_chunk->num_values();
121       if (column_chunk->is_stats_set()) {
122         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
123         stream << ", Null Values: " << stats->null_count()
124                << ", Distinct Values: " << stats->distinct_count() << std::endl
125                << "  Max: " << FormatStatValue(descr->physical_type(), max)
126                << ", Min: " << FormatStatValue(descr->physical_type(), min);
127       } else {
128         stream << "  Statistics Not Set";
129       }
130       stream << std::endl
131              << "  Compression: "
132              << ::arrow::internal::AsciiToUpper(
133                     Codec::GetCodecAsString(column_chunk->compression()))
134              << ", Encodings:";
135       for (auto encoding : column_chunk->encodings()) {
136         stream << " " << EncodingToString(encoding);
137       }
138       stream << std::endl
139              << "  Uncompressed Size: " << column_chunk->total_uncompressed_size()
140              << ", Compressed Size: " << column_chunk->total_compressed_size()
141              << std::endl;
142     }
143 
144     if (!print_values) {
145       continue;
146     }
147     stream << "--- Values ---\n";
148 
149     static constexpr int bufsize = COL_WIDTH + 1;
150     char buffer[bufsize];
151 
152     // Create readers for selected columns and print contents
153     std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
154     int j = 0;
155     for (auto i : selected_columns) {
156       std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
157       // This is OK in this method as long as the RowGroupReader does not get
158       // deleted
159       auto& scanner = scanners[j++] = Scanner::Make(col_reader);
160 
161       if (format_dump) {
162         stream << "Column " << i << std::endl;
163         while (scanner->HasNext()) {
164           scanner->PrintNext(stream, 0, true);
165           stream << "\n";
166         }
167         continue;
168       }
169 
170       snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
171                file_metadata->schema()->Column(i)->name().c_str());
172       stream << buffer << '|';
173     }
174     if (format_dump) {
175       continue;
176     }
177     stream << "\n";
178 
179     bool hasRow;
180     do {
181       hasRow = false;
182       for (auto scanner : scanners) {
183         if (scanner->HasNext()) {
184           hasRow = true;
185           scanner->PrintNext(stream, COL_WIDTH);
186           stream << '|';
187         }
188       }
189       stream << "\n";
190     } while (hasRow);
191   }
192 }
193 
JSONPrint(std::ostream & stream,std::list<int> selected_columns,const char * filename)194 void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
195                                    const char* filename) {
196   const FileMetaData* file_metadata = fileReader->metadata().get();
197   stream << "{\n";
198   stream << "  \"FileName\": \"" << filename << "\",\n";
199   stream << "  \"Version\": \"" << ParquetVersionToString(file_metadata->version())
200          << "\",\n";
201   stream << "  \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
202   stream << "  \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
203   stream << "  \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
204   stream << "  \"NumberOfRealColumns\": \""
205          << file_metadata->schema()->group_node()->field_count() << "\",\n";
206   stream << "  \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
207 
208   if (selected_columns.size() == 0) {
209     for (int i = 0; i < file_metadata->num_columns(); i++) {
210       selected_columns.push_back(i);
211     }
212   } else {
213     for (auto i : selected_columns) {
214       if (i < 0 || i >= file_metadata->num_columns()) {
215         throw ParquetException("Selected column is out of range");
216       }
217     }
218   }
219 
220   stream << "  \"Columns\": [\n";
221   int c = 0;
222   for (auto i : selected_columns) {
223     const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
224     stream << "     { \"Id\": \"" << i << "\","
225            << " \"Name\": \"" << descr->path()->ToDotString() << "\","
226            << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
227            << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
228            << "\","
229            << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
230     c++;
231     if (c != static_cast<int>(selected_columns.size())) {
232       stream << ",\n";
233     }
234   }
235 
236   stream << "\n  ],\n  \"RowGroups\": [\n";
237   for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
238     stream << "     {\n       \"Id\": \"" << r << "\", ";
239 
240     auto group_reader = fileReader->RowGroup(r);
241     std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
242 
243     stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
244     stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size()
245            << "\", ";
246     stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
247 
248     // Print column metadata
249     stream << "       \"ColumnChunks\": [\n";
250     int c1 = 0;
251     for (auto i : selected_columns) {
252       auto column_chunk = group_metadata->ColumnChunk(i);
253       std::shared_ptr<Statistics> stats = column_chunk->statistics();
254 
255       const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
256       stream << "          {\"Id\": \"" << i << "\", \"Values\": \""
257              << column_chunk->num_values() << "\", "
258              << "\"StatsSet\": ";
259       if (column_chunk->is_stats_set()) {
260         stream << "\"True\", \"Stats\": {";
261         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
262         stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
263                << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
264                << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
265                << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
266                << "\" },";
267       } else {
268         stream << "\"False\",";
269       }
270       stream << "\n           \"Compression\": \""
271              << ::arrow::internal::AsciiToUpper(
272                     Codec::GetCodecAsString(column_chunk->compression()))
273              << "\", \"Encodings\": \"";
274       for (auto encoding : column_chunk->encodings()) {
275         stream << EncodingToString(encoding) << " ";
276       }
277       stream << "\", "
278              << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
279              << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
280 
281       // end of a ColumnChunk
282       stream << "\" }";
283       c1++;
284       if (c1 != static_cast<int>(selected_columns.size())) {
285         stream << ",\n";
286       }
287     }
288 
289     stream << "\n        ]\n     }";
290     if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
291       stream << ",\n";
292     }
293   }
294   stream << "\n  ]\n}\n";
295 }
296 
297 }  // namespace parquet
298