1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "parquet/printer.h"
19 
20 #include <cstdint>
21 #include <cstdio>
22 #include <memory>
23 #include <ostream>
24 #include <string>
25 #include <vector>
26 
27 #include "arrow/util/key_value_metadata.h"
28 
29 #include "parquet/column_scanner.h"
30 #include "parquet/exception.h"
31 #include "parquet/file_reader.h"
32 #include "parquet/metadata.h"
33 #include "parquet/schema.h"
34 #include "parquet/statistics.h"
35 #include "parquet/types.h"
36 
37 namespace parquet {
38 
39 class ColumnReader;
40 
41 // ----------------------------------------------------------------------
42 // ParquetFilePrinter::DebugPrint
43 
// Fixed column width (in characters) used for the tabular value dump below;
// an arbitrary default, not tuned to any particular schema.
45 #define COL_WIDTH 30
46 
DebugPrint(std::ostream & stream,std::list<int> selected_columns,bool print_values,bool format_dump,bool print_key_value_metadata,const char * filename)47 void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
48                                     bool print_values, bool format_dump,
49                                     bool print_key_value_metadata, const char* filename) {
50   const FileMetaData* file_metadata = fileReader->metadata().get();
51 
52   stream << "File Name: " << filename << "\n";
53   stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
54   stream << "Created By: " << file_metadata->created_by() << "\n";
55   stream << "Total rows: " << file_metadata->num_rows() << "\n";
56 
57   if (print_key_value_metadata && file_metadata->key_value_metadata()) {
58     auto key_value_metadata = file_metadata->key_value_metadata();
59     int64_t size_of_key_value_metadata = key_value_metadata->size();
60     stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
61     for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
62       stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
63              << key_value_metadata->value(i) << "\n";
64     }
65   }
66 
67   stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
68   stream << "Number of Real Columns: "
69          << file_metadata->schema()->group_node()->field_count() << "\n";
70 
71   if (selected_columns.size() == 0) {
72     for (int i = 0; i < file_metadata->num_columns(); i++) {
73       selected_columns.push_back(i);
74     }
75   } else {
76     for (auto i : selected_columns) {
77       if (i < 0 || i >= file_metadata->num_columns()) {
78         throw ParquetException("Selected column is out of range");
79       }
80     }
81   }
82 
83   stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
84   stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
85   for (auto i : selected_columns) {
86     const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
87     stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
88            << TypeToString(descr->physical_type());
89     if (descr->converted_type() != ConvertedType::NONE) {
90       stream << "/" << ConvertedTypeToString(descr->converted_type());
91     }
92     if (descr->converted_type() == ConvertedType::DECIMAL) {
93       stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
94     }
95     stream << ")" << std::endl;
96   }
97 
98   for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
99     stream << "--- Row Group: " << r << " ---\n";
100 
101     auto group_reader = fileReader->RowGroup(r);
102     std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
103 
104     stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
105     stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
106 
107     // Print column metadata
108     for (auto i : selected_columns) {
109       auto column_chunk = group_metadata->ColumnChunk(i);
110       std::shared_ptr<Statistics> stats = column_chunk->statistics();
111 
112       const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
113       stream << "Column " << i << std::endl << "  Values: " << column_chunk->num_values();
114       if (column_chunk->is_stats_set()) {
115         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
116         stream << ", Null Values: " << stats->null_count()
117                << ", Distinct Values: " << stats->distinct_count() << std::endl
118                << "  Max: " << FormatStatValue(descr->physical_type(), max)
119                << ", Min: " << FormatStatValue(descr->physical_type(), min);
120       } else {
121         stream << "  Statistics Not Set";
122       }
123       stream << std::endl
124              << "  Compression: " << Codec::GetCodecAsString(column_chunk->compression())
125              << ", Encodings:";
126       for (auto encoding : column_chunk->encodings()) {
127         stream << " " << EncodingToString(encoding);
128       }
129       stream << std::endl
130              << "  Uncompressed Size: " << column_chunk->total_uncompressed_size()
131              << ", Compressed Size: " << column_chunk->total_compressed_size()
132              << std::endl;
133     }
134 
135     if (!print_values) {
136       continue;
137     }
138     stream << "--- Values ---\n";
139 
140     static constexpr int bufsize = COL_WIDTH + 1;
141     char buffer[bufsize];
142 
143     // Create readers for selected columns and print contents
144     std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
145     int j = 0;
146     for (auto i : selected_columns) {
147       std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
148       // This is OK in this method as long as the RowGroupReader does not get
149       // deleted
150       auto& scanner = scanners[j++] = Scanner::Make(col_reader);
151 
152       if (format_dump) {
153         stream << "Column " << i << std::endl;
154         while (scanner->HasNext()) {
155           scanner->PrintNext(stream, 0, true);
156           stream << "\n";
157         }
158         continue;
159       }
160 
161       snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
162                file_metadata->schema()->Column(i)->name().c_str());
163       stream << buffer << '|';
164     }
165     if (format_dump) {
166       continue;
167     }
168     stream << "\n";
169 
170     bool hasRow;
171     do {
172       hasRow = false;
173       for (auto scanner : scanners) {
174         if (scanner->HasNext()) {
175           hasRow = true;
176           scanner->PrintNext(stream, COL_WIDTH);
177           stream << '|';
178         }
179       }
180       stream << "\n";
181     } while (hasRow);
182   }
183 }
184 
JSONPrint(std::ostream & stream,std::list<int> selected_columns,const char * filename)185 void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
186                                    const char* filename) {
187   const FileMetaData* file_metadata = fileReader->metadata().get();
188   stream << "{\n";
189   stream << "  \"FileName\": \"" << filename << "\",\n";
190   stream << "  \"Version\": \"" << ParquetVersionToString(file_metadata->version())
191          << "\",\n";
192   stream << "  \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
193   stream << "  \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
194   stream << "  \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
195   stream << "  \"NumberOfRealColumns\": \""
196          << file_metadata->schema()->group_node()->field_count() << "\",\n";
197   stream << "  \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
198 
199   if (selected_columns.size() == 0) {
200     for (int i = 0; i < file_metadata->num_columns(); i++) {
201       selected_columns.push_back(i);
202     }
203   } else {
204     for (auto i : selected_columns) {
205       if (i < 0 || i >= file_metadata->num_columns()) {
206         throw ParquetException("Selected column is out of range");
207       }
208     }
209   }
210 
211   stream << "  \"Columns\": [\n";
212   int c = 0;
213   for (auto i : selected_columns) {
214     const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
215     stream << "     { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\","
216            << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
217            << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
218            << "\","
219            << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
220     c++;
221     if (c != static_cast<int>(selected_columns.size())) {
222       stream << ",\n";
223     }
224   }
225 
226   stream << "\n  ],\n  \"RowGroups\": [\n";
227   for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
228     stream << "     {\n       \"Id\": \"" << r << "\", ";
229 
230     auto group_reader = fileReader->RowGroup(r);
231     std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
232 
233     stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
234     stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
235 
236     // Print column metadata
237     stream << "       \"ColumnChunks\": [\n";
238     int c1 = 0;
239     for (auto i : selected_columns) {
240       auto column_chunk = group_metadata->ColumnChunk(i);
241       std::shared_ptr<Statistics> stats = column_chunk->statistics();
242 
243       const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
244       stream << "          {\"Id\": \"" << i << "\", \"Values\": \""
245              << column_chunk->num_values() << "\", "
246              << "\"StatsSet\": ";
247       if (column_chunk->is_stats_set()) {
248         stream << "\"True\", \"Stats\": {";
249         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
250         stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
251                << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
252                << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
253                << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
254                << "\" },";
255       } else {
256         stream << "\"False\",";
257       }
258       stream << "\n           \"Compression\": \""
259              << Codec::GetCodecAsString(column_chunk->compression())
260              << "\", \"Encodings\": \"";
261       for (auto encoding : column_chunk->encodings()) {
262         stream << EncodingToString(encoding) << " ";
263       }
264       stream << "\", "
265              << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
266              << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
267 
268       // end of a ColumnChunk
269       stream << "\" }";
270       c1++;
271       if (c1 != static_cast<int>(selected_columns.size())) {
272         stream << ",\n";
273       }
274     }
275 
276     stream << "\n        ]\n     }";
277     if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
278       stream << ",\n";
279     }
280   }
281   stream << "\n  ]\n}\n";
282 }
283 
284 }  // namespace parquet
285