1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
// This module defines the page types (the Page base class, the V1 and V2 data
// pages, and the dictionary page) that are produced when iterating through
// the pages of a Parquet column chunk within a row group. That iteration
// could be extended in the future to cover all data pages in all chunks in a
// file.
21 
22 #pragma once
23 
24 #include <cstdint>
25 #include <memory>
26 #include <string>
27 
28 #include "parquet/statistics.h"
29 #include "parquet/types.h"
30 
31 namespace parquet {
32 
33 // TODO: Parallel processing is not yet safe because of memory-ownership
34 // semantics (the PageReader may or may not own the memory referenced by a
35 // page)
36 //
37 // TODO(wesm): In the future Parquet implementations may store the crc code
38 // in format::PageHeader. parquet-mr currently does not, so we also skip it
39 // here, both on the read and write path
40 class Page {
41  public:
Page(const std::shared_ptr<Buffer> & buffer,PageType::type type)42   Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
43       : buffer_(buffer), type_(type) {}
44 
type()45   PageType::type type() const { return type_; }
46 
buffer()47   std::shared_ptr<Buffer> buffer() const { return buffer_; }
48 
49   // @returns: a pointer to the page's data
data()50   const uint8_t* data() const { return buffer_->data(); }
51 
52   // @returns: the total size in bytes of the page's data buffer
size()53   int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
54 
55  private:
56   std::shared_ptr<Buffer> buffer_;
57   PageType::type type_;
58 };
59 
60 /// \brief Base type for DataPageV1 and DataPageV2 including common attributes
61 class DataPage : public Page {
62  public:
num_values()63   int32_t num_values() const { return num_values_; }
encoding()64   Encoding::type encoding() const { return encoding_; }
uncompressed_size()65   int64_t uncompressed_size() const { return uncompressed_size_; }
statistics()66   const EncodedStatistics& statistics() const { return statistics_; }
67 
68   virtual ~DataPage() = default;
69 
70  protected:
71   DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
72            Encoding::type encoding, int64_t uncompressed_size,
73            const EncodedStatistics& statistics = EncodedStatistics())
Page(buffer,type)74       : Page(buffer, type),
75         num_values_(num_values),
76         encoding_(encoding),
77         uncompressed_size_(uncompressed_size),
78         statistics_(statistics) {}
79 
80   int32_t num_values_;
81   Encoding::type encoding_;
82   int64_t uncompressed_size_;
83   EncodedStatistics statistics_;
84 };
85 
86 class DataPageV1 : public DataPage {
87  public:
88   DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
89              Encoding::type encoding, Encoding::type definition_level_encoding,
90              Encoding::type repetition_level_encoding, int64_t uncompressed_size,
91              const EncodedStatistics& statistics = EncodedStatistics())
DataPage(PageType::DATA_PAGE,buffer,num_values,encoding,uncompressed_size,statistics)92       : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
93                  statistics),
94         definition_level_encoding_(definition_level_encoding),
95         repetition_level_encoding_(repetition_level_encoding) {}
96 
repetition_level_encoding()97   Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
98 
definition_level_encoding()99   Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
100 
101  private:
102   Encoding::type definition_level_encoding_;
103   Encoding::type repetition_level_encoding_;
104 };
105 
106 class DataPageV2 : public DataPage {
107  public:
108   DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
109              int32_t num_rows, Encoding::type encoding,
110              int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
111              int64_t uncompressed_size, bool is_compressed = false,
112              const EncodedStatistics& statistics = EncodedStatistics())
DataPage(PageType::DATA_PAGE_V2,buffer,num_values,encoding,uncompressed_size,statistics)113       : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
114                  statistics),
115         num_nulls_(num_nulls),
116         num_rows_(num_rows),
117         definition_levels_byte_length_(definition_levels_byte_length),
118         repetition_levels_byte_length_(repetition_levels_byte_length),
119         is_compressed_(is_compressed) {}
120 
num_nulls()121   int32_t num_nulls() const { return num_nulls_; }
122 
num_rows()123   int32_t num_rows() const { return num_rows_; }
124 
definition_levels_byte_length()125   int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
126 
repetition_levels_byte_length()127   int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
128 
is_compressed()129   bool is_compressed() const { return is_compressed_; }
130 
131  private:
132   int32_t num_nulls_;
133   int32_t num_rows_;
134   int32_t definition_levels_byte_length_;
135   int32_t repetition_levels_byte_length_;
136   bool is_compressed_;
137 };
138 
139 class DictionaryPage : public Page {
140  public:
141   DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
142                  Encoding::type encoding, bool is_sorted = false)
Page(buffer,PageType::DICTIONARY_PAGE)143       : Page(buffer, PageType::DICTIONARY_PAGE),
144         num_values_(num_values),
145         encoding_(encoding),
146         is_sorted_(is_sorted) {}
147 
num_values()148   int32_t num_values() const { return num_values_; }
149 
encoding()150   Encoding::type encoding() const { return encoding_; }
151 
is_sorted()152   bool is_sorted() const { return is_sorted_; }
153 
154  private:
155   int32_t num_values_;
156   Encoding::type encoding_;
157   bool is_sorted_;
158 };
159 
160 }  // namespace parquet
161