1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 // This module defines an abstract interface for iterating through pages in a 19 // Parquet column chunk within a row group. It could be extended in the future 20 // to iterate through all data pages in all chunks in a file. 21 22 #pragma once 23 24 #include <cstdint> 25 #include <memory> 26 #include <string> 27 28 #include "parquet/statistics.h" 29 #include "parquet/types.h" 30 31 namespace parquet { 32 33 // TODO: Parallel processing is not yet safe because of memory-ownership 34 // semantics (the PageReader may or may not own the memory referenced by a 35 // page) 36 // 37 // TODO(wesm): In the future Parquet implementations may store the crc code 38 // in format::PageHeader. parquet-mr currently does not, so we also skip it 39 // here, both on the read and write path 40 class Page { 41 public: Page(const std::shared_ptr<Buffer> & buffer,PageType::type type)42 Page(const std::shared_ptr<Buffer>& buffer, PageType::type type) 43 : buffer_(buffer), type_(type) {} 44 type()45 PageType::type type() const { return type_; } 46 buffer()47 std::shared_ptr<Buffer> buffer() const { return buffer_; } 48 49 // @returns: a pointer to the page's data data()50 const uint8_t* data() const { return buffer_->data(); } 51 52 // @returns: the total size in bytes of the page's data buffer size()53 int32_t size() const { return static_cast<int32_t>(buffer_->size()); } 54 55 private: 56 std::shared_ptr<Buffer> buffer_; 57 PageType::type type_; 58 }; 59 60 /// \brief Base type for DataPageV1 and DataPageV2 including common attributes 61 class DataPage : public Page { 62 public: num_values()63 int32_t num_values() const { return num_values_; } encoding()64 Encoding::type encoding() const { return encoding_; } uncompressed_size()65 int64_t uncompressed_size() const { return uncompressed_size_; } statistics()66 const EncodedStatistics& statistics() const { return statistics_; } 67 68 virtual ~DataPage() = default; 69 70 protected: 71 DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values, 72 Encoding::type encoding, int64_t uncompressed_size, 73 const EncodedStatistics& statistics = EncodedStatistics()) Page(buffer,type)74 : Page(buffer, type), 75 num_values_(num_values), 76 encoding_(encoding), 77 uncompressed_size_(uncompressed_size), 78 statistics_(statistics) {} 79 80 int32_t num_values_; 81 Encoding::type encoding_; 82 int64_t uncompressed_size_; 83 EncodedStatistics statistics_; 84 }; 85 86 class DataPageV1 : public DataPage { 87 public: 88 DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values, 89 Encoding::type encoding, Encoding::type definition_level_encoding, 90 Encoding::type repetition_level_encoding, int64_t uncompressed_size, 91 const EncodedStatistics& statistics = EncodedStatistics()) DataPage(PageType::DATA_PAGE,buffer,num_values,encoding,uncompressed_size,statistics)92 : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size, 93 statistics), 94 definition_level_encoding_(definition_level_encoding), 95 repetition_level_encoding_(repetition_level_encoding) {} 96 repetition_level_encoding()97 Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; } 98 definition_level_encoding()99 Encoding::type definition_level_encoding() const { return definition_level_encoding_; } 100 101 private: 102 Encoding::type definition_level_encoding_; 103 Encoding::type repetition_level_encoding_; 104 }; 105 106 class DataPageV2 : public DataPage { 107 public: 108 DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls, 109 int32_t num_rows, Encoding::type encoding, 110 int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length, 111 int64_t uncompressed_size, bool is_compressed = false, 112 const EncodedStatistics& statistics = EncodedStatistics()) DataPage(PageType::DATA_PAGE_V2,buffer,num_values,encoding,uncompressed_size,statistics)113 : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size, 114 statistics), 115 num_nulls_(num_nulls), 116 num_rows_(num_rows), 117 definition_levels_byte_length_(definition_levels_byte_length), 118 repetition_levels_byte_length_(repetition_levels_byte_length), 119 is_compressed_(is_compressed) {} 120 num_nulls()121 int32_t num_nulls() const { return num_nulls_; } 122 num_rows()123 int32_t num_rows() const { return num_rows_; } 124 definition_levels_byte_length()125 int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; } 126 repetition_levels_byte_length()127 int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; } 128 is_compressed()129 bool is_compressed() const { return is_compressed_; } 130 131 private: 132 int32_t num_nulls_; 133 int32_t num_rows_; 134 int32_t definition_levels_byte_length_; 135 int32_t repetition_levels_byte_length_; 136 bool is_compressed_; 137 }; 138 139 class DictionaryPage : public Page { 140 public: 141 DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values, 142 Encoding::type encoding, bool is_sorted = false) Page(buffer,PageType::DICTIONARY_PAGE)143 : Page(buffer, PageType::DICTIONARY_PAGE), 144 num_values_(num_values), 145 encoding_(encoding), 146 is_sorted_(is_sorted) {} 147 num_values()148 int32_t num_values() const { return num_values_; } 149 encoding()150 Encoding::type encoding() const { return encoding_; } 151 is_sorted()152 bool is_sorted() const { return is_sorted_; } 153 154 private: 155 int32_t num_values_; 156 Encoding::type encoding_; 157 bool is_sorted_; 158 }; 159 160 } // namespace parquet 161