1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <cstdint> 21 #include <memory> 22 #include <utility> 23 24 #include "parquet/metadata.h" 25 #include "parquet/platform.h" 26 #include "parquet/properties.h" 27 #include "parquet/schema.h" 28 29 namespace parquet { 30 31 class ColumnWriter; 32 33 // FIXME: copied from reader-internal.cc 34 static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; 35 static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; 36 37 class PARQUET_EXPORT RowGroupWriter { 38 public: 39 // Forward declare a virtual class 'Contents' to aid dependency injection and more 40 // easily create test fixtures 41 // An implementation of the Contents class is defined in the .cc file 42 struct Contents { 43 virtual ~Contents() = default; 44 virtual int num_columns() const = 0; 45 virtual int64_t num_rows() const = 0; 46 47 // to be used only with ParquetFileWriter::AppendRowGroup 48 virtual ColumnWriter* NextColumn() = 0; 49 // to be used only with ParquetFileWriter::AppendBufferedRowGroup 50 virtual ColumnWriter* column(int i) = 0; 51 52 virtual int current_column() const = 0; 53 virtual void Close() = 0; 54 55 // total bytes written by the page writer 56 virtual int64_t total_bytes_written() const = 0; 57 // total bytes still compressed but not written 58 virtual int64_t total_compressed_bytes() const = 0; 59 }; 60 61 explicit RowGroupWriter(std::unique_ptr<Contents> contents); 62 63 /// Construct a ColumnWriter for the indicated row group-relative column. 64 /// 65 /// To be used only with ParquetFileWriter::AppendRowGroup 66 /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only 67 /// valid until the next call to NextColumn or Close. As the contents are 68 /// directly written to the sink, once a new column is started, the contents 69 /// of the previous one cannot be modified anymore. 70 ColumnWriter* NextColumn(); 71 /// Index of currently written column. Equal to -1 if NextColumn() 72 /// has not been called yet. 73 int current_column(); 74 void Close(); 75 76 int num_columns() const; 77 78 /// Construct a ColumnWriter for the indicated row group column. 79 /// 80 /// To be used only with ParquetFileWriter::AppendBufferedRowGroup 81 /// Ownership is solely within the RowGroupWriter. The ColumnWriter is 82 /// valid until Close. The contents are buffered in memory and written to sink 83 /// on Close 84 ColumnWriter* column(int i); 85 86 /** 87 * Number of rows that shall be written as part of this RowGroup. 88 */ 89 int64_t num_rows() const; 90 91 int64_t total_bytes_written() const; 92 int64_t total_compressed_bytes() const; 93 94 private: 95 // Holds a pointer to an instance of Contents implementation 96 std::unique_ptr<Contents> contents_; 97 }; 98 99 PARQUET_EXPORT 100 void WriteFileMetaData(const FileMetaData& file_metadata, 101 ::arrow::io::OutputStream* sink); 102 103 PARQUET_EXPORT 104 void WriteMetaDataFile(const FileMetaData& file_metadata, 105 ::arrow::io::OutputStream* sink); 106 107 PARQUET_EXPORT 108 void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, 109 ArrowOutputStream* sink, 110 const std::shared_ptr<Encryptor>& encryptor, 111 bool encrypt_footer); 112 113 PARQUET_EXPORT 114 void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, 115 ::arrow::io::OutputStream* sink, 116 const std::shared_ptr<Encryptor>& encryptor = NULLPTR, 117 bool encrypt_footer = false); 118 PARQUET_EXPORT 119 void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, 120 ::arrow::io::OutputStream* sink); 121 122 class PARQUET_EXPORT ParquetFileWriter { 123 public: 124 // Forward declare a virtual class 'Contents' to aid dependency injection and more 125 // easily create test fixtures 126 // An implementation of the Contents class is defined in the .cc file 127 struct Contents { ContentsContents128 Contents(std::shared_ptr<::parquet::schema::GroupNode> schema, 129 std::shared_ptr<const KeyValueMetadata> key_value_metadata) 130 : schema_(), key_value_metadata_(std::move(key_value_metadata)) { 131 schema_.Init(std::move(schema)); 132 } ~ContentsContents133 virtual ~Contents() {} 134 // Perform any cleanup associated with the file contents 135 virtual void Close() = 0; 136 137 /// \note Deprecated since 1.3.0 138 RowGroupWriter* AppendRowGroup(int64_t num_rows); 139 140 virtual RowGroupWriter* AppendRowGroup() = 0; 141 virtual RowGroupWriter* AppendBufferedRowGroup() = 0; 142 143 virtual int64_t num_rows() const = 0; 144 virtual int num_columns() const = 0; 145 virtual int num_row_groups() const = 0; 146 147 virtual const std::shared_ptr<WriterProperties>& properties() const = 0; 148 key_value_metadataContents149 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const { 150 return key_value_metadata_; 151 } 152 153 // Return const-pointer to make it clear that this object is not to be copied schemaContents154 const SchemaDescriptor* schema() const { return &schema_; } 155 156 SchemaDescriptor schema_; 157 158 /// This should be the only place this is stored. Everything else is a const reference 159 std::shared_ptr<const KeyValueMetadata> key_value_metadata_; 160 metadataContents161 const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; } 162 std::shared_ptr<FileMetaData> file_metadata_; 163 }; 164 165 ParquetFileWriter(); 166 ~ParquetFileWriter(); 167 168 static std::unique_ptr<ParquetFileWriter> Open( 169 std::shared_ptr<::arrow::io::OutputStream> sink, 170 std::shared_ptr<schema::GroupNode> schema, 171 std::shared_ptr<WriterProperties> properties = default_writer_properties(), 172 std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR); 173 174 void Open(std::unique_ptr<Contents> contents); 175 void Close(); 176 177 // Construct a RowGroupWriter for the indicated number of rows. 178 // 179 // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid 180 // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. 181 // @param num_rows The number of rows that are stored in the new RowGroup 182 // 183 // \deprecated Since 1.3.0 184 RowGroupWriter* AppendRowGroup(int64_t num_rows); 185 186 /// Construct a RowGroupWriter with an arbitrary number of rows. 187 /// 188 /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid 189 /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. 190 RowGroupWriter* AppendRowGroup(); 191 192 /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready. 193 /// Use this if you want to write a RowGroup based on a certain size 194 /// 195 /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid 196 /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. 197 RowGroupWriter* AppendBufferedRowGroup(); 198 199 /// Number of columns. 200 /// 201 /// This number is fixed during the lifetime of the writer as it is determined via 202 /// the schema. 203 int num_columns() const; 204 205 /// Number of rows in the yet started RowGroups. 206 /// 207 /// Changes on the addition of a new RowGroup. 208 int64_t num_rows() const; 209 210 /// Number of started RowGroups. 211 int num_row_groups() const; 212 213 /// Configuration passed to the writer, e.g. the used Parquet format version. 214 const std::shared_ptr<WriterProperties>& properties() const; 215 216 /// Returns the file schema descriptor 217 const SchemaDescriptor* schema() const; 218 219 /// Returns a column descriptor in schema 220 const ColumnDescriptor* descr(int i) const; 221 222 /// Returns the file custom metadata 223 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const; 224 225 /// Returns the file metadata, only available after calling Close(). 226 const std::shared_ptr<FileMetaData> metadata() const; 227 228 private: 229 // Holds a pointer to an instance of Contents implementation 230 std::unique_ptr<Contents> contents_; 231 std::shared_ptr<FileMetaData> file_metadata_; 232 }; 233 234 } // namespace parquet 235