1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <cstdint>
21 #include <memory>
22 #include <utility>
23 
24 #include "parquet/metadata.h"
25 #include "parquet/platform.h"
26 #include "parquet/properties.h"
27 #include "parquet/schema.h"
28 
29 namespace parquet {
30 
31 class ColumnWriter;
32 
// Four-byte magic markers, stored as raw bytes.
// kParquetMagic spells the ASCII text "PAR1"; kParquetEMagic spells "PARE".
// NOTE(review): "PARE" is presumably the marker used for files with an encrypted
// footer -- confirm against the reader/writer implementation.
// FIXME: copied from reader-internal.cc
static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
36 
/// Writer handle for a single row group.
///
/// Holds a Contents implementation through a std::unique_ptr (see contents_).
/// The member functions are only declared in this header; their definitions
/// live elsewhere.
class PARQUET_EXPORT RowGroupWriter {
 public:
  // Abstract interface 'Contents', used to aid dependency injection and to more
  // easily create test fixtures.
  // An implementation of the Contents class is defined in the .cc file
  struct Contents {
    virtual ~Contents() = default;
    virtual int num_columns() const = 0;
    virtual int64_t num_rows() const = 0;

    // to be used only with ParquetFileWriter::AppendRowGroup
    virtual ColumnWriter* NextColumn() = 0;
    // to be used only with ParquetFileWriter::AppendBufferedRowGroup
    virtual ColumnWriter* column(int i) = 0;

    virtual int current_column() const = 0;
    virtual void Close() = 0;

    // total bytes written by the page writer
    virtual int64_t total_bytes_written() const = 0;
    // total bytes still compressed but not written
    virtual int64_t total_compressed_bytes() const = 0;
  };

  /// Take ownership of the given Contents implementation.
  explicit RowGroupWriter(std::unique_ptr<Contents> contents);

  /// Construct a ColumnWriter for the indicated row group-relative column.
  ///
  /// To be used only with ParquetFileWriter::AppendRowGroup
  /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
  /// valid until the next call to NextColumn or Close. As the contents are
  /// directly written to the sink, once a new column is started, the contents
  /// of the previous one cannot be modified anymore.
  ColumnWriter* NextColumn();
  /// Index of currently written column. Equal to -1 if NextColumn()
  /// has not been called yet.
  // NOTE(review): declared non-const here although Contents::current_column()
  // is const; the signature must stay in sync with the out-of-line definition.
  int current_column();
  /// Finish this row group. After Close, ColumnWriters obtained from this
  /// object are no longer valid (see NextColumn() and column()).
  void Close();

  /// Number of columns in this row group.
  int num_columns() const;

  /// Construct a ColumnWriter for the indicated row group column.
  ///
  /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
  /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
  /// valid until Close. The contents are buffered in memory and written to sink
  /// on Close
  ColumnWriter* column(int i);

  /// Number of rows that shall be written as part of this RowGroup.
  int64_t num_rows() const;

  /// Byte counters; the matching pure virtuals in Contents describe them as
  /// "total bytes written by the page writer" and "total bytes still compressed
  /// but not written" respectively.
  int64_t total_bytes_written() const;
  int64_t total_compressed_bytes() const;

 private:
  // Holds a pointer to an instance of Contents implementation
  std::unique_ptr<Contents> contents_;
};
98 
/// Write the given file metadata to an output stream.
///
/// \param file_metadata the metadata object, taken by const reference
/// \param sink non-owning pointer to the destination stream
// NOTE(review): the exact bytes emitted (and how this differs from
// WriteMetaDataFile) are defined in the implementation file, not here.
PARQUET_EXPORT
void WriteFileMetaData(const FileMetaData& file_metadata,
                       ::arrow::io::OutputStream* sink);
102 
/// Write a metadata file for the given file metadata to an output stream.
///
/// \param file_metadata the metadata object, taken by const reference
/// \param sink non-owning pointer to the destination stream
// NOTE(review): presumably produces a standalone metadata-only file, as opposed
// to WriteFileMetaData -- confirm against the implementation.
PARQUET_EXPORT
void WriteMetaDataFile(const FileMetaData& file_metadata,
                       ::arrow::io::OutputStream* sink);
106 
/// Write file metadata to an output stream using the supplied encryptor.
///
/// \param file_metadata the metadata object, taken by const reference
/// \param sink non-owning pointer to the destination stream
/// \param encryptor shared encryptor handle
/// \param encrypt_footer flag selecting footer encryption
// NOTE(review): a declaration with the same name follows immediately below; it
// spells the sink type as ::arrow::io::OutputStream* and supplies default
// arguments. If ArrowOutputStream is an alias of ::arrow::io::OutputStream the
// two are redeclarations of one function and this one is redundant; otherwise
// they are distinct overloads -- confirm against parquet/platform.h.
PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
                                ArrowOutputStream* sink,
                                const std::shared_ptr<Encryptor>& encryptor,
                                bool encrypt_footer);
112 
/// Write file metadata to an output stream, optionally using an encryptor.
///
/// \param file_metadata the metadata object, taken by const reference
/// \param sink non-owning pointer to the destination stream
/// \param encryptor shared encryptor handle; defaults to NULLPTR
/// \param encrypt_footer flag selecting footer encryption; defaults to false
// NOTE(review): the declaration directly above has the same name and parameter
// shape but uses ArrowOutputStream* and no defaults. Whether this is a
// redeclaration adding default arguments or a separate overload depends on how
// ArrowOutputStream is defined -- confirm against parquet/platform.h.
PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
                                ::arrow::io::OutputStream* sink,
                                const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
                                bool encrypt_footer = false);
/// Write the given file crypto metadata to an output stream.
///
/// \param crypto_metadata the crypto metadata object, taken by const reference
/// \param sink non-owning pointer to the destination stream
PARQUET_EXPORT
void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
                             ::arrow::io::OutputStream* sink);
121 
122 class PARQUET_EXPORT ParquetFileWriter {
123  public:
124   // Forward declare a virtual class 'Contents' to aid dependency injection and more
125   // easily create test fixtures
126   // An implementation of the Contents class is defined in the .cc file
127   struct Contents {
ContentsContents128     Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
129              std::shared_ptr<const KeyValueMetadata> key_value_metadata)
130         : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
131       schema_.Init(std::move(schema));
132     }
~ContentsContents133     virtual ~Contents() {}
134     // Perform any cleanup associated with the file contents
135     virtual void Close() = 0;
136 
137     /// \note Deprecated since 1.3.0
138     RowGroupWriter* AppendRowGroup(int64_t num_rows);
139 
140     virtual RowGroupWriter* AppendRowGroup() = 0;
141     virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
142 
143     virtual int64_t num_rows() const = 0;
144     virtual int num_columns() const = 0;
145     virtual int num_row_groups() const = 0;
146 
147     virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
148 
key_value_metadataContents149     const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
150       return key_value_metadata_;
151     }
152 
153     // Return const-pointer to make it clear that this object is not to be copied
schemaContents154     const SchemaDescriptor* schema() const { return &schema_; }
155 
156     SchemaDescriptor schema_;
157 
158     /// This should be the only place this is stored. Everything else is a const reference
159     std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
160 
metadataContents161     const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
162     std::shared_ptr<FileMetaData> file_metadata_;
163   };
164 
165   ParquetFileWriter();
166   ~ParquetFileWriter();
167 
168   static std::unique_ptr<ParquetFileWriter> Open(
169       std::shared_ptr<::arrow::io::OutputStream> sink,
170       std::shared_ptr<schema::GroupNode> schema,
171       std::shared_ptr<WriterProperties> properties = default_writer_properties(),
172       std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
173 
174   void Open(std::unique_ptr<Contents> contents);
175   void Close();
176 
177   // Construct a RowGroupWriter for the indicated number of rows.
178   //
179   // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
180   // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
181   // @param num_rows The number of rows that are stored in the new RowGroup
182   //
183   // \deprecated Since 1.3.0
184   RowGroupWriter* AppendRowGroup(int64_t num_rows);
185 
186   /// Construct a RowGroupWriter with an arbitrary number of rows.
187   ///
188   /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
189   /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
190   RowGroupWriter* AppendRowGroup();
191 
192   /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
193   /// Use this if you want to write a RowGroup based on a certain size
194   ///
195   /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
196   /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
197   RowGroupWriter* AppendBufferedRowGroup();
198 
199   /// Number of columns.
200   ///
201   /// This number is fixed during the lifetime of the writer as it is determined via
202   /// the schema.
203   int num_columns() const;
204 
205   /// Number of rows in the yet started RowGroups.
206   ///
207   /// Changes on the addition of a new RowGroup.
208   int64_t num_rows() const;
209 
210   /// Number of started RowGroups.
211   int num_row_groups() const;
212 
213   /// Configuration passed to the writer, e.g. the used Parquet format version.
214   const std::shared_ptr<WriterProperties>& properties() const;
215 
216   /// Returns the file schema descriptor
217   const SchemaDescriptor* schema() const;
218 
219   /// Returns a column descriptor in schema
220   const ColumnDescriptor* descr(int i) const;
221 
222   /// Returns the file custom metadata
223   const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
224 
225   /// Returns the file metadata, only available after calling Close().
226   const std::shared_ptr<FileMetaData> metadata() const;
227 
228  private:
229   // Holds a pointer to an instance of Contents implementation
230   std::unique_ptr<Contents> contents_;
231   std::shared_ptr<FileMetaData> file_metadata_;
232 };
233 
234 }  // namespace parquet
235