1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <cstdint>
21 #include <memory>
22 #include <string>
23 #include <vector>
24 
25 #include "arrow/chunked_array.h"  // IWYU pragma: keep
26 #include "arrow/record_batch.h"
27 #include "arrow/status.h"
28 #include "arrow/type.h"
29 #include "arrow/type_fwd.h"
30 #include "arrow/util/macros.h"
31 #include "arrow/util/visibility.h"
32 
33 namespace arrow {
34 
35 class Array;
36 class ChunkedArray;
37 class KeyValueMetadata;
38 class MemoryPool;
39 
40 /// \class Table
41 /// \brief Logical table as sequence of chunked arrays
42 class ARROW_EXPORT Table {
43  public:
44   virtual ~Table() = default;
45 
46   /// \brief Construct a Table from schema and columns
47   ///
48   /// If columns is zero-length, the table's number of rows is zero
49   ///
50   /// \param[in] schema The table schema (column types)
51   /// \param[in] columns The table's columns as chunked arrays
52   /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
53   static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
54                                      std::vector<std::shared_ptr<ChunkedArray>> columns,
55                                      int64_t num_rows = -1);
56 
57   /// \brief Construct a Table from schema and arrays
58   ///
59   /// \param[in] schema The table schema (column types)
60   /// \param[in] arrays The table's columns as arrays
61   /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
62   static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
63                                      const std::vector<std::shared_ptr<Array>>& arrays,
64                                      int64_t num_rows = -1);
65 
66   /// \brief Construct a Table from a RecordBatchReader.
67   ///
68   /// \param[in] reader the arrow::Schema for each batch
69   static Result<std::shared_ptr<Table>> FromRecordBatchReader(RecordBatchReader* reader);
70 
71   /// \brief Construct a Table from RecordBatches, using schema supplied by the first
72   /// RecordBatch.
73   ///
74   /// \param[in] batches a std::vector of record batches
75   static Result<std::shared_ptr<Table>> FromRecordBatches(
76       const std::vector<std::shared_ptr<RecordBatch>>& batches);
77 
78   /// \brief Construct a Table from RecordBatches, using supplied schema. There may be
79   /// zero record batches
80   ///
81   /// \param[in] schema the arrow::Schema for each batch
82   /// \param[in] batches a std::vector of record batches
83   static Result<std::shared_ptr<Table>> FromRecordBatches(
84       std::shared_ptr<Schema> schema,
85       const std::vector<std::shared_ptr<RecordBatch>>& batches);
86 
87   /// \brief Construct a Table from a chunked StructArray. One column will be produced
88   /// for each field of the StructArray.
89   ///
90   /// \param[in] array a chunked StructArray
91   static Result<std::shared_ptr<Table>> FromChunkedStructArray(
92       const std::shared_ptr<ChunkedArray>& array);
93 
94   /// \brief Return the table schema
schema()95   std::shared_ptr<Schema> schema() const { return schema_; }
96 
97   /// \brief Return a column by index
98   virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;
99 
100   /// \brief Return vector of all columns for table
101   std::vector<std::shared_ptr<ChunkedArray>> columns() const;
102 
103   /// Return a column's field by index
field(int i)104   std::shared_ptr<Field> field(int i) const { return schema_->field(i); }
105 
106   /// \brief Return vector of all fields for table
107   std::vector<std::shared_ptr<Field>> fields() const;
108 
109   /// \brief Construct a zero-copy slice of the table with the
110   /// indicated offset and length
111   ///
112   /// \param[in] offset the index of the first row in the constructed
113   /// slice
114   /// \param[in] length the number of rows of the slice. If there are not enough
115   /// rows in the table, the length will be adjusted accordingly
116   ///
117   /// \return a new object wrapped in std::shared_ptr<Table>
118   virtual std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const = 0;
119 
120   /// \brief Slice from first row at offset until end of the table
Slice(int64_t offset)121   std::shared_ptr<Table> Slice(int64_t offset) const { return Slice(offset, num_rows_); }
122 
123   /// \brief Return a column by name
124   /// \param[in] name field name
125   /// \return an Array or null if no field was found
GetColumnByName(const std::string & name)126   std::shared_ptr<ChunkedArray> GetColumnByName(const std::string& name) const {
127     auto i = schema_->GetFieldIndex(name);
128     return i == -1 ? NULLPTR : column(i);
129   }
130 
131   /// \brief Remove column from the table, producing a new Table
132   virtual Result<std::shared_ptr<Table>> RemoveColumn(int i) const = 0;
133 
134   /// \brief Add column to the table, producing a new Table
135   virtual Result<std::shared_ptr<Table>> AddColumn(
136       int i, std::shared_ptr<Field> field_arg,
137       std::shared_ptr<ChunkedArray> column) const = 0;
138 
139   /// \brief Replace a column in the table, producing a new Table
140   virtual Result<std::shared_ptr<Table>> SetColumn(
141       int i, std::shared_ptr<Field> field_arg,
142       std::shared_ptr<ChunkedArray> column) const = 0;
143 
144   /// \brief Return names of all columns
145   std::vector<std::string> ColumnNames() const;
146 
147   /// \brief Rename columns with provided names
148   Result<std::shared_ptr<Table>> RenameColumns(
149       const std::vector<std::string>& names) const;
150 
151   /// \brief Return new table with specified columns
152   Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const;
153 
154   /// \brief Replace schema key-value metadata with new metadata (EXPERIMENTAL)
155   /// \since 0.5.0
156   ///
157   /// \param[in] metadata new KeyValueMetadata
158   /// \return new Table
159   virtual std::shared_ptr<Table> ReplaceSchemaMetadata(
160       const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
161 
162   /// \brief Flatten the table, producing a new Table.  Any column with a
163   /// struct type will be flattened into multiple columns
164   ///
165   /// \param[in] pool The pool for buffer allocations, if any
166   virtual Result<std::shared_ptr<Table>> Flatten(
167       MemoryPool* pool = default_memory_pool()) const = 0;
168 
169   /// \return PrettyPrint representation suitable for debugging
170   std::string ToString() const;
171 
172   /// \brief Perform cheap validation checks to determine obvious inconsistencies
173   /// within the table's schema and internal data.
174   ///
175   /// This is O(k*m) where k is the total number of field descendents,
176   /// and m is the number of chunks.
177   ///
178   /// \return Status
179   virtual Status Validate() const = 0;
180 
181   /// \brief Perform extensive validation checks to determine inconsistencies
182   /// within the table's schema and internal data.
183   ///
184   /// This is O(k*n) where k is the total number of field descendents,
185   /// and n is the number of rows.
186   ///
187   /// \return Status
188   virtual Status ValidateFull() const = 0;
189 
190   /// \brief Return the number of columns in the table
num_columns()191   int num_columns() const { return schema_->num_fields(); }
192 
193   /// \brief Return the number of rows (equal to each column's logical length)
num_rows()194   int64_t num_rows() const { return num_rows_; }
195 
196   /// \brief Determine if tables are equal
197   ///
198   /// Two tables can be equal only if they have equal schemas.
199   /// However, they may be equal even if they have different chunkings.
200   bool Equals(const Table& other, bool check_metadata = false) const;
201 
202   /// \brief Make a new table by combining the chunks this table has.
203   ///
204   /// All the underlying chunks in the ChunkedArray of each column are
205   /// concatenated into zero or one chunk.
206   ///
207   /// \param[in] pool The pool for buffer allocations
208   Result<std::shared_ptr<Table>> CombineChunks(
209       MemoryPool* pool = default_memory_pool()) const;
210 
211  protected:
212   Table();
213 
214   std::shared_ptr<Schema> schema_;
215   int64_t num_rows_;
216 
217  private:
218   ARROW_DISALLOW_COPY_AND_ASSIGN(Table);
219 };
220 
221 /// \brief Compute a stream of record batches from a (possibly chunked) Table
222 ///
223 /// The conversion is zero-copy: each record batch is a view over a slice
224 /// of the table's columns.
225 class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
226  public:
227   /// \brief Construct a TableBatchReader for the given table
228   explicit TableBatchReader(const Table& table);
229 
230   std::shared_ptr<Schema> schema() const override;
231 
232   Status ReadNext(std::shared_ptr<RecordBatch>* out) override;
233 
234   /// \brief Set the desired maximum chunk size of record batches
235   ///
236   /// The actual chunk size of each record batch may be smaller, depending
237   /// on actual chunking characteristics of each table column.
238   void set_chunksize(int64_t chunksize);
239 
240  private:
241   const Table& table_;
242   std::vector<ChunkedArray*> column_data_;
243   std::vector<int> chunk_numbers_;
244   std::vector<int64_t> chunk_offsets_;
245   int64_t absolute_row_position_;
246   int64_t max_chunksize_;
247 };
248 
249 /// \defgroup concat-tables ConcatenateTables function.
250 ///
251 /// ConcatenateTables function.
252 /// @{
253 
254 /// \brief Controls the behavior of ConcatenateTables().
255 struct ARROW_EXPORT ConcatenateTablesOptions {
256   /// If true, the schemas of the tables will be first unified with fields of
257   /// the same name being merged, according to `field_merge_options`, then each
258   /// table will be promoted to the unified schema before being concatenated.
259   /// Otherwise, all tables should have the same schema. Each column in the output table
260   /// is the result of concatenating the corresponding columns in all input tables.
261   bool unify_schemas = false;
262 
263   Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();
264 
DefaultsConcatenateTablesOptions265   static ConcatenateTablesOptions Defaults() { return {}; }
266 };
267 
268 /// \brief Construct table from multiple input tables.
269 ARROW_EXPORT
270 Result<std::shared_ptr<Table>> ConcatenateTables(
271     const std::vector<std::shared_ptr<Table>>& tables,
272     ConcatenateTablesOptions options = ConcatenateTablesOptions::Defaults(),
273     MemoryPool* memory_pool = default_memory_pool());
274 
275 /// \brief Promotes a table to conform to the given schema.
276 ///
277 /// If a field in the schema does not have a corresponding column in the
278 /// table, a column of nulls will be added to the resulting table.
279 /// If the corresponding column is of type Null, it will be promoted to
280 /// the type specified by schema, with null values filled.
281 /// Returns an error:
282 /// - if the corresponding column's type is not compatible with the
283 ///   schema.
284 /// - if there is a column in the table that does not exist in the schema.
285 ///
286 /// \param[in] table the input Table
287 /// \param[in] schema the target schema to promote to
288 /// \param[in] pool The memory pool to be used if null-filled arrays need to
289 /// be created.
290 ARROW_EXPORT
291 Result<std::shared_ptr<Table>> PromoteTableToSchema(
292     const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema,
293     MemoryPool* pool = default_memory_pool());
294 
295 }  // namespace arrow
296