1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <cstdint> 21 #include <memory> 22 #include <string> 23 #include <vector> 24 25 #include "arrow/chunked_array.h" // IWYU pragma: keep 26 #include "arrow/record_batch.h" 27 #include "arrow/status.h" 28 #include "arrow/type.h" 29 #include "arrow/type_fwd.h" 30 #include "arrow/util/macros.h" 31 #include "arrow/util/visibility.h" 32 33 namespace arrow { 34 35 class Array; 36 class ChunkedArray; 37 class KeyValueMetadata; 38 class MemoryPool; 39 40 /// \class Table 41 /// \brief Logical table as sequence of chunked arrays 42 class ARROW_EXPORT Table { 43 public: 44 virtual ~Table() = default; 45 46 /// \brief Construct a Table from schema and columns 47 /// 48 /// If columns is zero-length, the table's number of rows is zero 49 /// 50 /// \param[in] schema The table schema (column types) 51 /// \param[in] columns The table's columns as chunked arrays 52 /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns 53 static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema, 54 std::vector<std::shared_ptr<ChunkedArray>> columns, 55 int64_t num_rows = -1); 56 57 /// \brief Construct a Table from schema and arrays 58 /// 59 /// \param[in] schema The table schema (column types) 60 /// \param[in] arrays The table's columns as arrays 61 /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns 62 static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema, 63 const std::vector<std::shared_ptr<Array>>& arrays, 64 int64_t num_rows = -1); 65 66 /// \brief Construct a Table from a RecordBatchReader. 67 /// 68 /// \param[in] reader the arrow::Schema for each batch 69 static Result<std::shared_ptr<Table>> FromRecordBatchReader(RecordBatchReader* reader); 70 71 /// \brief Construct a Table from RecordBatches, using schema supplied by the first 72 /// RecordBatch. 73 /// 74 /// \param[in] batches a std::vector of record batches 75 static Result<std::shared_ptr<Table>> FromRecordBatches( 76 const std::vector<std::shared_ptr<RecordBatch>>& batches); 77 78 /// \brief Construct a Table from RecordBatches, using supplied schema. There may be 79 /// zero record batches 80 /// 81 /// \param[in] schema the arrow::Schema for each batch 82 /// \param[in] batches a std::vector of record batches 83 static Result<std::shared_ptr<Table>> FromRecordBatches( 84 std::shared_ptr<Schema> schema, 85 const std::vector<std::shared_ptr<RecordBatch>>& batches); 86 87 /// \brief Construct a Table from a chunked StructArray. One column will be produced 88 /// for each field of the StructArray. 89 /// 90 /// \param[in] array a chunked StructArray 91 static Result<std::shared_ptr<Table>> FromChunkedStructArray( 92 const std::shared_ptr<ChunkedArray>& array); 93 94 /// \brief Return the table schema schema()95 std::shared_ptr<Schema> schema() const { return schema_; } 96 97 /// \brief Return a column by index 98 virtual std::shared_ptr<ChunkedArray> column(int i) const = 0; 99 100 /// \brief Return vector of all columns for table 101 std::vector<std::shared_ptr<ChunkedArray>> columns() const; 102 103 /// Return a column's field by index field(int i)104 std::shared_ptr<Field> field(int i) const { return schema_->field(i); } 105 106 /// \brief Return vector of all fields for table 107 std::vector<std::shared_ptr<Field>> fields() const; 108 109 /// \brief Construct a zero-copy slice of the table with the 110 /// indicated offset and length 111 /// 112 /// \param[in] offset the index of the first row in the constructed 113 /// slice 114 /// \param[in] length the number of rows of the slice. If there are not enough 115 /// rows in the table, the length will be adjusted accordingly 116 /// 117 /// \return a new object wrapped in std::shared_ptr<Table> 118 virtual std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const = 0; 119 120 /// \brief Slice from first row at offset until end of the table Slice(int64_t offset)121 std::shared_ptr<Table> Slice(int64_t offset) const { return Slice(offset, num_rows_); } 122 123 /// \brief Return a column by name 124 /// \param[in] name field name 125 /// \return an Array or null if no field was found GetColumnByName(const std::string & name)126 std::shared_ptr<ChunkedArray> GetColumnByName(const std::string& name) const { 127 auto i = schema_->GetFieldIndex(name); 128 return i == -1 ? NULLPTR : column(i); 129 } 130 131 /// \brief Remove column from the table, producing a new Table 132 virtual Result<std::shared_ptr<Table>> RemoveColumn(int i) const = 0; 133 134 /// \brief Add column to the table, producing a new Table 135 virtual Result<std::shared_ptr<Table>> AddColumn( 136 int i, std::shared_ptr<Field> field_arg, 137 std::shared_ptr<ChunkedArray> column) const = 0; 138 139 /// \brief Replace a column in the table, producing a new Table 140 virtual Result<std::shared_ptr<Table>> SetColumn( 141 int i, std::shared_ptr<Field> field_arg, 142 std::shared_ptr<ChunkedArray> column) const = 0; 143 144 /// \brief Return names of all columns 145 std::vector<std::string> ColumnNames() const; 146 147 /// \brief Rename columns with provided names 148 Result<std::shared_ptr<Table>> RenameColumns( 149 const std::vector<std::string>& names) const; 150 151 /// \brief Return new table with specified columns 152 Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const; 153 154 /// \brief Replace schema key-value metadata with new metadata (EXPERIMENTAL) 155 /// \since 0.5.0 156 /// 157 /// \param[in] metadata new KeyValueMetadata 158 /// \return new Table 159 virtual std::shared_ptr<Table> ReplaceSchemaMetadata( 160 const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0; 161 162 /// \brief Flatten the table, producing a new Table. Any column with a 163 /// struct type will be flattened into multiple columns 164 /// 165 /// \param[in] pool The pool for buffer allocations, if any 166 virtual Result<std::shared_ptr<Table>> Flatten( 167 MemoryPool* pool = default_memory_pool()) const = 0; 168 169 /// \return PrettyPrint representation suitable for debugging 170 std::string ToString() const; 171 172 /// \brief Perform cheap validation checks to determine obvious inconsistencies 173 /// within the table's schema and internal data. 174 /// 175 /// This is O(k*m) where k is the total number of field descendents, 176 /// and m is the number of chunks. 177 /// 178 /// \return Status 179 virtual Status Validate() const = 0; 180 181 /// \brief Perform extensive validation checks to determine inconsistencies 182 /// within the table's schema and internal data. 183 /// 184 /// This is O(k*n) where k is the total number of field descendents, 185 /// and n is the number of rows. 186 /// 187 /// \return Status 188 virtual Status ValidateFull() const = 0; 189 190 /// \brief Return the number of columns in the table num_columns()191 int num_columns() const { return schema_->num_fields(); } 192 193 /// \brief Return the number of rows (equal to each column's logical length) num_rows()194 int64_t num_rows() const { return num_rows_; } 195 196 /// \brief Determine if tables are equal 197 /// 198 /// Two tables can be equal only if they have equal schemas. 199 /// However, they may be equal even if they have different chunkings. 200 bool Equals(const Table& other, bool check_metadata = false) const; 201 202 /// \brief Make a new table by combining the chunks this table has. 203 /// 204 /// All the underlying chunks in the ChunkedArray of each column are 205 /// concatenated into zero or one chunk. 206 /// 207 /// \param[in] pool The pool for buffer allocations 208 Result<std::shared_ptr<Table>> CombineChunks( 209 MemoryPool* pool = default_memory_pool()) const; 210 211 protected: 212 Table(); 213 214 std::shared_ptr<Schema> schema_; 215 int64_t num_rows_; 216 217 private: 218 ARROW_DISALLOW_COPY_AND_ASSIGN(Table); 219 }; 220 221 /// \brief Compute a stream of record batches from a (possibly chunked) Table 222 /// 223 /// The conversion is zero-copy: each record batch is a view over a slice 224 /// of the table's columns. 225 class ARROW_EXPORT TableBatchReader : public RecordBatchReader { 226 public: 227 /// \brief Construct a TableBatchReader for the given table 228 explicit TableBatchReader(const Table& table); 229 230 std::shared_ptr<Schema> schema() const override; 231 232 Status ReadNext(std::shared_ptr<RecordBatch>* out) override; 233 234 /// \brief Set the desired maximum chunk size of record batches 235 /// 236 /// The actual chunk size of each record batch may be smaller, depending 237 /// on actual chunking characteristics of each table column. 238 void set_chunksize(int64_t chunksize); 239 240 private: 241 const Table& table_; 242 std::vector<ChunkedArray*> column_data_; 243 std::vector<int> chunk_numbers_; 244 std::vector<int64_t> chunk_offsets_; 245 int64_t absolute_row_position_; 246 int64_t max_chunksize_; 247 }; 248 249 /// \defgroup concat-tables ConcatenateTables function. 250 /// 251 /// ConcatenateTables function. 252 /// @{ 253 254 /// \brief Controls the behavior of ConcatenateTables(). 255 struct ARROW_EXPORT ConcatenateTablesOptions { 256 /// If true, the schemas of the tables will be first unified with fields of 257 /// the same name being merged, according to `field_merge_options`, then each 258 /// table will be promoted to the unified schema before being concatenated. 259 /// Otherwise, all tables should have the same schema. Each column in the output table 260 /// is the result of concatenating the corresponding columns in all input tables. 261 bool unify_schemas = false; 262 263 Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults(); 264 DefaultsConcatenateTablesOptions265 static ConcatenateTablesOptions Defaults() { return {}; } 266 }; 267 268 /// \brief Construct table from multiple input tables. 269 ARROW_EXPORT 270 Result<std::shared_ptr<Table>> ConcatenateTables( 271 const std::vector<std::shared_ptr<Table>>& tables, 272 ConcatenateTablesOptions options = ConcatenateTablesOptions::Defaults(), 273 MemoryPool* memory_pool = default_memory_pool()); 274 275 /// \brief Promotes a table to conform to the given schema. 276 /// 277 /// If a field in the schema does not have a corresponding column in the 278 /// table, a column of nulls will be added to the resulting table. 279 /// If the corresponding column is of type Null, it will be promoted to 280 /// the type specified by schema, with null values filled. 281 /// Returns an error: 282 /// - if the corresponding column's type is not compatible with the 283 /// schema. 284 /// - if there is a column in the table that does not exist in the schema. 285 /// 286 /// \param[in] table the input Table 287 /// \param[in] schema the target schema to promote to 288 /// \param[in] pool The memory pool to be used if null-filled arrays need to 289 /// be created. 290 ARROW_EXPORT 291 Result<std::shared_ptr<Table>> PromoteTableToSchema( 292 const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema, 293 MemoryPool* pool = default_memory_pool()); 294 295 } // namespace arrow 296