# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

require "arrow/raw-table-converter"

module Arrow
  class Table
    include ColumnContainable
    include GenericFilterable
    include GenericTakeable
    include RecordContainable

    class << self
      # Loads a table by delegating to `TableLoader.load`.
      #
      # @param path The source to load from. It is passed to
      #   `TableLoader.load` as is.
      # @param options [Hash] The options passed to `TableLoader.load`
      #   as is.
      # @return The value returned by `TableLoader.load`.
      def load(path, options={})
        TableLoader.load(path, options)
      end
    end

    alias_method :initialize_raw, :initialize
    private :initialize_raw

    # Creates a new {Arrow::Table}.
    #
    # @overload initialize(columns)
    #
    #   @param columns [::Array<Arrow::Column>] The columns of the table.
    #
    #   @example Create a table from columns
    #     count_field = Arrow::Field.new("count", :uint32)
    #     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
    #     count_column = Arrow::Column.new(count_field, count_array)
    #     visible_field = Arrow::Field.new("visible", :boolean)
    #     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
    #     visible_column = Arrow::Column.new(visible_field, visible_array)
    #     Arrow::Table.new([count_column, visible_column])
    #
    # @overload initialize(raw_table)
    #
    #   @param raw_table [Hash<String, Arrow::Array>]
    #     The pairs of column name and values of the table. Column values
    #     are `Arrow::Array`.
    #
    #   @example Create a table from column name and values
    #     Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]),
    #                      "visible" => Arrow::BooleanArray.new([true, nil, nil, false]))
    #
    # @overload initialize(raw_table)
    #
    #   @param raw_table [Hash<String, Arrow::ChunkedArray>]
    #     The pairs of column name and values of the table. Column values
    #     are `Arrow::ChunkedArray`.
    #
    #   @example Create a table from column name and values
    #     count_chunks = [
    #       Arrow::UInt32Array.new([0, 2]),
    #       Arrow::UInt32Array.new([nil, 4]),
    #     ]
    #     visible_chunks = [
    #       Arrow::BooleanArray.new([true]),
    #       Arrow::BooleanArray.new([nil, nil, false]),
    #     ]
    #     Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
    #                      "visible" => Arrow::ChunkedArray.new(visible_chunks))
    #
    # @overload initialize(raw_table)
    #
    #   @param raw_table [Hash<String, ::Array>]
    #     The pairs of column name and values of the table. Column values
    #     are `Array`.
    #
    #   @example Create a table from column name and values
    #     Arrow::Table.new("count" => [0, 2, nil, 4],
    #                      "visible" => [true, nil, nil, false])
    #
    # @overload initialize(schema, columns)
    #
    #   @param schema [Arrow::Schema] The schema of the table.
    #     You can also specify schema as primitive Ruby objects.
    #     See {Arrow::Schema#initialize} for details.
    #
    #   @param columns [::Array<Arrow::Column>] The data of the table.
    #
    #   @example Create a table from schema and columns
    #     count_field = Arrow::Field.new("count", :uint32)
    #     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
    #     count_column = Arrow::Column.new(count_field, count_array)
    #     visible_field = Arrow::Field.new("visible", :boolean)
    #     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
    #     visible_column = Arrow::Column.new(visible_field, visible_array)
    #     Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
    #                      [count_column, visible_column])
    #
    # @overload initialize(schema, arrays)
    #
    #   @param schema [Arrow::Schema] The schema of the table.
    #     You can also specify schema as primitive Ruby objects.
    #     See {Arrow::Schema#initialize} for details.
    #
    #   @param arrays [::Array<Arrow::Array>] The data of the table.
    #
    #   @example Create a table from schema and arrays
    #     count_field = Arrow::Field.new("count", :uint32)
    #     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
    #     visible_field = Arrow::Field.new("visible", :boolean)
    #     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
    #     Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
    #                      [count_array, visible_array])
    #
    # @overload initialize(schema, record_batches)
    #
    #   @param schema [Arrow::Schema] The schema of the table.
    #     You can also specify schema as primitive Ruby objects.
    #     See {Arrow::Schema#initialize} for details.
    #
    #   @param record_batches [::Array<Arrow::RecordBatch>] The data of
    #     the table.
    #
    #   @example Create a table from schema and record batches
    #     count_field = Arrow::Field.new("count", :uint32)
    #     visible_field = Arrow::Field.new("visible", :boolean)
    #     schema = Arrow::Schema.new([count_field, visible_field])
    #     record_batches = [
    #       Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]),
    #       Arrow::RecordBatch.new(schema, [[4, false]]),
    #     ]
    #     Arrow::Table.new(schema, record_batches)
    #
    # @overload initialize(schema, raw_records)
    #
    #   @param schema [Arrow::Schema] The schema of the table.
    #     You can also specify schema as primitive Ruby objects.
    #     See {Arrow::Schema#initialize} for details.
    #
    #   @param raw_records [::Array<::Array>] The data of the table as
    #     primitive Ruby objects.
    #
    #   @example Create a table from schema and raw records
    #     schema = {
    #       count: :uint32,
    #       visible: :boolean,
    #     }
    #     raw_records = [
    #       [0, true],
    #       [2, nil],
    #       [nil, nil],
    #       [4, false],
    #     ]
    #     Arrow::Table.new(schema, raw_records)
    #
    # @raise [ArgumentError] If the number of arguments is neither 1 nor 2.
    def initialize(*args)
      n_args = args.size
      case n_args
      when 1
        raw_table_converter = RawTableConverter.new(args[0])
        schema = raw_table_converter.schema
        values = raw_table_converter.values
      when 2
        schema = args[0]
        schema = Schema.new(schema) unless schema.is_a?(Schema)
        values = args[1]
        # The kind of data is detected only from the first element.
        case values[0]
        when ::Array
          # Raw records: wrap all of them in a single record batch.
          values = [RecordBatch.new(schema, values)]
        when Column
          values = values.collect(&:data)
        end
      else
        message = "wrong number of arguments (given #{n_args}, expected 1..2)"
        raise ArgumentError, message
      end
      initialize_raw(schema, values)
    end

    # Iterates over the record batches read from a `TableBatchReader`
    # created for this table. Iteration stops when the reader's
    # `read_next` returns `nil` (or `false`).
    #
    # @yieldparam record_batch The record batch returned by the reader.
    # @return [Enumerator] If no block is given.
    def each_record_batch
      return to_enum(__method__) unless block_given?

      reader = TableBatchReader.new(self)
      while (record_batch = reader.read_next)
        yield(record_batch)
      end
    end

    alias_method :size, :n_rows
    alias_method :length, :n_rows

    alias_method :slice_raw, :slice

    # @overload slice(offset, length)
    #
    #   @param offset [Integer] The offset of sub Arrow::Table.
    #     A negative offset counts from the end of the table.
    #   @param length [Integer] The length of sub Arrow::Table.
    #   @return [Arrow::Table]
    #     The sub `Arrow::Table` that covers only from
    #     `offset` to `offset + length` range.
    #
    # @overload slice(index)
    #
    #   @param index [Integer] The index in this table.
    #     A negative index counts from the end of the table.
    #   @return [Arrow::Record, nil]
    #     The `Arrow::Record` corresponding to index of
    #     the table. `nil` if the index is out of range.
    #
    # @overload slice(booleans)
    #
    #   @param booleans [::Array<Boolean>]
    #     The values indicating the target rows.
    #   @return [Arrow::Table]
    #     The sub `Arrow::Table` that covers only rows of indices
    #     the values of `booleans` is true.
    #
    # @overload slice(boolean_array)
    #
    #   @param boolean_array [Arrow::BooleanArray, Arrow::ChunkedArray]
    #     The values indicating the target rows.
    #   @return [Arrow::Table]
    #     The sub `Arrow::Table` that covers only rows of indices
    #     the values of `boolean_array` is true.
    #
    # @overload slice(range)
    #
    #   @param range [Range] The range indicating the target rows.
    #   @return [Arrow::Table]
    #     The sub `Arrow::Table` that covers only rows of the range of indices.
    #
    # @overload slice(conditions)
    #
    #   @param conditions [Hash] The conditions to select records.
    #     A `Range` value selects by its bounds (a `nil` bound is
    #     ignored); any other value selects by equality. All
    #     conditions are combined with `&`.
    #   @return [Arrow::Table]
    #     The sub `Arrow::Table` that covers only rows matched by condition
    #
    # @overload slice
    #
    #   @yield [slicer] Gives slicer that constructs condition to select records.
    #   @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
    #     build condition.
    #   @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
    #     The condition to select records.
    #   @return [Arrow::Table]
    #     The sub `Arrow::Table` that covers only rows matched by condition
    #     specified by slicer.
    #
    # @raise [ArgumentError] If both arguments and a block are given,
    #   if the number of arguments is neither 1 nor 2, if the start
    #   of a range is out of range, or if a slicer has an
    #   unsupported type.
    def slice(*args)
      slicers = []
      if block_given?
        unless args.empty?
          raise ArgumentError, "must not specify both arguments and block"
        end
        block_slicer = yield(Slicer.new(self))
        case block_slicer
        when ::Array
          slicers.concat(block_slicer)
        else
          slicers << block_slicer
        end
      else
        case args.size
        when 1
          case args[0]
          when Integer
            # A single index returns a record, not a table.
            index = args[0]
            index += n_rows if index < 0
            return nil if index < 0
            return nil if index >= n_rows
            return Record.new(self, index)
          when Hash
            condition_pairs = args[0]
            slicer = Slicer.new(self)
            conditions = []
            condition_pairs.each do |key, value|
              case value
              when Range
                # TODO: Optimize "begin <= key <= end" case by missing "between" kernel
                # https://issues.apache.org/jira/browse/ARROW-9843
                unless value.begin.nil?
                  conditions << (slicer[key] >= value.begin)
                end
                unless value.end.nil?
                  if value.exclude_end?
                    conditions << (slicer[key] < value.end)
                  else
                    conditions << (slicer[key] <= value.end)
                  end
                end
              else
                conditions << (slicer[key] == value)
              end
            end
            slicers << conditions.inject(:&)
          else
            slicers << args[0]
          end
        when 2
          offset, length = args
          slicers << (offset...(offset + length))
        else
          message = "wrong number of arguments " +
            "(given #{args.size}, expected 1..2)"
          raise ArgumentError, message
        end
      end

      filter_options = Arrow::FilterOptions.new
      filter_options.null_selection_behavior = :emit_null
      sliced_tables = []
      slicers.each do |slicer|
        # Slicer conditions are evaluated to concrete values first.
        slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
        case slicer
        when Integer
          # An index given as a slicer selects from it to the last row.
          slicer += n_rows if slicer < 0
          sliced_tables << slice_by_range(slicer, n_rows - 1)
        when Range
          original_from = from = slicer.first
          to = slicer.last
          # slice_by_range takes an inclusive end.
          to -= 1 if slicer.exclude_end?
          from += n_rows if from < 0
          if from < 0 || from >= n_rows
            message =
              "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
              "#{original_from}"
            raise ArgumentError, message
          end
          to += n_rows if to < 0
          sliced_tables << slice_by_range(from, to)
        when ::Array, BooleanArray, ChunkedArray
          sliced_tables << filter(slicer, filter_options)
        else
          message = "slicer must be Integer, Range, (from, to), " +
            "Arrow::ChunkedArray of Arrow::BooleanArray, " +
            "Arrow::BooleanArray or Arrow::Slicer::Condition: #{slicer.inspect}"
          raise ArgumentError, message
        end
      end
      if sliced_tables.size > 1
        sliced_tables[0].concatenate(sliced_tables[1..-1])
      else
        sliced_tables[0]
      end
    end

    # Creates a new table by merging `other` into this table.
    #
    # Columns of this table keep their position. A column that has
    # the same name as a merged column is replaced in place. Merged
    # columns that don't exist in this table are appended in the
    # order they are given.
    #
    # @overload merge(other)
    #
    #   @param other [Hash] The pairs of column name (converted by
    #     `to_s`) and column data. The data must be `Arrow::Array`,
    #     `Arrow::ChunkedArray` or `Arrow::Column`. If the data is
    #     `nil` or `false`, the column of the name is removed.
    #
    # @overload merge(other)
    #
    #   @param other [Arrow::Table] The table whose columns are merged.
    #
    # @return [Arrow::Table] The merged table.
    #
    # @raise [ArgumentError] If `other` is neither `Hash` nor
    #   `Arrow::Table`, or if column data has an unsupported type.
    def merge(other)
      added_columns = {}
      removed_columns = {}

      case other
      when Hash
        other.each do |name, value|
          name = name.to_s
          if value
            added_columns[name] = ensure_raw_column(name, value)
          else
            removed_columns[name] = true
          end
        end
      when Table
        other.columns.each do |column|
          name = column.name
          added_columns[name] = ensure_raw_column(name, column)
        end
      else
        message = "merge target must be Hash or Arrow::Table: " +
          "<#{other.inspect}>: #{inspect}"
        raise ArgumentError, message
      end

      new_columns = []
      columns.each do |column|
        column_name = column.name
        # Consume the replacement so that only brand-new columns
        # remain in added_columns after this loop.
        new_column = added_columns.delete(column_name)
        if new_column
          new_columns << new_column
          next
        end
        next if removed_columns.key?(column_name)
        new_columns << ensure_raw_column(column_name, column)
      end
      new_columns.concat(added_columns.values)
      new_fields = []
      new_arrays = []
      new_columns.each do |new_column|
        new_fields << new_column[:field]
        new_arrays << new_column[:data]
      end
      self.class.new(new_fields, new_arrays)
    end

    alias_method :remove_column_raw, :remove_column

    # Removes a column specified by name or index.
    #
    # @param name_or_index [String, Symbol, Integer] The name or the
    #   index of the column to be removed. A negative index counts
    #   from the last column.
    # @return The value returned by the original (aliased)
    #   `remove_column` for the resolved index.
    # @raise [KeyError] If there is no column of the given name.
    # @raise [IndexError] If the given index is out of range.
    def remove_column(name_or_index)
      case name_or_index
      when String, Symbol
        name = name_or_index.to_s
        index = columns.index {|column| column.name == name}
        if index.nil?
          message = "unknown column: #{name_or_index.inspect}: #{inspect}"
          raise KeyError, message
        end
      else
        index = name_or_index
        index += n_columns if index < 0
        if index < 0 || index >= n_columns
          message = "out of index (0..#{n_columns - 1}): " +
            "#{name_or_index.inspect}: #{inspect}"
          raise IndexError, message
        end
      end
      remove_column_raw(index)
    end

    # Experimental
    #
    # @param keys The keys passed to `Group.new` as an array.
    # @return [Group] The group created for this table and `keys`.
    def group(*keys)
      Group.new(self, keys)
    end

    # Experimental
    #
    # @param size The size passed to `RollingWindow.new`.
    # @return [RollingWindow] The rolling window created for this
    #   table and `size`.
    def window(size: nil)
      RollingWindow.new(self, size)
    end

    # Saves this table by `TableSaver`.
    #
    # @param output The destination passed to `TableSaver.new` as is.
    # @param options [Hash] The options passed to `TableSaver.new` as is.
    # @return The value returned by `TableSaver#save`.
    def save(output, options={})
      saver = TableSaver.new(self, output, options)
      saver.save
    end

    # Creates a new table that has the same schema and whose data
    # are the results of calling `pack` on the data of each column.
    #
    # @return [Arrow::Table] The new table.
    def pack
      packed_arrays = columns.collect do |column|
        column.data.pack
      end
      self.class.new(schema, packed_arrays)
    end

    alias_method :to_s_raw, :to_s

    # Formats this table as a string.
    #
    # @param options [Hash] The options. They are also passed to the
    #   formatter as is.
    # @option options [:column, :list, :table, nil] :format
    #   The output format. `:column` uses the original (aliased)
    #   `to_s`, `:list` uses `TableListFormatter` and `:table` or
    #   `nil` uses `TableTableFormatter`.
    # @return The formatted table.
    # @raise [ArgumentError] If `:format` is unknown.
    def to_s(options={})
      format = options[:format]
      case format
      when :column
        return to_s_raw
      when :list
        formatter_class = TableListFormatter
      when :table, nil
        formatter_class = TableTableFormatter
      else
        message = ":format must be :column, :list, :table or nil"
        raise ArgumentError, "#{message}: <#{format.inspect}>"
      end
      formatter = formatter_class.new(self, options)
      formatter.format
    end

    alias_method :inspect_raw, :inspect

    # @return [String] The inherited `inspect` result followed by a
    #   new line and the `to_s` result.
    def inspect
      "#{super}\n#{to_s}"
    end

    # Reports column names as available methods.
    # See also {#method_missing}.
    def respond_to_missing?(name, include_private)
      return true if find_column(name)
      super
    end

    # Returns the column found by `find_column(name)` when called
    # without arguments. Otherwise, falls back to the default
    # behavior.
    def method_missing(name, *args, &block)
      if args.empty?
        column = find_column(name)
        return column if column
      end
      super
    end

    private
    # Slices by the inclusive range `from..to` with the original
    # (aliased) `slice` that takes offset and length.
    def slice_by_range(from, to)
      slice_raw(from, to - from + 1)
    end

    # Normalizes column data to a pair of field and chunked array.
    #
    # @param name [String] The column name. It's used to create a
    #   field when `data` isn't `Arrow::Column`.
    # @param data [Arrow::Array, Arrow::ChunkedArray, Arrow::Column]
    #   The column data. `Array` here is `Arrow::Array`, not `::Array`.
    # @return [Hash] The `{field: ..., data: ...}` pair.
    # @raise [ArgumentError] If `data` has an unsupported type.
    def ensure_raw_column(name, data)
      case data
      when Array
        {
          field: Field.new(name, data.value_data_type),
          data: ChunkedArray.new([data]),
        }
      when ChunkedArray
        {
          field: Field.new(name, data.value_data_type),
          data: data,
        }
      when Column
        column = data
        data = column.data
        data = ChunkedArray.new([data]) unless data.is_a?(ChunkedArray)
        {
          field: column.field,
          data: data,
        }
      else
        message = "column must be Arrow::Array or Arrow::Column: " +
          "<#{name}>: <#{data.inspect}>: #{inspect}"
        raise ArgumentError, message
      end
    end
  end
end