1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18require "arrow/raw-table-converter"
19
20module Arrow
21  class Table
22    include ColumnContainable
23    include GenericFilterable
24    include GenericTakeable
25    include RecordContainable
26
27    class << self
28      def load(path, options={})
29        TableLoader.load(path, options)
30      end
31    end
32
33    alias_method :initialize_raw, :initialize
34    private :initialize_raw
35
36    # Creates a new {Arrow::Table}.
37    #
38    # @overload initialize(columns)
39    #
40    #   @param columns [::Array<Arrow::Column>] The columns of the table.
41    #
42    #   @example Create a table from columns
43    #     count_field = Arrow::Field.new("count", :uint32)
44    #     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
45    #     count_column = Arrow::Column.new(count_field, count_array)
46    #     visible_field = Arrow::Field.new("visible", :boolean)
47    #     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
48    #     visible_column = Arrow::Column.new(visible_field, visible_array)
49    #     Arrow::Table.new([count_column, visible_column])
50    #
51    # @overload initialize(raw_table)
52    #
53    #   @param raw_table [Hash<String, Arrow::Array>]
54    #     The pairs of column name and values of the table. Column values is
55    #     `Arrow::Array`.
56    #
57    #   @example Create a table from column name and values
58    #     Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]),
59    #                      "visible" => Arrow::BooleanArray.new([true, nil, nil, false]))
60    #
61    # @overload initialize(raw_table)
62    #
63    #   @param raw_table [Hash<String, Arrow::ChunkedArray>]
64    #     The pairs of column name and values of the table. Column values is
65    #     `Arrow::ChunkedArray`.
66    #
67    #   @example Create a table from column name and values
68    #     count_chunks = [
69    #       Arrow::UInt32Array.new([0, 2]),
70    #       Arrow::UInt32Array.new([nil, 4]),
71    #     ]
72    #     visible_chunks = [
73    #       Arrow::BooleanArray.new([true]),
74    #       Arrow::BooleanArray.new([nil, nil, false]),
75    #     ]
76    #     Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
77    #                      "visible" => Arrow::ChunkedArray.new(visible_chunks))
78    #
79    # @overload initialize(raw_table)
80    #
81    #   @param raw_table [Hash<String, ::Array>]
82    #     The pairs of column name and values of the table. Column values is
83    #     `Array`.
84    #
85    #   @example Create a table from column name and values
86    #     Arrow::Table.new("count" => [0, 2, nil, 4],
87    #                      "visible" => [true, nil, nil, false])
88    #
89    # @overload initialize(schema, columns)
90    #
91    #   @param schema [Arrow::Schema] The schema of the table.
92    #     You can also specify schema as primitive Ruby objects.
93    #     See {Arrow::Schema#initialize} for details.
94    #
95    #   @param columns [::Array<Arrow::Column>] The data of the table.
96    #
97    #   @example Create a table from schema and columns
98    #     count_field = Arrow::Field.new("count", :uint32)
99    #     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
100    #     count_column = Arrow::Column.new(count_field, count_array)
101    #     visible_field = Arrow::Field.new("visible", :boolean)
102    #     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
103    #     visible_column = Arrow::Column.new(visible_field, visible_array)
104    #     Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
105    #                      [count_column, visible_column])
106    #
107    # @overload initialize(schema, arrays)
108    #
109    #   @param schema [Arrow::Schema] The schema of the table.
110    #     You can also specify schema as primitive Ruby objects.
111    #     See {Arrow::Schema#initialize} for details.
112    #
113    #   @param arrays [::Array<Arrow::Array>] The data of the table.
114    #
115    #   @example Create a table from schema and arrays
116    #     count_field = Arrow::Field.new("count", :uint32)
117    #     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
118    #     visible_field = Arrow::Field.new("visible", :boolean)
119    #     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
120    #     Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
121    #                      [count_array, visible_array])
122    #
123    # @overload initialize(schema, record_batches)
124    #
125    #   @param schema [Arrow::Schema] The schema of the table.
126    #     You can also specify schema as primitive Ruby objects.
127    #     See {Arrow::Schema#initialize} for details.
128    #
    #   @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
130    #
131    #   @example Create a table from schema and record batches
132    #     count_field = Arrow::Field.new("count", :uint32)
133    #     visible_field = Arrow::Field.new("visible", :boolean)
134    #     schema = Arrow::Schema.new([count_field, visible_field])
135    #     record_batches = [
136    #       Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]),
137    #       Arrow::RecordBatch.new(schema, [[4, false]]),
138    #     ]
139    #     Arrow::Table.new(schema, record_batches)
140    #
141    # @overload initialize(schema, raw_records)
142    #
143    #   @param schema [Arrow::Schema] The schema of the table.
144    #     You can also specify schema as primitive Ruby objects.
145    #     See {Arrow::Schema#initialize} for details.
146    #
    #   @param raw_records [::Array<::Array>] The data of the table as primitive
    #     Ruby objects.
149    #
150    #   @example Create a table from schema and raw records
151    #     schema = {
152    #       count: :uint32,
153    #       visible: :boolean,
154    #     }
155    #     raw_records = [
156    #       [0, true],
157    #       [2, nil],
158    #       [nil, nil],
159    #       [4, false],
160    #     ]
161    #     Arrow::Table.new(schema, raw_records)
162    def initialize(*args)
163      n_args = args.size
164      case n_args
165      when 1
166        raw_table_converter = RawTableConverter.new(args[0])
167        schema = raw_table_converter.schema
168        values = raw_table_converter.values
169      when 2
170        schema = args[0]
171        schema = Schema.new(schema) unless schema.is_a?(Schema)
172        values = args[1]
173        case values[0]
174        when ::Array
175          values = [RecordBatch.new(schema, values)]
176        when Column
177          values = values.collect(&:data)
178        end
179      else
180        message = "wrong number of arguments (given #{n_args}, expected 1..2)"
181        raise ArgumentError, message
182      end
183      initialize_raw(schema, values)
184    end
185
186    def each_record_batch
187      return to_enum(__method__) unless block_given?
188
189      reader = TableBatchReader.new(self)
190      while record_batch = reader.read_next
191        yield(record_batch)
192      end
193    end
194
195    alias_method :size, :n_rows
196    alias_method :length, :n_rows
197
198    alias_method :slice_raw, :slice
199
200    # @overload slice(offset, length)
201    #
202    #   @param offset [Integer] The offset of sub Arrow::Table.
203    #   @param length [Integer] The length of sub Arrow::Table.
204    #   @return [Arrow::Table]
205    #     The sub `Arrow::Table` that covers only from
206    #     `offset` to `offset + length` range.
207    #
208    # @overload slice(index)
209    #
210    #   @param index [Integer] The index in this table.
211    #   @return [Arrow::Record]
212    #     The `Arrow::Record` corresponding to index of
213    #     the table.
214    #
215    # @overload slice(booleans)
216    #
217    #   @param booleans [::Array<Boolean>]
218    #     The values indicating the target rows.
219    #   @return [Arrow::Table]
220    #     The sub `Arrow::Table` that covers only rows of indices
221    #     the values of `booleans` is true.
222    #
223    # @overload slice(boolean_array)
224    #
    #   @param boolean_array [Arrow::BooleanArray]
226    #     The values indicating the target rows.
227    #   @return [Arrow::Table]
228    #     The sub `Arrow::Table` that covers only rows of indices
229    #     the values of `boolean_array` is true.
230    #
231    # @overload slice(range)
232    #
    #   @param range [Range] The range indicating the target rows.
234    #   @return [Arrow::Table]
235    #     The sub `Arrow::Table` that covers only rows of the range of indices.
236    #
237    # @overload slice(conditions)
238    #
239    #   @param conditions [Hash] The conditions to select records.
240    #   @return [Arrow::Table]
241    #     The sub `Arrow::Table` that covers only rows matched by condition
242    #
243    # @overload slice
244    #
245    #   @yield [slicer] Gives slicer that constructs condition to select records.
246    #   @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
247    #     build condition.
248    #   @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
249    #     The condition to select records.
250    #   @return [Arrow::Table]
251    #     The sub `Arrow::Table` that covers only rows matched by condition
252    #     specified by slicer.
253    def slice(*args)
254      slicers = []
255      if block_given?
256        unless args.empty?
257          raise ArgumentError, "must not specify both arguments and block"
258        end
259        block_slicer = yield(Slicer.new(self))
260        case block_slicer
261        when ::Array
262          slicers.concat(block_slicer)
263        else
264          slicers << block_slicer
265        end
266      else
267        expected_n_args = nil
268        case args.size
269        when 1
270          case args[0]
271          when Integer
272            index = args[0]
273            index += n_rows if index < 0
274            return nil if index < 0
275            return nil if index >= n_rows
276            return Record.new(self, index)
277          when Hash
278            condition_pairs = args[0]
279            slicer = Slicer.new(self)
280            conditions = []
281            condition_pairs.each do |key, value|
282              case value
283              when Range
284                # TODO: Optimize "begin <= key <= end" case by missing "between" kernel
285                # https://issues.apache.org/jira/browse/ARROW-9843
286                unless value.begin.nil?
287                  conditions << (slicer[key] >= value.begin)
288                end
289                unless value.end.nil?
290                  if value.exclude_end?
291                    conditions << (slicer[key] < value.end)
292                  else
293                    conditions << (slicer[key] <= value.end)
294                  end
295                end
296              else
297                conditions << (slicer[key] == value)
298              end
299            end
300            slicers << conditions.inject(:&)
301          else
302            slicers << args[0]
303          end
304        when 2
305          offset, length = args
306          slicers << (offset...(offset + length))
307        else
308          expected_n_args = "1..2"
309        end
310        if expected_n_args
311          message = "wrong number of arguments " +
312            "(given #{args.size}, expected #{expected_n_args})"
313          raise ArgumentError, message
314        end
315      end
316
317      filter_options = Arrow::FilterOptions.new
318      filter_options.null_selection_behavior = :emit_null
319      sliced_tables = []
320      slicers.each do |slicer|
321        slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
322        case slicer
323        when Integer
324          slicer += n_rows if slicer < 0
325          sliced_tables << slice_by_range(slicer, n_rows - 1)
326        when Range
327          original_from = from = slicer.first
328          to = slicer.last
329          to -= 1 if slicer.exclude_end?
330          from += n_rows if from < 0
331          if from < 0 or from >= n_rows
332            message =
333              "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
334              "#{original_from}"
335            raise ArgumentError, message
336          end
337          to += n_rows if to < 0
338          sliced_tables << slice_by_range(from, to)
339        when ::Array, BooleanArray, ChunkedArray
340          sliced_tables << filter(slicer, filter_options)
341        else
342          message = "slicer must be Integer, Range, (from, to), " +
343            "Arrow::ChunkedArray of Arrow::BooleanArray, " +
344            "Arrow::BooleanArray or Arrow::Slicer::Condition: #{slicer.inspect}"
345          raise ArgumentError, message
346        end
347      end
348      if sliced_tables.size > 1
349        sliced_tables[0].concatenate(sliced_tables[1..-1])
350      else
351        sliced_tables[0]
352      end
353    end
354
355    # TODO
356    #
357    # @return [Arrow::Table]
358    def merge(other)
359      added_columns = {}
360      removed_columns = {}
361
362      case other
363      when Hash
364        other.each do |name, value|
365          name = name.to_s
366          if value
367            added_columns[name] = ensure_raw_column(name, value)
368          else
369            removed_columns[name] = true
370          end
371        end
372      when Table
373        added_columns = {}
374        other.columns.each do |column|
375          name = column.name
376          added_columns[name] = ensure_raw_column(name, column)
377        end
378      else
379        message = "merge target must be Hash or Arrow::Table: " +
380          "<#{other.inspect}>: #{inspect}"
381        raise ArgumentError, message
382      end
383
384      new_columns = []
385      columns.each do |column|
386        column_name = column.name
387        new_column = added_columns.delete(column_name)
388        if new_column
389          new_columns << new_column
390          next
391        end
392        next if removed_columns.key?(column_name)
393        new_columns << ensure_raw_column(column_name, column)
394      end
395      added_columns.each do |name, new_column|
396        new_columns << new_column
397      end
398      new_fields = []
399      new_arrays = []
400      new_columns.each do |new_column|
401        new_fields << new_column[:field]
402        new_arrays << new_column[:data]
403      end
404      self.class.new(new_fields, new_arrays)
405    end
406
407    alias_method :remove_column_raw, :remove_column
408    def remove_column(name_or_index)
409      case name_or_index
410      when String, Symbol
411        name = name_or_index.to_s
412        index = columns.index {|column| column.name == name}
413        if index.nil?
414          message = "unknown column: #{name_or_index.inspect}: #{inspect}"
415          raise KeyError.new(message)
416        end
417      else
418        index = name_or_index
419        index += n_columns if index < 0
420        if index < 0 or index >= n_columns
421          message = "out of index (0..#{n_columns - 1}): " +
422            "#{name_or_index.inspect}: #{inspect}"
423          raise IndexError.new(message)
424        end
425      end
426      remove_column_raw(index)
427    end
428
429    # Experimental
430    def group(*keys)
431      Group.new(self, keys)
432    end
433
434    # Experimental
435    def window(size: nil)
436      RollingWindow.new(self, size)
437    end
438
439    def save(output, options={})
440      saver = TableSaver.new(self, output, options)
441      saver.save
442    end
443
444    def pack
445      packed_arrays = columns.collect do |column|
446        column.data.pack
447      end
448      self.class.new(schema, packed_arrays)
449    end
450
451    alias_method :to_s_raw, :to_s
452    def to_s(options={})
453      format = options[:format]
454      case format
455      when :column
456        return to_s_raw
457      when :list
458        formatter_class = TableListFormatter
459      when :table, nil
460        formatter_class = TableTableFormatter
461      else
462        message = ":format must be :column, :list, :table or nil"
463        raise ArgumentError, "#{message}: <#{format.inspect}>"
464      end
465      formatter = formatter_class.new(self, options)
466      formatter.format
467    end
468
469    alias_method :inspect_raw, :inspect
470    def inspect
471      "#{super}\n#{to_s}"
472    end
473
474    def respond_to_missing?(name, include_private)
475      return true if find_column(name)
476      super
477    end
478
479    def method_missing(name, *args, &block)
480      if args.empty?
481        column = find_column(name)
482        return column if column
483      end
484      super
485    end
486
487    private
488    def slice_by_range(from, to)
489      slice_raw(from, to - from + 1)
490    end
491
492    def ensure_raw_column(name, data)
493      case data
494      when Array
495        {
496          field: Field.new(name, data.value_data_type),
497          data: ChunkedArray.new([data]),
498        }
499      when ChunkedArray
500        {
501          field: Field.new(name, data.value_data_type),
502          data: data,
503        }
504      when Column
505        column = data
506        data = column.data
507        data = ChunkedArray.new([data]) unless data.is_a?(ChunkedArray)
508        {
509          field: column.field,
510          data: data,
511        }
512      else
513        message = "column must be Arrow::Array or Arrow::Column: " +
514          "<#{name}>: <#{data.inspect}>: #{inspect}"
515        raise ArgumentError, message
516      end
517    end
518  end
519end
520