1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18#' @include arrow-package.R
19#' @title Schema class
20#'
21#' @description A `Schema` is a list of [Field]s, which map names to
22#' Arrow [data types][data-type]. Create a `Schema` when you
23#' want to convert an R `data.frame` to Arrow but don't want to rely on the
24#' default mapping of R types to Arrow types, such as when you want to choose a
25#' specific numeric precision, or when creating a [Dataset] and you want to
26#' ensure a specific schema rather than inferring it from the various files.
27#'
28#' Many Arrow objects, including [Table] and [Dataset], have a `$schema` method
29#' (active binding) that lets you access their schema.
30#'
31#' @usage NULL
32#' @format NULL
33#' @docType class
34#' @section Methods:
35#'
36#' - `$ToString()`: convert to a string
37#' - `$field(i)`: returns the field at index `i` (0-based)
38#' - `$GetFieldByName(x)`: returns the field with name `x`
39#' - `$WithMetadata(metadata)`: returns a new `Schema` with the key-value
40#'    `metadata` set. Note that all list elements in `metadata` will be coerced
41#'    to `character`.
42#'
43#' @section Active bindings:
44#'
45#' - `$names`: returns the field names (called in `names(Schema)`)
46#' - `$num_fields`: returns the number of fields (called in `length(Schema)`)
47#' - `$fields`: returns the list of `Field`s in the `Schema`, suitable for
48#'   iterating over
49#' - `$HasMetadata`: logical: does this `Schema` have extra metadata?
50#' - `$metadata`: returns the key-value metadata as a named list.
51#'    Modify or replace by assigning in (`sch$metadata <- new_metadata`).
52#'    All list elements are coerced to string.
53#'
54#' @section R Metadata:
55#'
56#'   When converting a data.frame to an Arrow Table or RecordBatch, attributes
57#'   from the `data.frame` are saved alongside tables so that the object can be
58#'   reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata
59#'   can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or
60#'   at the column (e.g. `attributes(df$col_a)`) or for list columns only:
61#'   element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows
62#'   for storing `haven` columns in a table and being able to faithfully
63#'   re-create them when pulled back into R. This metadata is separate from the
64#'   schema (column names and types) which is compatible with other Arrow
65#'   clients. The R metadata is only read by R and is ignored by other clients
66#'   (e.g. Pandas has its own custom metadata). This metadata is stored in
67#'   `$metadata$r`.
68#'
69#'   Since Schema metadata keys and values must be strings, this metadata is
70#'   saved by serializing R's attribute list structure to a string. If the
71#'   serialized metadata exceeds 100Kb in size, by default it is compressed
72#'   starting in version 3.0.0. To disable this compression (e.g. for tables
73#'   that are compatible with Arrow versions before 3.0.0 and include large
74#'   amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`.
75#'   Files with compressed metadata are readable by older versions of arrow, but
76#'   the metadata is dropped.
77#'
78#' @rdname Schema
79#' @name Schema
80#' @examplesIf arrow_available()
81#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
82#' tab1 <- arrow_table(df)
83#' tab1$schema
84#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
85#' tab2$schema
86#' @export
87Schema <- R6Class("Schema",
88  inherit = ArrowObject,
89  public = list(
90    ToString = function() {
91      fields <- print_schema_fields(self)
92      if (self$HasMetadata) {
93        fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata")
94      }
95      fields
96    },
97    field = function(i) Schema__field(self, i),
98    GetFieldByName = function(x) Schema__GetFieldByName(self, x),
99    AddField = function(i, field) {
100      assert_is(field, "Field")
101      Schema__AddField(self, i, field)
102    },
103    SetField = function(i, field) {
104      assert_is(field, "Field")
105      Schema__SetField(self, i, field)
106    },
107    RemoveField = function(i) Schema__RemoveField(self, i),
108    serialize = function() Schema__serialize(self),
109    WithMetadata = function(metadata = NULL) {
110      metadata <- prepare_key_value_metadata(metadata)
111      Schema__WithMetadata(self, metadata)
112    },
113    Equals = function(other, check_metadata = FALSE, ...) {
114      inherits(other, "Schema") && Schema__Equals(self, other, isTRUE(check_metadata))
115    },
116    export_to_c = function(ptr) ExportSchema(self, ptr)
117  ),
118  active = list(
119    names = function() {
120      Schema__field_names(self)
121    },
122    num_fields = function() Schema__num_fields(self),
123    fields = function() Schema__fields(self),
124    HasMetadata = function() Schema__HasMetadata(self),
125    metadata = function(new_metadata) {
126      if (missing(new_metadata)) {
127        Schema__metadata(self)
128      } else {
129        # Set the metadata
130        out <- self$WithMetadata(new_metadata)
131        # $WithMetadata returns a new object but we're modifying in place,
132        # so swap in that new C++ object pointer into our R6 object
133        self$set_pointer(out$pointer())
134        self
135      }
136    },
137    r_metadata = function(new) {
138      # Helper for the R metadata that handles the serialization
139      # See also method on ArrowTabular
140      if (missing(new)) {
141        out <- self$metadata$r
142        if (!is.null(out)) {
143          # Can't unserialize NULL
144          out <- .unserialize_arrow_r_metadata(out)
145        }
146        # Returns either NULL or a named list
147        out
148      } else {
149        # Set the R metadata
150        self$metadata$r <- .serialize_arrow_r_metadata(new)
151        self
152      }
153    }
154  )
155)
156Schema$create <- function(...) {
157  .list <- list2(...)
158  if (all(map_lgl(.list, ~ inherits(., "Field")))) {
159    schema_(.list)
160  } else {
161    schema_(.fields(.list))
162  }
163}
164#' @include arrowExports.R
165Schema$import_from_c <- ImportSchema
166
167prepare_key_value_metadata <- function(metadata) {
168  # key-value-metadata must be a named character vector;
169  # this function validates and coerces
170  if (is.null(metadata)) {
171    # NULL to remove metadata, so equivalent to setting an empty list
172    metadata <- empty_named_list()
173  }
174  if (is.null(names(metadata))) {
175    stop(
176      "Key-value metadata must be a named list or character vector",
177      call. = FALSE
178    )
179  }
180  map_chr(metadata, as.character)
181}
182
183print_schema_fields <- function(s) {
184  # Alternative to Schema__ToString that doesn't print metadata
185  paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n")
186}
187
188#' @param ... named list containing [data types][data-type] or
189#'   a list of [fields][field] containing the fields for the schema
190#' @export
191#' @rdname Schema
192schema <- Schema$create
193
194#' @export
195names.Schema <- function(x) x$names
196
197#' @export
198length.Schema <- function(x) x$num_fields
199
200#' @export
201`[[.Schema` <- function(x, i, ...) {
202  if (is.character(i)) {
203    x$GetFieldByName(i)
204  } else if (is.numeric(i)) {
205    x$field(i - 1)
206  } else {
207    stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
208  }
209}
210
211#' @export
212`[[<-.Schema` <- function(x, i, value) {
213  assert_that(length(i) == 1)
214  if (is.character(i)) {
215    field_names <- names(x)
216    if (anyDuplicated(field_names)) {
217      stop("Cannot update field by name with duplicates", call. = FALSE)
218    }
219
220    # If i is character, it's the field name
221    if (!is.null(value) && !inherits(value, "Field")) {
222      value <- field(i, as_type(value, "value"))
223    }
224
225    # No match means we're adding to the end
226    i <- match(i, field_names, nomatch = length(field_names) + 1L)
227  } else {
228    assert_that(is.numeric(i), !is.na(i), i > 0)
229    # If i is numeric and we have a type,
230    # we need to grab the existing field name for the new one
231    if (!is.null(value) && !inherits(value, "Field")) {
232      value <- field(names(x)[i], as_type(value, "value"))
233    }
234  }
235
236  i <- as.integer(i - 1L)
237  if (i >= length(x)) {
238    if (!is.null(value)) {
239      x <- x$AddField(i, value)
240    }
241  } else if (is.null(value)) {
242    x <- x$RemoveField(i)
243  } else {
244    x <- x$SetField(i, value)
245  }
246  x
247}
248
249#' @export
250`$<-.Schema` <- `$<-.ArrowTabular`
251
252#' @export
253`[.Schema` <- function(x, i, ...) {
254  if (is.logical(i)) {
255    i <- rep_len(i, length(x)) # For R recycling behavior
256    i <- which(i)
257  }
258  if (is.numeric(i)) {
259    if (all(i < 0)) {
260      # in R, negative i means "everything but i"
261      i <- setdiff(seq_len(length(x)), -1 * i)
262    }
263  }
264  fields <- map(i, ~ x[[.]])
265  invalid <- map_lgl(fields, is.null)
266  if (any(invalid)) {
267    stop(
268      "Invalid field name", ifelse(sum(invalid) > 1, "s: ", ": "),
269      oxford_paste(i[invalid]),
270      call. = FALSE
271    )
272  }
273  schema_(fields)
274}
275
276#' @export
277`$.Schema` <- function(x, name, ...) {
278  assert_that(is.string(name))
279  if (name %in% ls(x)) {
280    get(name, x)
281  } else {
282    x$GetFieldByName(name)
283  }
284}
285
286#' @export
287as.list.Schema <- function(x, ...) x$fields
288
289#' read a Schema from a stream
290#'
291#' @param stream a `Message`, `InputStream`, or `Buffer`
292#' @param ... currently ignored
293#' @return A [Schema]
294#' @export
295read_schema <- function(stream, ...) {
296  if (inherits(stream, "Message")) {
297    return(ipc___ReadSchema_Message(stream))
298  } else {
299    if (!inherits(stream, "InputStream")) {
300      stream <- BufferReader$create(stream)
301      on.exit(stream$close())
302    }
303    return(ipc___ReadSchema_InputStream(stream))
304  }
305}
306
307#' Combine and harmonize schemas
308#'
309#' @param ... [Schema]s to unify
310#' @param schemas Alternatively, a list of schemas
311#' @return A `Schema` with the union of fields contained in the inputs, or
312#'   `NULL` if any of `schemas` is `NULL`
313#' @export
314#' @examplesIf arrow_available()
315#' a <- schema(b = double(), c = bool())
316#' z <- schema(b = double(), k = utf8())
317#' unify_schemas(a, z)
318unify_schemas <- function(..., schemas = list(...)) {
319  if (any(vapply(schemas, is.null, TRUE))) {
320    return(NULL)
321  }
322  arrow__UnifySchemas(schemas)
323}
324
325#' @export
326print.arrow_r_metadata <- function(x, ...) {
327  utils::str(x)
328  utils::str(.unserialize_arrow_r_metadata(x))
329  invisible(x)
330}
331