1# Licensed to the Apache Software Foundation (ASF) under one 2# or more contributor license agreements. See the NOTICE file 3# distributed with this work for additional information 4# regarding copyright ownership. The ASF licenses this file 5# to you under the Apache License, Version 2.0 (the 6# "License"); you may not use this file except in compliance 7# with the License. You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, 12# software distributed under the License is distributed on an 13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14# KIND, either express or implied. See the License for the 15# specific language governing permissions and limitations 16# under the License. 17 18#' @include arrow-package.R 19#' @title Schema class 20#' 21#' @description A `Schema` is a list of [Field]s, which map names to 22#' Arrow [data types][data-type]. Create a `Schema` when you 23#' want to convert an R `data.frame` to Arrow but don't want to rely on the 24#' default mapping of R types to Arrow types, such as when you want to choose a 25#' specific numeric precision, or when creating a [Dataset] and you want to 26#' ensure a specific schema rather than inferring it from the various files. 27#' 28#' Many Arrow objects, including [Table] and [Dataset], have a `$schema` method 29#' (active binding) that lets you access their schema. 30#' 31#' @usage NULL 32#' @format NULL 33#' @docType class 34#' @section Methods: 35#' 36#' - `$ToString()`: convert to a string 37#' - `$field(i)`: returns the field at index `i` (0-based) 38#' - `$GetFieldByName(x)`: returns the field with name `x` 39#' - `$WithMetadata(metadata)`: returns a new `Schema` with the key-value 40#' `metadata` set. Note that all list elements in `metadata` will be coerced 41#' to `character`. 42#' 43#' @section Active bindings: 44#' 45#' - `$names`: returns the field names (called in `names(Schema)`) 46#' - `$num_fields`: returns the number of fields (called in `length(Schema)`) 47#' - `$fields`: returns the list of `Field`s in the `Schema`, suitable for 48#' iterating over 49#' - `$HasMetadata`: logical: does this `Schema` have extra metadata? 50#' - `$metadata`: returns the key-value metadata as a named list. 51#' Modify or replace by assigning in (`sch$metadata <- new_metadata`). 52#' All list elements are coerced to string. 53#' 54#' @section R Metadata: 55#' 56#' When converting a data.frame to an Arrow Table or RecordBatch, attributes 57#' from the `data.frame` are saved alongside tables so that the object can be 58#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata 59#' can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or 60#' at the column (e.g. `attributes(df$col_a)`) or for list columns only: 61#' element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows 62#' for storing `haven` columns in a table and being able to faithfully 63#' re-create them when pulled back into R. This metadata is separate from the 64#' schema (column names and types) which is compatible with other Arrow 65#' clients. The R metadata is only read by R and is ignored by other clients 66#' (e.g. Pandas has its own custom metadata). This metadata is stored in 67#' `$metadata$r`. 68#' 69#' Since Schema metadata keys and values must be strings, this metadata is 70#' saved by serializing R's attribute list structure to a string. If the 71#' serialized metadata exceeds 100Kb in size, by default it is compressed 72#' starting in version 3.0.0. To disable this compression (e.g. for tables 73#' that are compatible with Arrow versions before 3.0.0 and include large 74#' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`. 75#' Files with compressed metadata are readable by older versions of arrow, but 76#' the metadata is dropped. 77#' 78#' @rdname Schema 79#' @name Schema 80#' @examplesIf arrow_available() 81#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5)) 82#' tab1 <- arrow_table(df) 83#' tab1$schema 84#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32())) 85#' tab2$schema 86#' @export 87Schema <- R6Class("Schema", 88 inherit = ArrowObject, 89 public = list( 90 ToString = function() { 91 fields <- print_schema_fields(self) 92 if (self$HasMetadata) { 93 fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata") 94 } 95 fields 96 }, 97 field = function(i) Schema__field(self, i), 98 GetFieldByName = function(x) Schema__GetFieldByName(self, x), 99 AddField = function(i, field) { 100 assert_is(field, "Field") 101 Schema__AddField(self, i, field) 102 }, 103 SetField = function(i, field) { 104 assert_is(field, "Field") 105 Schema__SetField(self, i, field) 106 }, 107 RemoveField = function(i) Schema__RemoveField(self, i), 108 serialize = function() Schema__serialize(self), 109 WithMetadata = function(metadata = NULL) { 110 metadata <- prepare_key_value_metadata(metadata) 111 Schema__WithMetadata(self, metadata) 112 }, 113 Equals = function(other, check_metadata = FALSE, ...) { 114 inherits(other, "Schema") && Schema__Equals(self, other, isTRUE(check_metadata)) 115 }, 116 export_to_c = function(ptr) ExportSchema(self, ptr) 117 ), 118 active = list( 119 names = function() { 120 Schema__field_names(self) 121 }, 122 num_fields = function() Schema__num_fields(self), 123 fields = function() Schema__fields(self), 124 HasMetadata = function() Schema__HasMetadata(self), 125 metadata = function(new_metadata) { 126 if (missing(new_metadata)) { 127 Schema__metadata(self) 128 } else { 129 # Set the metadata 130 out <- self$WithMetadata(new_metadata) 131 # $WithMetadata returns a new object but we're modifying in place, 132 # so swap in that new C++ object pointer into our R6 object 133 self$set_pointer(out$pointer()) 134 self 135 } 136 }, 137 r_metadata = function(new) { 138 # Helper for the R metadata that handles the serialization 139 # See also method on ArrowTabular 140 if (missing(new)) { 141 out <- self$metadata$r 142 if (!is.null(out)) { 143 # Can't unserialize NULL 144 out <- .unserialize_arrow_r_metadata(out) 145 } 146 # Returns either NULL or a named list 147 out 148 } else { 149 # Set the R metadata 150 self$metadata$r <- .serialize_arrow_r_metadata(new) 151 self 152 } 153 } 154 ) 155) 156Schema$create <- function(...) { 157 .list <- list2(...) 158 if (all(map_lgl(.list, ~ inherits(., "Field")))) { 159 schema_(.list) 160 } else { 161 schema_(.fields(.list)) 162 } 163} 164#' @include arrowExports.R 165Schema$import_from_c <- ImportSchema 166 167prepare_key_value_metadata <- function(metadata) { 168 # key-value-metadata must be a named character vector; 169 # this function validates and coerces 170 if (is.null(metadata)) { 171 # NULL to remove metadata, so equivalent to setting an empty list 172 metadata <- empty_named_list() 173 } 174 if (is.null(names(metadata))) { 175 stop( 176 "Key-value metadata must be a named list or character vector", 177 call. = FALSE 178 ) 179 } 180 map_chr(metadata, as.character) 181} 182 183print_schema_fields <- function(s) { 184 # Alternative to Schema__ToString that doesn't print metadata 185 paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n") 186} 187 188#' @param ... named list containing [data types][data-type] or 189#' a list of [fields][field] containing the fields for the schema 190#' @export 191#' @rdname Schema 192schema <- Schema$create 193 194#' @export 195names.Schema <- function(x) x$names 196 197#' @export 198length.Schema <- function(x) x$num_fields 199 200#' @export 201`[[.Schema` <- function(x, i, ...) { 202 if (is.character(i)) { 203 x$GetFieldByName(i) 204 } else if (is.numeric(i)) { 205 x$field(i - 1) 206 } else { 207 stop("'i' must be character or numeric, not ", class(i), call. = FALSE) 208 } 209} 210 211#' @export 212`[[<-.Schema` <- function(x, i, value) { 213 assert_that(length(i) == 1) 214 if (is.character(i)) { 215 field_names <- names(x) 216 if (anyDuplicated(field_names)) { 217 stop("Cannot update field by name with duplicates", call. = FALSE) 218 } 219 220 # If i is character, it's the field name 221 if (!is.null(value) && !inherits(value, "Field")) { 222 value <- field(i, as_type(value, "value")) 223 } 224 225 # No match means we're adding to the end 226 i <- match(i, field_names, nomatch = length(field_names) + 1L) 227 } else { 228 assert_that(is.numeric(i), !is.na(i), i > 0) 229 # If i is numeric and we have a type, 230 # we need to grab the existing field name for the new one 231 if (!is.null(value) && !inherits(value, "Field")) { 232 value <- field(names(x)[i], as_type(value, "value")) 233 } 234 } 235 236 i <- as.integer(i - 1L) 237 if (i >= length(x)) { 238 if (!is.null(value)) { 239 x <- x$AddField(i, value) 240 } 241 } else if (is.null(value)) { 242 x <- x$RemoveField(i) 243 } else { 244 x <- x$SetField(i, value) 245 } 246 x 247} 248 249#' @export 250`$<-.Schema` <- `$<-.ArrowTabular` 251 252#' @export 253`[.Schema` <- function(x, i, ...) { 254 if (is.logical(i)) { 255 i <- rep_len(i, length(x)) # For R recycling behavior 256 i <- which(i) 257 } 258 if (is.numeric(i)) { 259 if (all(i < 0)) { 260 # in R, negative i means "everything but i" 261 i <- setdiff(seq_len(length(x)), -1 * i) 262 } 263 } 264 fields <- map(i, ~ x[[.]]) 265 invalid <- map_lgl(fields, is.null) 266 if (any(invalid)) { 267 stop( 268 "Invalid field name", ifelse(sum(invalid) > 1, "s: ", ": "), 269 oxford_paste(i[invalid]), 270 call. = FALSE 271 ) 272 } 273 schema_(fields) 274} 275 276#' @export 277`$.Schema` <- function(x, name, ...) { 278 assert_that(is.string(name)) 279 if (name %in% ls(x)) { 280 get(name, x) 281 } else { 282 x$GetFieldByName(name) 283 } 284} 285 286#' @export 287as.list.Schema <- function(x, ...) x$fields 288 289#' read a Schema from a stream 290#' 291#' @param stream a `Message`, `InputStream`, or `Buffer` 292#' @param ... currently ignored 293#' @return A [Schema] 294#' @export 295read_schema <- function(stream, ...) { 296 if (inherits(stream, "Message")) { 297 return(ipc___ReadSchema_Message(stream)) 298 } else { 299 if (!inherits(stream, "InputStream")) { 300 stream <- BufferReader$create(stream) 301 on.exit(stream$close()) 302 } 303 return(ipc___ReadSchema_InputStream(stream)) 304 } 305} 306 307#' Combine and harmonize schemas 308#' 309#' @param ... [Schema]s to unify 310#' @param schemas Alternatively, a list of schemas 311#' @return A `Schema` with the union of fields contained in the inputs, or 312#' `NULL` if any of `schemas` is `NULL` 313#' @export 314#' @examplesIf arrow_available() 315#' a <- schema(b = double(), c = bool()) 316#' z <- schema(b = double(), k = utf8()) 317#' unify_schemas(a, z) 318unify_schemas <- function(..., schemas = list(...)) { 319 if (any(vapply(schemas, is.null, TRUE))) { 320 return(NULL) 321 } 322 arrow__UnifySchemas(schemas) 323} 324 325#' @export 326print.arrow_r_metadata <- function(x, ...) { 327 utils::str(x) 328 utils::str(.unserialize_arrow_r_metadata(x)) 329 invisible(x) 330} 331