1# Licensed to the Apache Software Foundation (ASF) under one 2# or more contributor license agreements. See the NOTICE file 3# distributed with this work for additional information 4# regarding copyright ownership. The ASF licenses this file 5# to you under the Apache License, Version 2.0 (the 6# "License"); you may not use this file except in compliance 7# with the License. You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, 12# software distributed under the License is distributed on an 13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14# KIND, either express or implied. See the License for the 15# specific language governing permissions and limitations 16# under the License. 17 18 19expect_chunked_roundtrip <- function(x, type) { 20 a <- ChunkedArray$create(!!!x) 21 flat_x <- unlist(x, recursive = FALSE) 22 attributes(flat_x) <- attributes(x[[1]]) 23 expect_type_equal(a$type, type) 24 expect_identical(a$num_chunks, length(x)) 25 expect_identical(length(a), length(flat_x)) 26 if (!inherits(type, "ListType")) { 27 # TODO: revisit how missingness works with ListArrays 28 # R list objects don't handle missingness the same way as other vectors. 29 # Is there some vctrs thing we should do on the roundtrip back to R? 30 expect_identical(as.vector(is.na(a)), is.na(flat_x)) 31 } 32 expect_as_vector(a, flat_x) 33 expect_as_vector(a$chunk(0), x[[1]]) 34 35 if (length(flat_x)) { 36 a_sliced <- a$Slice(1) 37 x_sliced <- flat_x[-1] 38 expect_type_equal(a_sliced$type, type) 39 expect_identical(length(a_sliced), length(x_sliced)) 40 if (!inherits(type, "ListType")) { 41 expect_identical(as.vector(is.na(a_sliced)), is.na(x_sliced)) 42 } 43 expect_as_vector(a_sliced, x_sliced) 44 } 45 invisible(a) 46} 47 48test_that("ChunkedArray", { 49 x <- expect_chunked_roundtrip(list(1:10, 1:10, 1:5), int32()) 50 51 y <- x$Slice(8) 52 expect_equal(y$type, int32()) 53 expect_equal(y$num_chunks, 3L) 54 expect_equal(length(y), 17L) 55 expect_as_vector(y, c(9:10, 1:10, 1:5)) 56 57 z <- x$Slice(8, 5) 58 expect_equal(z$type, int32()) 59 expect_equal(z$num_chunks, 2L) 60 expect_equal(z$length(), 5L) 61 expect_equal(z$as_vector(), c(9:10, 1:3)) 62 63 expect_chunked_roundtrip(list(c(1, 2, 3), c(4, 5, 6)), float64()) 64 65 # input validation 66 expect_error(x$chunk(14), "subscript out of bounds") 67 expect_error(x$chunk("one")) 68 expect_error(x$chunk(NA_integer_), "'i' cannot be NA") 69 expect_error(x$chunk(-1), "subscript out of bounds") 70 71 expect_error(x$Slice("ten")) 72 expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA") 73 expect_error(x$Slice(NA), "Slice 'offset' cannot be NA") 74 expect_error(x$Slice(10, "ten")) 75 expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA") 76 expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA") 77 expect_error(x$Slice(c(10, 10))) 78 expect_error(x$Slice(10, c(10, 10))) 79 expect_error(x$Slice(1000), "Slice 'offset' greater than array length") 80 expect_error(x$Slice(-1), "Slice 'offset' cannot be negative") 81 expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length") 82 expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative") 83 expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative") 84 85 expect_warning(x$Slice(10, 15), NA) 86 expect_warning( 87 overslice <- x$Slice(10, 16), 88 "Slice 'length' greater than available length" 89 ) 90 expect_equal(length(overslice), 15) 91 expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length") 92}) 93 94test_that("print ChunkedArray", { 95 verify_output(test_path("test-chunked-array.txt"), { 96 chunked_array(c(1, 2, 3), c(4, 5, 6)) 97 chunked_array(1:30, c(4, 5, 6)) 98 chunked_array(1:30) 99 chunked_array(factor(c("a", "b")), factor(c("c", "d"))) 100 }) 101}) 102 103test_that("ChunkedArray handles !!! splicing", { 104 data <- list(1, 2, 3) 105 x <- chunked_array(!!!data) 106 expect_equal(x$type, float64()) 107 expect_equal(x$num_chunks, 3L) 108}) 109 110test_that("ChunkedArray handles Inf", { 111 data <- list(c(Inf, 2:10), c(1:3, Inf, 5L), 1:10) 112 x <- chunked_array(!!!data) 113 expect_equal(x$type, float64()) 114 expect_equal(x$num_chunks, 3L) 115 expect_equal(length(x), 25L) 116 expect_as_vector(x, c(c(Inf, 2:10), c(1:3, Inf, 5), 1:10)) 117 118 chunks <- x$chunks 119 expect_as_vector(is.infinite(chunks[[2]]), is.infinite(data[[2]])) 120 expect_equal( 121 as.vector(is.infinite(x)), 122 c(is.infinite(data[[1]]), is.infinite(data[[2]]), is.infinite(data[[3]])) 123 ) 124}) 125 126test_that("ChunkedArray handles NA", { 127 data <- list(1:10, c(NA, 2:10), c(1:3, NA, 5L)) 128 x <- chunked_array(!!!data) 129 expect_equal(x$type, int32()) 130 expect_equal(x$num_chunks, 3L) 131 expect_equal(length(x), 25L) 132 expect_as_vector(x, c(1:10, c(NA, 2:10), c(1:3, NA, 5))) 133 134 chunks <- x$chunks 135 expect_as_vector(is.na(chunks[[2]]), is.na(data[[2]])) 136 expect_as_vector(is.na(x), c(is.na(data[[1]]), is.na(data[[2]]), is.na(data[[3]]))) 137}) 138 139test_that("ChunkedArray handles NaN", { 140 data <- list(as.numeric(1:10), c(NaN, 2:10), c(1:3, NaN, 5L)) 141 x <- chunked_array(!!!data) 142 143 expect_equal(x$type, float64()) 144 expect_equal(x$num_chunks, 3L) 145 expect_equal(length(x), 25L) 146 expect_as_vector(x, c(1:10, c(NaN, 2:10), c(1:3, NaN, 5))) 147 148 chunks <- x$chunks 149 expect_as_vector(is.nan(chunks[[2]]), is.nan(data[[2]])) 150 expect_as_vector(is.nan(x), c(is.nan(data[[1]]), is.nan(data[[2]]), is.nan(data[[3]]))) 151}) 152 153test_that("ChunkedArray supports logical vectors (ARROW-3341)", { 154 # with NA 155 data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE)) 156 expect_chunked_roundtrip(data, bool()) 157 # without NA 158 data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE)) 159 expect_chunked_roundtrip(data, bool()) 160}) 161 162test_that("ChunkedArray supports character vectors (ARROW-3339)", { 163 data <- list( 164 c("itsy", NA, "spider"), 165 c("Climbed", "up", "the", "water", "spout"), 166 c("Down", "came", "the", "rain"), 167 "And washed the spider out. " 168 ) 169 expect_chunked_roundtrip(data, utf8()) 170}) 171 172test_that("ChunkedArray supports factors (ARROW-3716)", { 173 f <- factor(c("itsy", "bitsy", "spider", "spider")) 174 expect_chunked_roundtrip(list(f, f, f), dictionary(int8())) 175}) 176 177test_that("ChunkedArray supports dates (ARROW-3716)", { 178 d <- Sys.Date() + 1:10 179 expect_chunked_roundtrip(list(d, d), date32()) 180}) 181 182test_that("ChunkedArray supports POSIXct (ARROW-3716)", { 183 times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10 184 expect_chunked_roundtrip(list(times, times), timestamp("us", "UTC")) 185}) 186 187test_that("ChunkedArray supports integer64 (ARROW-3716)", { 188 x <- bit64::as.integer64(1:10) + MAX_INT 189 expect_chunked_roundtrip(list(x, x), int64()) 190 # Also with a first chunk that would downcast 191 zero <- Array$create(0L)$cast(int64()) 192 expect_type_equal(zero, int64()) 193 ca <- ChunkedArray$create(zero, x) 194 expect_type_equal(ca, int64()) 195 expect_s3_class(as.vector(ca), "integer64") 196 expect_identical(as.vector(ca), c(bit64::as.integer64(0L), x)) 197}) 198 199test_that("ChunkedArray supports difftime", { 200 time <- hms::hms(56, 34, 12) 201 expect_chunked_roundtrip(list(time, time), time32("s")) 202}) 203 204test_that("ChunkedArray supports empty arrays (ARROW-13761)", { 205 types <- c( 206 int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), 207 uint64(), float32(), float64(), timestamp("ns"), binary(), 208 large_binary(), fixed_size_binary(32), date32(), date64(), 209 decimal(4, 2), dictionary(), struct(x = int32()) 210 ) 211 212 empty_filter <- ChunkedArray$create(type = bool()) 213 for (type in types) { 214 one_empty_chunk <- ChunkedArray$create(type = type) 215 expect_type_equal(one_empty_chunk$type, type) 216 if (type != struct(x = int32())) { 217 expect_identical(length(one_empty_chunk), length(as.vector(one_empty_chunk))) 218 } else { 219 # struct -> tbl and length(tbl) is num_columns instead of num_rows 220 expect_identical(length(as.vector(one_empty_chunk)), 1L) 221 } 222 zero_empty_chunks <- one_empty_chunk$Filter(empty_filter) 223 expect_equal(zero_empty_chunks$num_chunks, 0) 224 expect_type_equal(zero_empty_chunks$type, type) 225 if (type != struct(x = int32())) { 226 expect_identical(length(zero_empty_chunks), length(as.vector(zero_empty_chunks))) 227 } else { 228 expect_identical(length(as.vector(zero_empty_chunks)), 1L) 229 } 230 } 231}) 232 233test_that("integer types casts for ChunkedArray (ARROW-3741)", { 234 int_types <- c(int8(), int16(), int32(), int64()) 235 uint_types <- c(uint8(), uint16(), uint32(), uint64()) 236 float_types <- c(float32(), float64()) # float16() not really supported in C++ yet 237 all_types <- c( 238 int_types, 239 uint_types, 240 float_types 241 ) 242 243 a <- chunked_array(1:10, 1:10) 244 for (type in c(int_types, uint_types)) { 245 casted <- a$cast(type) 246 expect_r6_class(casted, "ChunkedArray") 247 expect_type_equal(casted$type, type) 248 } 249 # Also test casting to double(), not actually a type, a base R function but should be alias for float64 250 dbl <- a$cast(double()) 251 expect_r6_class(dbl, "ChunkedArray") 252 expect_type_equal(dbl$type, float64()) 253}) 254 255test_that("chunked_array() supports the type= argument. conversion from INTSXP and int64 to all int types", { 256 num_int32 <- 12L 257 num_int64 <- bit64::as.integer64(10) 258 for (type in all_types) { 259 expect_type_equal(chunked_array(num_int32, type = type)$type, type) 260 expect_type_equal(chunked_array(num_int64, type = type)$type, type) 261 } 262 # also test creating with double() "type" 263 expect_type_equal(chunked_array(num_int32, type = double())$type, float64()) 264}) 265 266test_that("ChunkedArray$create() aborts on overflow", { 267 expect_error(chunked_array(128L, type = int8())$type) 268 expect_error(chunked_array(-129L, type = int8())$type) 269 270 expect_error(chunked_array(256L, type = uint8())$type) 271 expect_error(chunked_array(-1L, type = uint8())$type) 272 273 expect_error(chunked_array(32768L, type = int16())$type) 274 expect_error(chunked_array(-32769L, type = int16())$type) 275 276 expect_error(chunked_array(65536L, type = uint16())$type) 277 expect_error(chunked_array(-1L, type = uint16())$type) 278 279 expect_error(chunked_array(65536L, type = uint16())$type) 280 expect_error(chunked_array(-1L, type = uint16())$type) 281 282 expect_error(chunked_array(bit64::as.integer64(2^31), type = int32())) 283 expect_error(chunked_array(bit64::as.integer64(2^32), type = uint32())) 284}) 285 286test_that("chunked_array() convert doubles to integers", { 287 for (type in c(int_types, uint_types)) { 288 a <- chunked_array(10, type = type) 289 expect_type_equal(a$type, type) 290 if (type != uint64()) { 291 # exception for unsigned integer 64 that 292 # wa cannot handle yet 293 expect_true(as.vector(a) == 10) 294 } 295 } 296}) 297 298test_that("chunked_array() uses the first ... to infer type", { 299 a <- chunked_array(10, 10L) 300 expect_type_equal(a$type, float64()) 301}) 302 303test_that("chunked_array() handles downcasting", { 304 a <- chunked_array(10L, 10) 305 expect_type_equal(a$type, int32()) 306 expect_as_vector(a, c(10L, 10L)) 307}) 308 309test_that("chunked_array() makes chunks of the same type", { 310 a <- chunked_array(10L, bit64::as.integer64(13), type = int64()) 311 for (chunk in a$chunks) { 312 expect_type_equal(chunk$type, int64()) 313 } 314}) 315 316test_that("chunked_array() handles 0 chunks if given a type", { 317 for (type in all_types) { 318 a <- chunked_array(type = type) 319 expect_type_equal(a$type, as_type(type)) 320 expect_equal(length(a), 0L) 321 } 322}) 323 324test_that("chunked_array() can ingest arrays (ARROW-3815)", { 325 expect_equal( 326 as.vector(chunked_array(1:5, Array$create(6:10))), 327 1:10 328 ) 329}) 330 331test_that("chunked_array() handles data frame -> struct arrays (ARROW-3811)", { 332 df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10]) 333 a <- chunked_array(df, df) 334 expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8())) 335 expect_equal(a$as_vector(), rbind(df, df), ignore_attr = TRUE) 336}) 337 338test_that("ChunkedArray$View() (ARROW-6542)", { 339 a <- ChunkedArray$create(1:3, 1:4) 340 b <- a$View(float32()) 341 expect_equal(b$type, float32()) 342 expect_equal(length(b), 7L) 343 expect_true(all( 344 sapply(b$chunks, function(.x) .x$type == float32()) 345 )) 346 # Input validation 347 expect_error(a$View("not a type"), "type must be a DataType, not character") 348}) 349 350test_that("ChunkedArray$Validate()", { 351 a <- ChunkedArray$create(1:10) 352 expect_error(a$Validate(), NA) 353}) 354 355test_that("[ ChunkedArray", { 356 one_chunk <- chunked_array(2:11) 357 x <- chunked_array(1:10, 31:40, 51:55) 358 # Slice 359 expect_as_vector(x[8:12], c(8:10, 31:32)) 360 # Take from same chunk 361 expect_as_vector(x[c(11, 15, 12)], c(31, 35, 32)) 362 # Take from multiple chunks (calls Concatenate) 363 expect_as_vector(x[c(2, 11, 15, 12, 3)], c(2, 31, 35, 32, 3)) 364 # Take with Array (note these are 0-based) 365 take1 <- Array$create(c(10L, 14L, 11L)) 366 expect_as_vector(x[take1], c(31, 35, 32)) 367 # Take with ChunkedArray 368 take2 <- ChunkedArray$create(c(10L, 14L), 11L) 369 expect_as_vector(x[take2], c(31, 35, 32)) 370 371 # Filter (with recycling) 372 expect_as_vector( 373 one_chunk[c(FALSE, TRUE, FALSE, FALSE, TRUE)], 374 c(3, 6, 8, 11) 375 ) 376 # Filter where both are 1-chunk 377 expect_as_vector( 378 one_chunk[ChunkedArray$create(rep(c(FALSE, TRUE, FALSE, FALSE, TRUE), 2))], 379 c(3, 6, 8, 11) 380 ) 381 # Filter multi-chunk with logical (-> Array) 382 expect_as_vector( 383 x[c(FALSE, TRUE, FALSE, FALSE, TRUE)], 384 c(2, 5, 7, 10, 32, 35, 37, 40, 52, 55) 385 ) 386 # Filter with a chunked array with different sized chunks 387 p1 <- c(FALSE, TRUE, FALSE, FALSE, TRUE) 388 p2 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE) 389 filt <- ChunkedArray$create(p1, p2, p2) 390 expect_as_vector( 391 x[filt], 392 c(2, 5, 6, 8, 9, 35, 36, 38, 39, 55) 393 ) 394}) 395 396test_that("ChunkedArray head/tail", { 397 vec <- 11:20 398 a <- ChunkedArray$create(11:15, 16:20) 399 expect_as_vector(head(a), head(vec)) 400 expect_as_vector(head(a, 4), head(vec, 4)) 401 expect_as_vector(head(a, 40), head(vec, 40)) 402 expect_as_vector(head(a, -4), head(vec, -4)) 403 expect_as_vector(head(a, -40), head(vec, -40)) 404 expect_as_vector(tail(a), tail(vec)) 405 expect_as_vector(tail(a, 4), tail(vec, 4)) 406 expect_as_vector(tail(a, 40), tail(vec, 40)) 407 expect_as_vector(tail(a, -40), tail(vec, -40)) 408}) 409 410test_that("ChunkedArray$Equals", { 411 vec <- 11:20 412 a <- ChunkedArray$create(vec[1:5], vec[6:10]) 413 b <- ChunkedArray$create(vec[1:5], vec[6:10]) 414 expect_equal(a, b) 415 expect_true(a$Equals(b)) 416 expect_false(a$Equals(vec)) 417}) 418 419test_that("Converting a chunked array unifies factors (ARROW-8374)", { 420 f1 <- factor(c("a"), levels = c("a", "b")) 421 f2 <- factor(c("c"), levels = c("c", "d")) 422 f3 <- factor(NA, levels = "a") 423 f4 <- factor() 424 425 res <- factor(c("a", "c", NA), levels = c("a", "b", "c", "d")) 426 ca <- ChunkedArray$create(f1, f2, f3, f4) 427 428 expect_identical(ca$as_vector(), res) 429}) 430 431test_that("Handling string data with embedded nuls", { 432 raws <- structure(list( 433 as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), 434 as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), 435 as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 436 as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls 437 as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), 438 as.raw(c(0x74, 0x76)) 439 ), 440 class = c("arrow_binary", "vctrs_vctr", "list") 441 ) 442 chunked_array_with_nul <- ChunkedArray$create(raws)$cast(utf8()) 443 444 # The behavior of the warnings/errors is slightly different with and without 445 # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately 446 # on `as.vector()` where as with it, the error only happens on materialization) 447 skip_if_r_version("3.5.0") 448 449 v <- expect_error(as.vector(chunked_array_with_nul), NA) 450 451 expect_error( 452 v[], 453 paste0( 454 "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ", 455 "set options(arrow.skip_nul = TRUE)" 456 ), 457 fixed = TRUE 458 ) 459 460 withr::with_options(list(arrow.skip_nul = TRUE), { 461 v <- expect_warning(as.vector(chunked_array_with_nul), NA) 462 expect_warning( 463 expect_identical(v[3], "man"), 464 "Stripping '\\0' (nul) from character vector", 465 fixed = TRUE 466 ) 467 }) 468}) 469