1# Licensed to the Apache Software Foundation (ASF) under one 2# or more contributor license agreements. See the NOTICE file 3# distributed with this work for additional information 4# regarding copyright ownership. The ASF licenses this file 5# to you under the Apache License, Version 2.0 (the 6# "License"); you may not use this file except in compliance 7# with the License. You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, 12# software distributed under the License is distributed on an 13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14# KIND, either express or implied. See the License for the 15# specific language governing permissions and limitations 16# under the License. 17 18context("ChunkedArray") 19 20expect_chunked_roundtrip <- function(x, type) { 21 a <- ChunkedArray$create(!!!x) 22 flat_x <- unlist(x, recursive = FALSE) 23 attributes(flat_x) <- attributes(x[[1]]) 24 expect_type_equal(a$type, type) 25 expect_identical(a$num_chunks, length(x)) 26 expect_identical(length(a), length(flat_x)) 27 if (!inherits(type, "ListType")) { 28 # TODO: revisit how missingness works with ListArrays 29 # R list objects don't handle missingness the same way as other vectors. 30 # Is there some vctrs thing we should do on the roundtrip back to R? 31 expect_identical(as.vector(is.na(a)), is.na(flat_x)) 32 } 33 expect_equal(as.vector(a), flat_x) 34 expect_equal(as.vector(a$chunk(0)), x[[1]]) 35 36 if (length(flat_x)) { 37 a_sliced <- a$Slice(1) 38 x_sliced <- flat_x[-1] 39 expect_type_equal(a_sliced$type, type) 40 expect_identical(length(a_sliced), length(x_sliced)) 41 if (!inherits(type, "ListType")) { 42 expect_identical(as.vector(is.na(a_sliced)), is.na(x_sliced)) 43 } 44 expect_equal(as.vector(a_sliced), x_sliced) 45 } 46 invisible(a) 47} 48 49test_that("ChunkedArray", { 50 x <- expect_chunked_roundtrip(list(1:10, 1:10, 1:5), int32()) 51 52 y <- x$Slice(8) 53 expect_equal(y$type, int32()) 54 expect_equal(y$num_chunks, 3L) 55 expect_equal(length(y), 17L) 56 expect_equal(as.vector(y), c(9:10, 1:10, 1:5)) 57 58 z <- x$Slice(8, 5) 59 expect_equal(z$type, int32()) 60 expect_equal(z$num_chunks, 2L) 61 expect_equal(z$length(), 5L) 62 expect_equal(z$as_vector(), c(9:10, 1:3)) 63 64 expect_chunked_roundtrip(list(c(1,2,3), c(4,5,6)), float64()) 65 66 # input validation 67 expect_error(x$chunk(14), "subscript out of bounds") 68 expect_error(x$chunk("one")) 69 expect_error(x$chunk(NA_integer_), "'i' cannot be NA") 70 expect_error(x$chunk(-1), "subscript out of bounds") 71 72 expect_error(x$Slice("ten")) 73 expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA") 74 expect_error(x$Slice(NA), "Slice 'offset' cannot be NA") 75 expect_error(x$Slice(10, "ten")) 76 expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA") 77 expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA") 78 expect_error(x$Slice(c(10, 10))) 79 expect_error(x$Slice(10, c(10, 10))) 80 expect_error(x$Slice(1000), "Slice 'offset' greater than array length") 81 expect_error(x$Slice(-1), "Slice 'offset' cannot be negative") 82 expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length") 83 expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative") 84 expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative") 85 86 expect_warning(x$Slice(10, 15), NA) 87 expect_warning( 88 overslice <- x$Slice(10, 16), 89 "Slice 'length' greater than available length" 90 ) 91 expect_equal(length(overslice), 15) 92 expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length") 93}) 94 95test_that("print ChunkedArray", { 96 verify_output(test_path("test-chunked-array.txt"), { 97 chunked_array(c(1,2,3), c(4,5,6)) 98 chunked_array(1:30, c(4,5,6)) 99 chunked_array(1:30) 100 chunked_array(factor(c("a", "b")), factor(c("c", "d"))) 101 }) 102}) 103 104test_that("ChunkedArray handles !!! splicing", { 105 data <- list(1, 2, 3) 106 x <- chunked_array(!!!data) 107 expect_equal(x$type, float64()) 108 expect_equal(x$num_chunks, 3L) 109}) 110 111test_that("ChunkedArray handles NA", { 112 data <- list(1:10, c(NA, 2:10), c(1:3, NA, 5L)) 113 x <- chunked_array(!!!data) 114 expect_equal(x$type, int32()) 115 expect_equal(x$num_chunks, 3L) 116 expect_equal(length(x), 25L) 117 expect_equal(as.vector(x), c(1:10, c(NA, 2:10), c(1:3, NA, 5))) 118 119 chunks <- x$chunks 120 expect_equal(as.vector(is.na(chunks[[2]])), is.na(data[[2]])) 121 expect_equal(as.vector(is.na(x)), c(is.na(data[[1]]), is.na(data[[2]]), is.na(data[[3]]))) 122}) 123 124test_that("ChunkedArray handles NaN", { 125 data <- list(as.numeric(1:10), c(NaN, 2:10), c(1:3, NaN, 5L)) 126 x <- chunked_array(!!!data) 127 128 expect_equal(x$type, float64()) 129 expect_equal(x$num_chunks, 3L) 130 expect_equal(length(x), 25L) 131 expect_equal(as.vector(x), c(1:10, c(NaN, 2:10), c(1:3, NaN, 5))) 132 133 chunks <- x$chunks 134 expect_equal(as.vector(is.nan(chunks[[2]])), is.nan(data[[2]])) 135 expect_equal(as.vector(is.nan(x)), c(is.nan(data[[1]]), is.nan(data[[2]]), is.nan(data[[3]]))) 136}) 137 138test_that("ChunkedArray supports logical vectors (ARROW-3341)", { 139 # with NA 140 data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE)) 141 expect_chunked_roundtrip(data, bool()) 142 # without NA 143 data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE)) 144 expect_chunked_roundtrip(data, bool()) 145}) 146 147test_that("ChunkedArray supports character vectors (ARROW-3339)", { 148 data <- list( 149 c("itsy", NA, "spider"), 150 c("Climbed", "up", "the", "water", "spout"), 151 c("Down", "came", "the", "rain"), 152 "And washed the spider out. " 153 ) 154 expect_chunked_roundtrip(data, utf8()) 155}) 156 157test_that("ChunkedArray supports factors (ARROW-3716)", { 158 f <- factor(c("itsy", "bitsy", "spider", "spider")) 159 expect_chunked_roundtrip(list(f, f, f), dictionary(int8())) 160}) 161 162test_that("ChunkedArray supports dates (ARROW-3716)", { 163 d <- Sys.Date() + 1:10 164 expect_chunked_roundtrip(list(d, d), date32()) 165}) 166 167test_that("ChunkedArray supports POSIXct (ARROW-3716)", { 168 times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10 169 expect_chunked_roundtrip(list(times, times), timestamp("us", "UTC")) 170}) 171 172test_that("ChunkedArray supports integer64 (ARROW-3716)", { 173 x <- bit64::as.integer64(1:10) + MAX_INT 174 expect_chunked_roundtrip(list(x, x), int64()) 175 # Also with a first chunk that would downcast 176 zero <- Array$create(0L)$cast(int64()) 177 expect_type_equal(zero, int64()) 178 ca <- ChunkedArray$create(zero, x) 179 expect_type_equal(ca, int64()) 180 expect_is(as.vector(ca), "integer64") 181 expect_identical(as.vector(ca), c(bit64::as.integer64(0L), x)) 182}) 183 184test_that("ChunkedArray supports difftime", { 185 time <- hms::hms(56, 34, 12) 186 expect_chunked_roundtrip(list(time, time), time32("s")) 187}) 188 189test_that("integer types casts for ChunkedArray (ARROW-3741)", { 190 int_types <- c(int8(), int16(), int32(), int64()) 191 uint_types <- c(uint8(), uint16(), uint32(), uint64()) 192 float_types <- c(float32(), float64()) # float16() not really supported in C++ yet 193 all_types <- c( 194 int_types, 195 uint_types, 196 float_types 197 ) 198 199 a <- chunked_array(1:10, 1:10) 200 for (type in c(int_types, uint_types)) { 201 casted <- a$cast(type) 202 expect_is(casted, "ChunkedArray") 203 expect_type_equal(casted$type, type) 204 } 205 # Also test casting to double(), not actually a type, a base R function but should be alias for float64 206 dbl <- a$cast(double()) 207 expect_is(dbl, "ChunkedArray") 208 expect_type_equal(dbl$type, float64()) 209}) 210 211test_that("chunked_array() supports the type= argument. conversion from INTSXP and int64 to all int types", { 212 num_int32 <- 12L 213 num_int64 <- bit64::as.integer64(10) 214 for (type in all_types) { 215 expect_type_equal(chunked_array(num_int32, type = type)$type, type) 216 expect_type_equal(chunked_array(num_int64, type = type)$type, type) 217 } 218 # also test creating with double() "type" 219 expect_type_equal(chunked_array(num_int32, type = double())$type, float64()) 220}) 221 222test_that("ChunkedArray$create() aborts on overflow", { 223 expect_error(chunked_array(128L, type = int8())$type, "Invalid.*Value is too large") 224 expect_error(chunked_array(-129L, type = int8())$type, "Invalid.*Value is too large") 225 226 expect_error(chunked_array(256L, type = uint8())$type, "Invalid.*Value is too large") 227 expect_error(chunked_array(-1L, type = uint8())$type, "Invalid.*Value is too large") 228 229 expect_error(chunked_array(32768L, type = int16())$type, "Invalid.*Value is too large") 230 expect_error(chunked_array(-32769L, type = int16())$type, "Invalid.*Value is too large") 231 232 expect_error(chunked_array(65536L, type = uint16())$type, "Invalid.*Value is too large") 233 expect_error(chunked_array(-1L, type = uint16())$type, "Invalid.*Value is too large") 234 235 expect_error(chunked_array(65536L, type = uint16())$type, "Invalid.*Value is too large") 236 expect_error(chunked_array(-1L, type = uint16())$type, "Invalid.*Value is too large") 237 238 expect_error(chunked_array(bit64::as.integer64(2^31), type = int32()), "Invalid.*Value is too large") 239 expect_error(chunked_array(bit64::as.integer64(2^32), type = uint32()), "Invalid.*Value is too large") 240}) 241 242test_that("chunked_array() convert doubles to integers", { 243 for (type in c(int_types, uint_types)) { 244 a <- chunked_array(10, type = type) 245 expect_type_equal(a$type, type) 246 if (type != uint64()) { 247 # exception for unsigned integer 64 that 248 # wa cannot handle yet 249 expect_true(as.vector(a) == 10) 250 } 251 } 252}) 253 254test_that("chunked_array() uses the first ... to infer type", { 255 a <- chunked_array(10, 10L) 256 expect_type_equal(a$type, float64()) 257}) 258 259test_that("chunked_array() handles downcasting", { 260 a <- chunked_array(10L, 10) 261 expect_type_equal(a$type, int32()) 262 expect_equal(as.vector(a), c(10L, 10L)) 263}) 264 265test_that("chunked_array() makes chunks of the same type", { 266 a <- chunked_array(10L, bit64::as.integer64(13), type = int64()) 267 for(chunk in a$chunks) { 268 expect_type_equal(chunk$type, int64()) 269 } 270}) 271 272test_that("chunked_array() handles 0 chunks if given a type", { 273 for (type in all_types) { 274 a <- chunked_array(type = type) 275 expect_type_equal(a$type, as_type(type)) 276 expect_equal(length(a), 0L) 277 } 278}) 279 280test_that("chunked_array() can ingest arrays (ARROW-3815)", { 281 expect_equal( 282 as.vector(chunked_array(1:5, Array$create(6:10))), 283 1:10 284 ) 285}) 286 287test_that("chunked_array() handles data frame -> struct arrays (ARROW-3811)", { 288 df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10]) 289 a <- chunked_array(df, df) 290 expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8())) 291 expect_equivalent(a$as_vector(), rbind(df, df)) 292}) 293 294test_that("ChunkedArray$View() (ARROW-6542)", { 295 a <- ChunkedArray$create(1:3, 1:4) 296 b <- a$View(float32()) 297 expect_equal(b$type, float32()) 298 expect_equal(length(b), 7L) 299 expect_true(all( 300 sapply(b$chunks, function(.x) .x$type == float32()) 301 )) 302 # Input validation 303 expect_error(a$View("not a type"), "type must be a DataType, not character") 304}) 305 306test_that("ChunkedArray$Validate()", { 307 a <- ChunkedArray$create(1:10) 308 expect_error(a$Validate(), NA) 309}) 310 311test_that("[ ChunkedArray", { 312 one_chunk <- chunked_array(2:11) 313 x <- chunked_array(1:10, 31:40, 51:55) 314 # Slice 315 expect_vector(x[8:12], c(8:10, 31:32)) 316 # Take from same chunk 317 expect_vector(x[c(11, 15, 12)], c(31, 35, 32)) 318 # Take from multiple chunks (calls Concatenate) 319 expect_vector(x[c(2, 11, 15, 12, 3)], c(2, 31, 35, 32, 3)) 320 # Take with Array (note these are 0-based) 321 take1 <- Array$create(c(10L, 14L, 11L)) 322 expect_vector(x[take1], c(31, 35, 32)) 323 # Take with ChunkedArray 324 take2 <- ChunkedArray$create(c(10L, 14L), 11L) 325 expect_vector(x[take2], c(31, 35, 32)) 326 327 # Filter (with recycling) 328 expect_vector( 329 one_chunk[c(FALSE, TRUE, FALSE, FALSE, TRUE)], 330 c(3, 6, 8, 11) 331 ) 332 # Filter where both are 1-chunk 333 expect_vector( 334 one_chunk[ChunkedArray$create(rep(c(FALSE, TRUE, FALSE, FALSE, TRUE), 2))], 335 c(3, 6, 8, 11) 336 ) 337 # Filter multi-chunk with logical (-> Array) 338 expect_vector( 339 x[c(FALSE, TRUE, FALSE, FALSE, TRUE)], 340 c(2, 5, 7, 10, 32, 35, 37, 40, 52, 55) 341 ) 342 # Filter with a chunked array with different sized chunks 343 p1 <- c(FALSE, TRUE, FALSE, FALSE, TRUE) 344 p2 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE) 345 filt <- ChunkedArray$create(p1, p2, p2) 346 expect_vector( 347 x[filt], 348 c(2, 5, 6, 8, 9, 35, 36, 38, 39, 55) 349 ) 350}) 351 352test_that("ChunkedArray head/tail", { 353 vec <- 11:20 354 a <- ChunkedArray$create(11:15, 16:20) 355 expect_vector(head(a), head(vec)) 356 expect_vector(head(a, 4), head(vec, 4)) 357 expect_vector(head(a, 40), head(vec, 40)) 358 expect_vector(head(a, -4), head(vec, -4)) 359 expect_vector(head(a, -40), head(vec, -40)) 360 expect_vector(tail(a), tail(vec)) 361 expect_vector(tail(a, 4), tail(vec, 4)) 362 expect_vector(tail(a, 40), tail(vec, 40)) 363 expect_vector(tail(a, -40), tail(vec, -40)) 364}) 365 366test_that("ChunkedArray$Equals", { 367 vec <- 11:20 368 a <- ChunkedArray$create(vec[1:5], vec[6:10]) 369 b <- ChunkedArray$create(vec[1:5], vec[6:10]) 370 expect_equal(a, b) 371 expect_true(a$Equals(b)) 372 expect_false(a$Equals(vec)) 373}) 374 375test_that("Converting a chunked array unifies factors (ARROW-8374)", { 376 f1 <- factor(c("a"), levels = c("a", "b")) 377 f2 <- factor(c("c"), levels = c("c", "d")) 378 f3 <- factor(NA, levels = "a") 379 f4 <- factor() 380 381 res <- factor(c("a", "c", NA), levels = c("a", "b", "c", "d")) 382 ca <- ChunkedArray$create(f1, f2, f3, f4) 383 384 expect_identical(ca$as_vector(), res) 385}) 386