1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18context("ChunkedArray")
19
20expect_chunked_roundtrip <- function(x, type) {
21  a <- ChunkedArray$create(!!!x)
22  flat_x <- unlist(x, recursive = FALSE)
23  attributes(flat_x) <- attributes(x[[1]])
24  expect_type_equal(a$type, type)
25  expect_identical(a$num_chunks, length(x))
26  expect_identical(length(a), length(flat_x))
27  if (!inherits(type, "ListType")) {
28    # TODO: revisit how missingness works with ListArrays
29    # R list objects don't handle missingness the same way as other vectors.
30    # Is there some vctrs thing we should do on the roundtrip back to R?
31    expect_identical(is.na(a), is.na(flat_x))
32  }
33  expect_equal(as.vector(a), flat_x)
34  expect_equal(as.vector(a$chunk(0)), x[[1]])
35
36  if (length(flat_x)) {
37    a_sliced <- a$Slice(1)
38    x_sliced <- flat_x[-1]
39    expect_type_equal(a_sliced$type, type)
40    expect_identical(length(a_sliced), length(x_sliced))
41    if (!inherits(type, "ListType")) {
42      expect_identical(is.na(a_sliced), is.na(x_sliced))
43    }
44    expect_equal(as.vector(a_sliced), x_sliced)
45  }
46  invisible(a)
47}
48
49test_that("ChunkedArray", {
50  x <- expect_chunked_roundtrip(list(1:10, 1:10, 1:5), int32())
51
52  y <- x$Slice(8)
53  expect_equal(y$type, int32())
54  expect_equal(y$num_chunks, 3L)
55  expect_equal(length(y), 17L)
56  expect_equal(as.vector(y), c(9:10, 1:10, 1:5))
57
58  z <- x$Slice(8, 5)
59  expect_equal(z$type, int32())
60  expect_equal(z$num_chunks, 2L)
61  expect_equal(z$length(), 5L)
62  expect_equal(z$as_vector(), c(9:10, 1:3))
63
64  expect_chunked_roundtrip(list(c(1,2,3), c(4,5,6)), float64())
65
66  # input validation
67  expect_error(x$chunk(14), "subscript out of bounds")
68  expect_error(x$chunk("one"), class = "Rcpp::not_compatible")
69  expect_error(x$chunk(NA_integer_), "'i' cannot be NA")
70  expect_error(x$chunk(-1), "subscript out of bounds")
71
72  expect_error(x$Slice("ten"), class = "Rcpp::not_compatible")
73  expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA")
74  expect_error(x$Slice(NA), "Slice 'offset' cannot be NA")
75  expect_error(x$Slice(10, "ten"), class = "Rcpp::not_compatible")
76  expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
77  expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
78  expect_error(x$Slice(c(10, 10)), class = "Rcpp::not_compatible")
79  expect_error(x$Slice(10, c(10, 10)), class = "Rcpp::not_compatible")
80  expect_error(x$Slice(1000), "Slice 'offset' greater than array length")
81  expect_error(x$Slice(-1), "Slice 'offset' cannot be negative")
82  expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length")
83  expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative")
84  expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative")
85
86  expect_warning(x$Slice(10, 15), NA)
87  expect_warning(
88    overslice <- x$Slice(10, 16),
89    "Slice 'length' greater than available length"
90  )
91  expect_equal(length(overslice), 15)
92  expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length")
93})
94
95test_that("print ChunkedArray", {
96  x1 <- chunked_array(c(1,2,3), c(4,5,6))
97  expect_output(
98    print(x1),
99    paste(
100      "ChunkedArray",
101      "<double>",
102      "[",
103      "  1,",
104      "  2,",
105      "  3,",
106      "  ...",
107      "]",
108      sep = "\n"
109    ),
110    fixed = TRUE
111  )
112  x2 <- chunked_array(1:30, c(4,5,6))
113  expect_output(
114    print(x2),
115    paste(
116      "ChunkedArray",
117      "<int32>",
118      "[",
119      "  1,",
120      "  2,",
121      "  3,",
122      "  4,",
123      "  5,",
124      "  6,",
125      "  7,",
126      "  8,",
127      "  9,",
128      "  10,",
129      "  ...",
130      "]",
131      sep = "\n"
132    ),
133    fixed = TRUE
134  )
135  # If there's only one chunk, it should look like a regular Array
136  x3 <- chunked_array(1:30)
137  expect_output(
138    print(x3),
139    paste0("Chunked", paste(capture.output(print(Array$create(1:30))), collapse = "\n")),
140    fixed = TRUE
141  )
142})
143
144test_that("ChunkedArray handles !!! splicing", {
145  data <- list(1, 2, 3)
146  x <- chunked_array(!!!data)
147  expect_equal(x$type, float64())
148  expect_equal(x$num_chunks, 3L)
149})
150
151test_that("ChunkedArray handles NA", {
152  data <- list(1:10, c(NA, 2:10), c(1:3, NA, 5L))
153  x <- chunked_array(!!!data)
154  expect_equal(x$type, int32())
155  expect_equal(x$num_chunks, 3L)
156  expect_equal(length(x), 25L)
157  expect_equal(as.vector(x), c(1:10, c(NA, 2:10), c(1:3, NA, 5)))
158
159  chunks <- x$chunks
160  expect_equal(is.na(chunks[[1]]), is.na(data[[1]]))
161  expect_equal(is.na(chunks[[2]]), is.na(data[[2]]))
162  expect_equal(is.na(chunks[[3]]), is.na(data[[3]]))
163  expect_equal(is.na(x), c(is.na(data[[1]]), is.na(data[[2]]), is.na(data[[3]])))
164})
165
166test_that("ChunkedArray supports logical vectors (ARROW-3341)", {
167  # with NA
168  data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE))
169  expect_chunked_roundtrip(data, bool())
170  # without NA
171  data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE))
172  expect_chunked_roundtrip(data, bool())
173})
174
175test_that("ChunkedArray supports character vectors (ARROW-3339)", {
176  data <- list(
177    c("itsy", NA, "spider"),
178    c("Climbed", "up", "the", "water", "spout"),
179    c("Down", "came", "the", "rain"),
180    "And washed the spider out. "
181  )
182  expect_chunked_roundtrip(data, utf8())
183})
184
185test_that("ChunkedArray supports factors (ARROW-3716)", {
186  f <- factor(c("itsy", "bitsy", "spider", "spider"))
187  expect_chunked_roundtrip(list(f, f, f), dictionary(int8()))
188})
189
190test_that("ChunkedArray supports dates (ARROW-3716)", {
191  d <- Sys.Date() + 1:10
192  expect_chunked_roundtrip(list(d, d), date32())
193})
194
195test_that("ChunkedArray supports POSIXct (ARROW-3716)", {
196  times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10
197  expect_chunked_roundtrip(list(times, times), timestamp("us", "UTC"))
198})
199
200test_that("ChunkedArray supports integer64 (ARROW-3716)", {
201  x <- bit64::as.integer64(1:10)
202  expect_chunked_roundtrip(list(x, x), int64())
203})
204
205test_that("ChunkedArray supports difftime", {
206  time <- hms::hms(56, 34, 12)
207  expect_chunked_roundtrip(list(time, time), time32("s"))
208})
209
210test_that("integer types casts for ChunkedArray (ARROW-3741)", {
211  int_types <- c(int8(), int16(), int32(), int64())
212  uint_types <- c(uint8(), uint16(), uint32(), uint64())
213  float_types <- c(float32(), float64()) # float16() not really supported in C++ yet
214  all_types <- c(
215    int_types,
216    uint_types,
217    float_types
218  )
219
220  a <- chunked_array(1:10, 1:10)
221  for (type in c(int_types, uint_types)) {
222    casted <- a$cast(type)
223    expect_is(casted, "ChunkedArray")
224    expect_type_equal(casted$type, type)
225  }
226  # Also test casting to double(), not actually a type, a base R function but should be alias for float64
227  dbl <- a$cast(double())
228  expect_is(dbl, "ChunkedArray")
229  expect_type_equal(dbl$type, float64())
230})
231
232test_that("chunked_array() supports the type= argument. conversion from INTSXP and int64 to all int types", {
233  num_int32 <- 12L
234  num_int64 <- bit64::as.integer64(10)
235  for (type in all_types) {
236    expect_type_equal(chunked_array(num_int32, type = type)$type, type)
237    expect_type_equal(chunked_array(num_int64, type = type)$type, type)
238  }
239  # also test creating with double() "type"
240  expect_type_equal(chunked_array(num_int32, type = double())$type, float64())
241})
242
243test_that("ChunkedArray$create() aborts on overflow", {
244  expect_error(chunked_array(128L, type = int8())$type, "Invalid.*Value is too large")
245  expect_error(chunked_array(-129L, type = int8())$type, "Invalid.*Value is too large")
246
247  expect_error(chunked_array(256L, type = uint8())$type, "Invalid.*Value is too large")
248  expect_error(chunked_array(-1L, type = uint8())$type, "Invalid.*Value is too large")
249
250  expect_error(chunked_array(32768L, type = int16())$type, "Invalid.*Value is too large")
251  expect_error(chunked_array(-32769L, type = int16())$type, "Invalid.*Value is too large")
252
253  expect_error(chunked_array(65536L, type = uint16())$type, "Invalid.*Value is too large")
254  expect_error(chunked_array(-1L, type = uint16())$type, "Invalid.*Value is too large")
255
256  expect_error(chunked_array(65536L, type = uint16())$type, "Invalid.*Value is too large")
257  expect_error(chunked_array(-1L, type = uint16())$type, "Invalid.*Value is too large")
258
259  expect_error(chunked_array(bit64::as.integer64(2^31), type = int32()), "Invalid.*Value is too large")
260  expect_error(chunked_array(bit64::as.integer64(2^32), type = uint32()), "Invalid.*Value is too large")
261})
262
263test_that("chunked_array() convert doubles to integers", {
264  for (type in c(int_types, uint_types)) {
265    a <- chunked_array(10, type = type)
266    expect_type_equal(a$type, type)
267    if (type != uint64()) {
268      # exception for unsigned integer 64 that
269      # wa cannot handle yet
270      expect_true(as.vector(a) == 10)
271    }
272  }
273})
274
275test_that("chunked_array() uses the first ... to infer type", {
276  a <- chunked_array(10, 10L)
277  expect_type_equal(a$type, float64())
278})
279
280test_that("chunked_array() handles downcasting", {
281   a <- chunked_array(10L, 10)
282   expect_type_equal(a$type, int32())
283   expect_equal(as.vector(a), c(10L, 10L))
284})
285
286test_that("chunked_array() makes chunks of the same type", {
287  a <- chunked_array(10L, bit64::as.integer64(13), type = int64())
288  for(chunk in a$chunks) {
289    expect_type_equal(chunk$type, int64())
290  }
291})
292
293test_that("chunked_array() handles 0 chunks if given a type", {
294  for (type in all_types) {
295    a <- chunked_array(type = type)
296    expect_type_equal(a$type, as_type(type))
297    expect_equal(length(a), 0L)
298  }
299})
300
301test_that("chunked_array() can ingest arrays (ARROW-3815)", {
302  expect_equal(
303    as.vector(chunked_array(1:5, Array$create(6:10))),
304    1:10
305  )
306})
307
308test_that("chunked_array() handles data frame -> struct arrays (ARROW-3811)", {
309  df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
310  a <- chunked_array(df, df)
311  expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8()))
312  expect_equivalent(a$as_vector(), rbind(df, df))
313})
314
315test_that("ChunkedArray$View() (ARROW-6542)", {
316  a <- ChunkedArray$create(1:3, 1:4)
317  b <- a$View(float32())
318  expect_equal(b$type, float32())
319  expect_equal(length(b), 7L)
320  expect_true(all(
321    sapply(b$chunks, function(.x) .x$type == float32())
322  ))
323  # Input validation
324  expect_error(a$View("not a type"), "type must be a DataType, not character")
325})
326
327test_that("ChunkedArray$Validate()", {
328  a <- ChunkedArray$create(1:10)
329  expect_error(a$Validate(), NA)
330})
331
332test_that("[ ChunkedArray", {
333  one_chunk <- chunked_array(2:11)
334  x <- chunked_array(1:10, 31:40, 51:55)
335  # Slice
336  expect_vector(x[8:12], c(8:10, 31:32))
337  # Take from same chunk
338  expect_vector(x[c(11, 15, 12)], c(31, 35, 32))
339  # Take from multiple chunks (calls Concatenate)
340  expect_vector(x[c(2, 11, 15, 12, 3)], c(2, 31, 35, 32, 3))
341  # Take with Array (note these are 0-based)
342  take1 <- Array$create(c(10L, 14L, 11L))
343  expect_vector(x[take1], c(31, 35, 32))
344  # Take with ChunkedArray
345  take2 <- ChunkedArray$create(c(10L, 14L), 11L)
346  expect_vector(x[take2], c(31, 35, 32))
347
348  # Filter (with recycling)
349  expect_vector(
350    one_chunk[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
351    c(3, 6, 8, 11)
352  )
353  # Filter where both are 1-chunk
354  expect_vector(
355    one_chunk[ChunkedArray$create(rep(c(FALSE, TRUE, FALSE, FALSE, TRUE), 2))],
356    c(3, 6, 8, 11)
357  )
358  # Filter multi-chunk with logical (-> Array)
359  expect_vector(
360    x[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
361    c(2, 5, 7, 10, 32, 35, 37, 40, 52, 55)
362  )
363  # Filter with a chunked array with different sized chunks
364  p1 <- c(FALSE, TRUE, FALSE, FALSE, TRUE)
365  p2 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)
366  filt <- ChunkedArray$create(p1, p2, p2)
367  expect_vector(
368    x[filt],
369    c(2, 5, 6, 8, 9, 35, 36, 38, 39, 55)
370  )
371})
372
373test_that("ChunkedArray head/tail", {
374  vec <- 11:20
375  a <- ChunkedArray$create(11:15, 16:20)
376  expect_vector(head(a), head(vec))
377  expect_vector(head(a, 4), head(vec, 4))
378  expect_vector(head(a, 40), head(vec, 40))
379  expect_vector(head(a, -4), head(vec, -4))
380  expect_vector(head(a, -40), head(vec, -40))
381  expect_vector(tail(a), tail(vec))
382  expect_vector(tail(a, 4), tail(vec, 4))
383  expect_vector(tail(a, 40), tail(vec, 40))
384  expect_vector(tail(a, -40), tail(vec, -40))
385})
386
387test_that("ChunkedArray$Equals", {
388  vec <- 11:20
389  a <- ChunkedArray$create(vec[1:5], vec[6:10])
390  b <- ChunkedArray$create(vec[1:5], vec[6:10])
391  expect_equal(a, b)
392  expect_true(a$Equals(b))
393  expect_false(a$Equals(vec))
394})
395