1# Licensed to the Apache Software Foundation (ASF) under one 2# or more contributor license agreements. See the NOTICE file 3# distributed with this work for additional information 4# regarding copyright ownership. The ASF licenses this file 5# to you under the Apache License, Version 2.0 (the 6# "License"); you may not use this file except in compliance 7# with the License. You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, 12# software distributed under the License is distributed on an 13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14# KIND, either express or implied. See the License for the 15# specific language governing permissions and limitations 16# under the License. 17 18skip_if_not_installed("duckdb", minimum_version = "0.2.8") 19skip_if_not_installed("dbplyr") 20skip_if_not_available("dataset") 21skip_on_cran() 22 23library(duckdb, quietly = TRUE) 24library(dplyr, warn.conflicts = FALSE) 25 26test_that("to_duckdb", { 27 ds <- InMemoryDataset$create(example_data) 28 29 expect_identical( 30 ds %>% 31 to_duckdb() %>% 32 collect() %>% 33 # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879 34 select(!fct), 35 select(example_data, !fct) 36 ) 37 38 expect_identical( 39 ds %>% 40 select(int, lgl, dbl) %>% 41 to_duckdb() %>% 42 group_by(lgl) %>% 43 summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>% 44 collect(), 45 tibble::tibble( 46 lgl = c(TRUE, NA, FALSE), 47 mean_int = c(3, 6.25, 8.5), 48 mean_dbl = c(3.1, 6.35, 6.1) 49 ) 50 ) 51 52 # can group_by before the to_duckdb 53 expect_identical( 54 ds %>% 55 select(int, lgl, dbl) %>% 56 group_by(lgl) %>% 57 to_duckdb() %>% 58 summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>% 59 collect(), 60 tibble::tibble( 61 lgl = c(TRUE, NA, FALSE), 62 mean_int = c(3, 6.25, 8.5), 63 mean_dbl = c(3.1, 6.35, 6.1) 64 ) 65 ) 66}) 67 68test_that("to_duckdb then to_arrow", { 69 ds <- InMemoryDataset$create(example_data) 70 71 ds_rt <- ds %>% 72 to_duckdb() %>% 73 # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879 74 select(-fct) %>% 75 to_arrow() 76 77 expect_identical( 78 collect(ds_rt), 79 ds %>% 80 select(-fct) %>% 81 collect() 82 ) 83 84 # And we can continue the pipeline 85 ds_rt <- ds %>% 86 to_duckdb() %>% 87 # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879 88 select(-fct) %>% 89 to_arrow() %>% 90 filter(int > 5) 91 92 expect_identical( 93 collect(ds_rt), 94 ds %>% 95 select(-fct) %>% 96 filter(int > 5) %>% 97 collect() 98 ) 99 100 # Now check errors 101 ds_rt <- ds %>% 102 to_duckdb() %>% 103 # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879 104 select(-fct) 105 106 # alter the class of ds_rt's connection to simulate some other database 107 class(ds_rt$src$con) <- "some_other_connection" 108 109 expect_error( 110 to_arrow(ds_rt), 111 "to_arrow\\(\\) currently only supports Arrow tables, Arrow datasets," 112 ) 113}) 114 115# The next set of tests use an already-extant connection to test features of 116# persistence and querying against the table without using the `tbl` itself, so 117# we need to create a connection separate from the ephemeral one that is made 118# with arrow_duck_connection() 119con <- dbConnect(duckdb::duckdb()) 120dbExecute(con, "PRAGMA threads=2") 121on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE) 122 123# write one table to the connection so it is kept open 124DBI::dbWriteTable(con, "mtcars", mtcars) 125 126test_that("Joining, auto-cleanup enabled", { 127 ds <- InMemoryDataset$create(example_data) 128 129 table_one_name <- "my_arrow_table_1" 130 table_one <- to_duckdb(ds, con = con, table_name = table_one_name, auto_disconnect = TRUE) 131 table_two_name <- "my_arrow_table_2" 132 table_two <- to_duckdb(ds, con = con, table_name = table_two_name, auto_disconnect = TRUE) 133 134 res <- dbGetQuery( 135 con, 136 paste0( 137 "SELECT * FROM ", table_one_name, 138 " INNER JOIN ", table_two_name, 139 " ON ", table_one_name, ".int = ", table_two_name, ".int" 140 ) 141 ) 142 expect_identical(dim(res), c(9L, 14L)) 143 144 # clean up cleans up the tables 145 expect_true(all(c(table_one_name, table_two_name) %in% DBI::dbListTables(con))) 146 rm(table_one, table_two) 147 gc() 148 expect_false(any(c(table_one_name, table_two_name) %in% DBI::dbListTables(con))) 149}) 150 151test_that("Joining, auto-cleanup disabled", { 152 ds <- InMemoryDataset$create(example_data) 153 154 table_three_name <- "my_arrow_table_3" 155 table_three <- to_duckdb(ds, con = con, table_name = table_three_name) 156 157 # clean up does *not* clean these tables 158 expect_true(table_three_name %in% DBI::dbListTables(con)) 159 rm(table_three) 160 gc() 161 # but because we aren't auto_disconnecting then we still have this table. 162 expect_true(table_three_name %in% DBI::dbListTables(con)) 163}) 164 165test_that("to_duckdb with a table", { 166 tab <- Table$create(example_data) 167 168 expect_identical( 169 tab %>% 170 to_duckdb() %>% 171 group_by(int > 4) %>% 172 summarise( 173 int_mean = mean(int, na.rm = TRUE), 174 dbl_mean = mean(dbl, na.rm = TRUE) 175 ) %>% 176 collect(), 177 tibble::tibble( 178 "int > 4" = c(FALSE, NA, TRUE), 179 int_mean = c(2, NA, 7.5), 180 dbl_mean = c(2.1, 4.1, 7.3) 181 ) 182 ) 183}) 184 185test_that("to_duckdb passing a connection", { 186 ds <- InMemoryDataset$create(example_data) 187 188 con_separate <- dbConnect(duckdb::duckdb()) 189 # we always want to test in parallel 190 dbExecute(con_separate, "PRAGMA threads=2") 191 on.exit(dbDisconnect(con_separate, shutdown = TRUE), add = TRUE) 192 193 # create a table to join to that we know is in our con_separate 194 new_df <- data.frame( 195 int = 1:10, 196 char = letters[26:17], 197 stringsAsFactors = FALSE 198 ) 199 DBI::dbWriteTable(con_separate, "separate_join_table", new_df) 200 201 table_four <- ds %>% 202 select(int, lgl, dbl) %>% 203 to_duckdb(con = con_separate, auto_disconnect = FALSE) 204 table_four_name <- table_four$ops$x 205 206 result <- DBI::dbGetQuery( 207 con_separate, 208 paste0( 209 "SELECT * FROM ", table_four_name, 210 " INNER JOIN separate_join_table ", 211 "ON separate_join_table.int = ", table_four_name, ".int" 212 ) 213 ) 214 215 expect_identical(dim(result), c(9L, 5L)) 216 expect_identical(result$char, new_df[new_df$int != 4, ]$char) 217}) 218