1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18skip_if_not_installed("duckdb", minimum_version = "0.2.8")
19skip_if_not_installed("dbplyr")
20skip_if_not_available("dataset")
21skip_on_cran()
22
23library(duckdb, quietly = TRUE)
24library(dplyr, warn.conflicts = FALSE)
25
26test_that("to_duckdb", {
27  ds <- InMemoryDataset$create(example_data)
28
29  expect_identical(
30    ds %>%
31      to_duckdb() %>%
32      collect() %>%
33      # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
34      select(!fct),
35    select(example_data, !fct)
36  )
37
38  expect_identical(
39    ds %>%
40      select(int, lgl, dbl) %>%
41      to_duckdb() %>%
42      group_by(lgl) %>%
43      summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>%
44      collect(),
45    tibble::tibble(
46      lgl = c(TRUE, NA, FALSE),
47      mean_int = c(3, 6.25, 8.5),
48      mean_dbl = c(3.1, 6.35, 6.1)
49    )
50  )
51
52  # can group_by before the to_duckdb
53  expect_identical(
54    ds %>%
55      select(int, lgl, dbl) %>%
56      group_by(lgl) %>%
57      to_duckdb() %>%
58      summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>%
59      collect(),
60    tibble::tibble(
61      lgl = c(TRUE, NA, FALSE),
62      mean_int = c(3, 6.25, 8.5),
63      mean_dbl = c(3.1, 6.35, 6.1)
64    )
65  )
66})
67
68test_that("to_duckdb then to_arrow", {
69  ds <- InMemoryDataset$create(example_data)
70
71  ds_rt <- ds %>%
72    to_duckdb() %>%
73    # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
74    select(-fct) %>%
75    to_arrow()
76
77  expect_identical(
78    collect(ds_rt),
79    ds %>%
80      select(-fct) %>%
81      collect()
82  )
83
84  # And we can continue the pipeline
85  ds_rt <- ds %>%
86    to_duckdb() %>%
87    # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
88    select(-fct) %>%
89    to_arrow() %>%
90    filter(int > 5)
91
92  expect_identical(
93    collect(ds_rt),
94    ds %>%
95      select(-fct) %>%
96      filter(int > 5) %>%
97      collect()
98  )
99
100  # Now check errors
101  ds_rt <- ds %>%
102    to_duckdb() %>%
103    # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
104    select(-fct)
105
106  # alter the class of ds_rt's connection to simulate some other database
107  class(ds_rt$src$con) <- "some_other_connection"
108
109  expect_error(
110    to_arrow(ds_rt),
111    "to_arrow\\(\\) currently only supports Arrow tables, Arrow datasets,"
112  )
113})
114
115# The next set of tests use an already-extant connection to test features of
116# persistence and querying against the table without using the `tbl` itself, so
117# we need to create a connection separate from the ephemeral one that is made
118# with arrow_duck_connection()
119con <- dbConnect(duckdb::duckdb())
120dbExecute(con, "PRAGMA threads=2")
121on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE)
122
123# write one table to the connection so it is kept open
124DBI::dbWriteTable(con, "mtcars", mtcars)
125
126test_that("Joining, auto-cleanup enabled", {
127  ds <- InMemoryDataset$create(example_data)
128
129  table_one_name <- "my_arrow_table_1"
130  table_one <- to_duckdb(ds, con = con, table_name = table_one_name, auto_disconnect = TRUE)
131  table_two_name <- "my_arrow_table_2"
132  table_two <- to_duckdb(ds, con = con, table_name = table_two_name, auto_disconnect = TRUE)
133
134  res <- dbGetQuery(
135    con,
136    paste0(
137      "SELECT * FROM ", table_one_name,
138      " INNER JOIN ", table_two_name,
139      " ON ", table_one_name, ".int = ", table_two_name, ".int"
140    )
141  )
142  expect_identical(dim(res), c(9L, 14L))
143
144  # clean up cleans up the tables
145  expect_true(all(c(table_one_name, table_two_name) %in% DBI::dbListTables(con)))
146  rm(table_one, table_two)
147  gc()
148  expect_false(any(c(table_one_name, table_two_name) %in% DBI::dbListTables(con)))
149})
150
151test_that("Joining, auto-cleanup disabled", {
152  ds <- InMemoryDataset$create(example_data)
153
154  table_three_name <- "my_arrow_table_3"
155  table_three <- to_duckdb(ds, con = con, table_name = table_three_name)
156
157  # clean up does *not* clean these tables
158  expect_true(table_three_name %in% DBI::dbListTables(con))
159  rm(table_three)
160  gc()
161  # but because we aren't auto_disconnecting then we still have this table.
162  expect_true(table_three_name %in% DBI::dbListTables(con))
163})
164
165test_that("to_duckdb with a table", {
166  tab <- Table$create(example_data)
167
168  expect_identical(
169    tab %>%
170      to_duckdb() %>%
171      group_by(int > 4) %>%
172      summarise(
173        int_mean = mean(int, na.rm = TRUE),
174        dbl_mean = mean(dbl, na.rm = TRUE)
175      ) %>%
176      collect(),
177    tibble::tibble(
178      "int > 4" = c(FALSE, NA, TRUE),
179      int_mean = c(2, NA, 7.5),
180      dbl_mean = c(2.1, 4.1, 7.3)
181    )
182  )
183})
184
185test_that("to_duckdb passing a connection", {
186  ds <- InMemoryDataset$create(example_data)
187
188  con_separate <- dbConnect(duckdb::duckdb())
189  # we always want to test in parallel
190  dbExecute(con_separate, "PRAGMA threads=2")
191  on.exit(dbDisconnect(con_separate, shutdown = TRUE), add = TRUE)
192
193  # create a table to join to that we know is in our con_separate
194  new_df <- data.frame(
195    int = 1:10,
196    char = letters[26:17],
197    stringsAsFactors = FALSE
198  )
199  DBI::dbWriteTable(con_separate, "separate_join_table", new_df)
200
201  table_four <- ds %>%
202    select(int, lgl, dbl) %>%
203    to_duckdb(con = con_separate, auto_disconnect = FALSE)
204  table_four_name <- table_four$ops$x
205
206  result <- DBI::dbGetQuery(
207    con_separate,
208    paste0(
209      "SELECT * FROM ", table_four_name,
210      " INNER JOIN separate_join_table ",
211      "ON separate_join_table.int = ", table_four_name, ".int"
212    )
213  )
214
215  expect_identical(dim(result), c(9L, 5L))
216  expect_identical(result$char, new_df[new_df$int != 4, ]$char)
217})
218