1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18skip_if_not_available("dataset")
19
20library(dplyr, warn.conflicts = FALSE)
21library(stringr)
22
# Shared fixture for every test below: start from example_data and attach
# three extra string columns.
tbl <- example_data
tbl[["verses"]] <- verses[[1]]
# Letters a..j padded on both sides to widths 3, 5, 7, ..., 21:
# " a ", "  b  ", "   c   ", ...
tbl[["padded_strings"]] <- stringr::str_pad(
  letters[1:10],
  width = seq(3, 21, by = 2),
  side = "both"
)
# The last ten letters of the alphabet
tbl[["another_chr"]] <- tail(letters, 10)
30
test_that("basic select/filter/collect", {
  batch <- record_batch(tbl)

  # select() + filter() on a RecordBatch should give a query object
  query <- batch %>%
    select(int, chr) %>%
    filter(int > 5)
  expect_s3_class(query, "arrow_dplyr_query")

  # Reference result built with base subsetting; rows whose `int` is NA
  # are explicitly excluded
  keep_rows <- !is.na(tbl$int) & tbl$int > 5
  expect_equal(collect(query), tbl[keep_rows, c("int", "chr")])

  # The source batch must be unchanged by building/collecting the query
  expect_identical(collect(batch), tbl)
})
44
test_that("dim() on query", {
  # dim() is taken on a filtered + column-subset pipeline; the pipeline is
  # passed inline because the helper evaluates it against `.input`
  compare_dplyr_binding(
    .input %>% filter(int > 5) %>% select(int, chr) %>% dim(),
    tbl
  )
})
54
test_that("Print method", {
  # Two filter() steps interleaved with two select() steps; the printed
  # schema should show only the final columns while the filter line combines
  # every predicate, including ones on columns that were later dropped.
  query <- record_batch(tbl) %>%
    filter(dbl > 2, chr == "d" | chr == "f") %>%
    select(chr, int, lgl) %>%
    filter(int < 5) %>%
    select(int, chr)

  expected_output <- 'InMemoryDataset (query)
int: int32
chr: string

* Filter: (((dbl > 2) and ((chr == "d") or (chr == "f"))) and (int < 5))
See $.data for the source Arrow object'

  # fixed = TRUE: the expected text is matched literally, not as a regex
  expect_output(print(query), expected_output, fixed = TRUE)
})
72
test_that("pull", {
  # No column specified
  compare_dplyr_binding(.input %>% pull(), tbl)

  # Column given by position
  compare_dplyr_binding(.input %>% pull(1), tbl)

  # Column given by name
  compare_dplyr_binding(.input %>% pull(chr), tbl)

  # Column referred to by a name introduced with rename(), after a filter()
  compare_dplyr_binding(
    .input %>% filter(int > 4) %>% rename(strng = chr) %>% pull(strng),
    tbl
  )
})
94
test_that("collect(as_data_frame=FALSE)", {
  batch <- record_batch(tbl)

  # Reference data: rows with int > 5 (NA excluded), columns int and chr
  keep_rows <- !is.na(tbl$int) & tbl$int > 5
  expected <- tbl[keep_rows, c("int", "chr")]

  # With no query steps, the RecordBatch comes back as a RecordBatch
  untouched <- collect(batch, as_data_frame = FALSE)
  expect_r6_class(untouched, "RecordBatch")

  # With select() + filter(), the result is a Table
  # (collect(as_data_frame = FALSE) always returns Table now)
  as_table <- batch %>%
    select(int, chr) %>%
    filter(int > 5) %>%
    collect(as_data_frame = FALSE)
  expect_r6_class(as_table, "Table")
  expect_equal(as.data.frame(as_table), expected)

  # Same pipeline, with chr renamed to strng inside select()
  renamed <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    collect(as_data_frame = FALSE)
  expect_r6_class(renamed, "Table")
  expect_equal(as.data.frame(renamed), set_names(expected, c("int", "strng")))

  # Adding group_by() yields an arrow_dplyr_query instead of a Table;
  # converting it to a data frame should give a grouped result
  grouped <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    group_by(int) %>%
    collect(as_data_frame = FALSE)
  expected_grouped <- expected %>%
    rename(strng = chr) %>%
    group_by(int)
  expect_s3_class(grouped, "arrow_dplyr_query")
  expect_equal(as.data.frame(grouped), expected_grouped)
})
132
test_that("compute()", {
  batch <- record_batch(tbl)

  # compute() directly on the RecordBatch gives a RecordBatch
  expect_r6_class(compute(batch), "RecordBatch")

  # Base-R reference for "int > 5" that also drops rows where int is NA
  expected <- tbl[!is.na(tbl$int) & tbl$int > 5, c("int", "chr")]

  # select() + filter() followed by compute() gives a Table
  computed <- batch %>%
    select(int, chr) %>%
    filter(int > 5) %>%
    compute()
  expect_r6_class(computed, "Table")
  expect_equal(as.data.frame(computed), expected)

  # Renaming a column within select() carries through to the Table
  computed_renamed <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    compute()
  expect_r6_class(computed_renamed, "Table")
  expect_equal(
    as.data.frame(computed_renamed),
    set_names(expected, c("int", "strng"))
  )

  # A grouped pipeline comes back from compute() as an arrow_dplyr_query
  computed_grouped <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    group_by(int) %>%
    compute()
  expect_s3_class(computed_grouped, "arrow_dplyr_query")

  expected_grouped <- expected %>%
    rename(strng = chr) %>%
    group_by(int)
  expect_equal(as.data.frame(computed_grouped), expected_grouped)
})
169
test_that("head", {
  batch <- record_batch(tbl)

  # Reference: first two rows of the int > 5 subset (NA int excluded)
  keep_rows <- !is.na(tbl$int) & tbl$int > 5
  expected <- tbl[keep_rows, c("int", "chr")][seq_len(2), ]

  # head() on a query gives another query rather than data
  first_two <- batch %>%
    select(int, chr) %>%
    filter(int > 5) %>%
    head(2)
  expect_s3_class(first_two, "arrow_dplyr_query")
  expect_equal(collect(first_two), expected)

  # With a column renamed in select()
  first_two_renamed <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    head(2)
  expect_s3_class(first_two_renamed, "arrow_dplyr_query")
  expect_equal(
    as.data.frame(first_two_renamed),
    set_names(expected, c("int", "strng"))
  )

  # With grouping applied before head()
  first_two_grouped <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    group_by(int) %>%
    head(2)
  expected_grouped <- expected %>%
    rename(strng = chr) %>%
    group_by(int)
  expect_s3_class(first_two_grouped, "arrow_dplyr_query")
  expect_equal(as.data.frame(first_two_grouped), expected_grouped)

  # Further verbs can be applied after head()
  head_then_mutate <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    head(2) %>%
    mutate(twice = int * 2) %>%
    collect()
  expected_mutated <- expected %>%
    rename(strng = chr) %>%
    mutate(twice = int * 2)
  expect_equal(head_then_mutate, expected_mutated)

  # Order matters: head(2) runs before filter() here, and the expected result
  # is zero rows. If head() were evaluated after filter() this would fail.
  head_then_filter <- batch %>%
    select(int, strng = chr) %>%
    head(2) %>%
    filter(int > 5) %>%
    collect()
  expected_empty <- expected %>%
    rename(strng = chr) %>%
    filter(FALSE)
  expect_equal(head_then_filter, expected_empty)
})
225
test_that("arrange then head returns the right data (ARROW-14162)", {
  # Sorting on mpg alone is ambiguous because mpg has ties, so disp is
  # added as a second sort key to make the row order deterministic.
  compare_dplyr_binding(
    .input %>% arrange(mpg, disp) %>% head(4) %>% collect(),
    mtcars,
    ignore_attr = "row.names"
  )
})
238
test_that("arrange then tail returns the right data", {
  # mpg contains ties; the secondary sort key (disp) makes the ordering,
  # and therefore the last four rows, deterministic.
  compare_dplyr_binding(
    .input %>% arrange(mpg, disp) %>% tail(4) %>% collect(),
    mtcars,
    ignore_attr = "row.names"
  )
})
250
test_that("tail", {
  batch <- record_batch(tbl)

  # Reference: last two rows of the int > 5 subset (NA int excluded)
  keep_rows <- !is.na(tbl$int) & tbl$int > 5
  expected <- tail(tbl[keep_rows, c("int", "chr")], 2)

  # tail() on an arranged query gives another query
  last_two <- batch %>%
    select(int, chr) %>%
    filter(int > 5) %>%
    arrange(int) %>%
    tail(2)
  expect_s3_class(last_two, "arrow_dplyr_query")
  expect_equal(as.data.frame(last_two), expected)

  # With a column renamed in select()
  last_two_renamed <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    arrange(int) %>%
    tail(2)
  expect_s3_class(last_two_renamed, "arrow_dplyr_query")
  expect_equal(
    as.data.frame(last_two_renamed),
    set_names(expected, c("int", "strng"))
  )

  # With grouping applied before arrange() and tail()
  last_two_grouped <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    group_by(int) %>%
    arrange(int) %>%
    tail(2)
  expected_grouped <- expected %>%
    rename(strng = chr) %>%
    group_by(int)
  expect_s3_class(last_two_grouped, "arrow_dplyr_query")
  expect_equal(as.data.frame(last_two_grouped), expected_grouped)
})
286
test_that("No duplicate field names are allowed in an arrow_dplyr_query", {
  # Pattern for the expected error; it is matched as a regular expression
  # against the message, so it only needs to match part of it.
  duplicate_names_pattern <- paste0(
    'The following field names were found more than once in the data: "int", "dbl", ',
    '"dbl2", "lgl", "false", "chr", "fct", "verses", "padded_strings"'
  )

  # Table$create(tbl, tbl) repeats every column of tbl; the whole expression
  # stays inside expect_error() so an error from any step is captured.
  expect_error(
    filter(Table$create(tbl, tbl), int > 0),
    regexp = duplicate_names_pattern
  )
})
297