1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18skip_if_not_available("dataset")
19skip_if_not_available("utf8proc")
20
21library(dplyr, warn.conflicts = FALSE)
22library(lubridate)
23library(stringr)
24library(stringi)
25
26test_that("paste, paste0, and str_c", {
27  df <- tibble(
28    v = c("A", "B", "C"),
29    w = c("a", "b", "c"),
30    x = c("d", NA_character_, "f"),
31    y = c(NA_character_, "h", "i"),
32    z = c(1.1, 2.2, NA)
33  )
34  x <- Expression$field_ref("x")
35  y <- Expression$field_ref("y")
36
37  # no NAs in data
38  compare_dplyr_binding(
39    .input %>%
40      transmute(paste(v, w)) %>%
41      collect(),
42    df
43  )
44  compare_dplyr_binding(
45    .input %>%
46      transmute(paste(v, w, sep = "-")) %>%
47      collect(),
48    df
49  )
50  compare_dplyr_binding(
51    .input %>%
52      transmute(paste0(v, w)) %>%
53      collect(),
54    df
55  )
56  compare_dplyr_binding(
57    .input %>%
58      transmute(str_c(v, w)) %>%
59      collect(),
60    df
61  )
62  compare_dplyr_binding(
63    .input %>%
64      transmute(str_c(v, w, sep = "+")) %>%
65      collect(),
66    df
67  )
68
69  # NAs in data
70  compare_dplyr_binding(
71    .input %>%
72      transmute(paste(x, y)) %>%
73      collect(),
74    df
75  )
76  compare_dplyr_binding(
77    .input %>%
78      transmute(paste(x, y, sep = "-")) %>%
79      collect(),
80    df
81  )
82  compare_dplyr_binding(
83    .input %>%
84      transmute(str_c(x, y)) %>%
85      collect(),
86    df
87  )
88
89  # non-character column in dots
90  compare_dplyr_binding(
91    .input %>%
92      transmute(paste0(x, y, z)) %>%
93      collect(),
94    df
95  )
96
97  # literal string in dots
98  compare_dplyr_binding(
99    .input %>%
100      transmute(paste(x, "foo", y)) %>%
101      collect(),
102    df
103  )
104
105  # literal NA in dots
106  compare_dplyr_binding(
107    .input %>%
108      transmute(paste(x, NA, y)) %>%
109      collect(),
110    df
111  )
112
113  # expressions in dots
114  compare_dplyr_binding(
115    .input %>%
116      transmute(paste0(x, toupper(y), as.character(z))) %>%
117      collect(),
118    df
119  )
120
121  # sep is literal NA
122  # errors in paste() (consistent with base::paste())
123  expect_error(
124    nse_funcs$paste(x, y, sep = NA_character_),
125    "Invalid separator"
126  )
127  # emits null in str_c() (consistent with stringr::str_c())
128  compare_dplyr_binding(
129    .input %>%
130      transmute(str_c(x, y, sep = NA_character_)) %>%
131      collect(),
132    df
133  )
134
135  # sep passed in dots to paste0 (which doesn't take a sep argument)
136  compare_dplyr_binding(
137    .input %>%
138      transmute(paste0(x, y, sep = "-")) %>%
139      collect(),
140    df
141  )
142
143  # known differences
144
145  # arrow allows the separator to be an array
146  expect_equal(
147    df %>%
148      Table$create() %>%
149      transmute(result = paste(x, y, sep = w)) %>%
150      collect(),
151    df %>%
152      transmute(result = paste(x, w, y, sep = ""))
153  )
154
155  # expected errors
156
157  # collapse argument not supported
158  expect_error(
159    nse_funcs$paste(x, y, collapse = ""),
160    "collapse"
161  )
162  expect_error(
163    nse_funcs$paste0(x, y, collapse = ""),
164    "collapse"
165  )
166  expect_error(
167    nse_funcs$str_c(x, y, collapse = ""),
168    "collapse"
169  )
170
171  # literal vectors of length != 1 not supported
172  expect_error(
173    nse_funcs$paste(x, character(0), y),
174    "Literal vectors of length != 1 not supported in string concatenation"
175  )
176  expect_error(
177    nse_funcs$paste(x, c(",", ";"), y),
178    "Literal vectors of length != 1 not supported in string concatenation"
179  )
180})
181
182test_that("grepl with ignore.case = FALSE and fixed = TRUE", {
183  df <- tibble(x = c("Foo", "bar"))
184  compare_dplyr_binding(
185    .input %>%
186      filter(grepl("o", x, fixed = TRUE)) %>%
187      collect(),
188    df
189  )
190})
191
192test_that("sub and gsub with ignore.case = FALSE and fixed = TRUE", {
193  df <- tibble(x = c("Foo", "bar"))
194  compare_dplyr_binding(
195    .input %>%
196      transmute(x = sub("Foo", "baz", x, fixed = TRUE)) %>%
197      collect(),
198    df
199  )
200  compare_dplyr_binding(
201    .input %>%
202      transmute(x = gsub("o", "u", x, fixed = TRUE)) %>%
203      collect(),
204    df
205  )
206})
207
208# many of the remainder of these tests require RE2
209skip_if_not_available("re2")
210
211test_that("grepl", {
212  df <- tibble(x = c("Foo", "bar"))
213
214  for (fixed in c(TRUE, FALSE)) {
215    compare_dplyr_binding(
216      .input %>%
217        filter(grepl("Foo", x, fixed = fixed)) %>%
218        collect(),
219      df
220    )
221    compare_dplyr_binding(
222      .input %>%
223        transmute(x = grepl("^B.+", x, ignore.case = FALSE, fixed = fixed)) %>%
224        collect(),
225      df
226    )
227    compare_dplyr_binding(
228      .input %>%
229        filter(grepl("Foo", x, ignore.case = FALSE, fixed = fixed)) %>%
230        collect(),
231      df
232    )
233  }
234})
235
236test_that("grepl with ignore.case = TRUE and fixed = TRUE", {
237  df <- tibble(x = c("Foo", "bar"))
238
239  # base::grepl() ignores ignore.case = TRUE with a warning when fixed = TRUE,
240  # so we can't use compare_dplyr_binding() for these tests
241  expect_equal(
242    df %>%
243      Table$create() %>%
244      filter(grepl("O", x, ignore.case = TRUE, fixed = TRUE)) %>%
245      collect(),
246    tibble(x = "Foo")
247  )
248  expect_equal(
249    df %>%
250      Table$create() %>%
251      filter(x = grepl("^B.+", x, ignore.case = TRUE, fixed = TRUE)) %>%
252      collect(),
253    tibble(x = character(0))
254  )
255})
256
257test_that("str_detect", {
258  df <- tibble(x = c("Foo", "bar"))
259
260  compare_dplyr_binding(
261    .input %>%
262      filter(str_detect(x, regex("^F"))) %>%
263      collect(),
264    df
265  )
266  compare_dplyr_binding(
267    .input %>%
268      transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE))) %>%
269      collect(),
270    df
271  )
272  compare_dplyr_binding(
273    .input %>%
274      transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE), negate = TRUE)) %>%
275      collect(),
276    df
277  )
278  compare_dplyr_binding(
279    .input %>%
280      filter(str_detect(x, fixed("o"))) %>%
281      collect(),
282    df
283  )
284  compare_dplyr_binding(
285    .input %>%
286      filter(str_detect(x, fixed("O"))) %>%
287      collect(),
288    df
289  )
290  compare_dplyr_binding(
291    .input %>%
292      filter(str_detect(x, fixed("O", ignore_case = TRUE))) %>%
293      collect(),
294    df
295  )
296  compare_dplyr_binding(
297    .input %>%
298      filter(str_detect(x, fixed("O", ignore_case = TRUE), negate = TRUE)) %>%
299      collect(),
300    df
301  )
302})
303
304test_that("sub and gsub", {
305  df <- tibble(x = c("Foo", "bar"))
306
307  for (fixed in c(TRUE, FALSE)) {
308    compare_dplyr_binding(
309      .input %>%
310        transmute(x = sub("Foo", "baz", x, fixed = fixed)) %>%
311        collect(),
312      df
313    )
314    compare_dplyr_binding(
315      .input %>%
316        transmute(x = sub("^B.+", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
317        collect(),
318      df
319    )
320    compare_dplyr_binding(
321      .input %>%
322        transmute(x = sub("Foo", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
323        collect(),
324      df
325    )
326  }
327})
328
329test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", {
330  df <- tibble(x = c("Foo", "bar"))
331
332  # base::sub() and base::gsub() ignore ignore.case = TRUE with a warning when
333  # fixed = TRUE, so we can't use compare_dplyr_binding() for these tests
334  expect_equal(
335    df %>%
336      Table$create() %>%
337      transmute(x = sub("O", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
338      collect(),
339    tibble(x = c("Fuo", "bar"))
340  )
341  expect_equal(
342    df %>%
343      Table$create() %>%
344      transmute(x = gsub("o", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
345      collect(),
346    tibble(x = c("Fuu", "bar"))
347  )
348  expect_equal(
349    df %>%
350      Table$create() %>%
351      transmute(x = sub("^B.+", "baz", x, ignore.case = TRUE, fixed = TRUE)) %>%
352      collect(),
353    df # unchanged
354  )
355})
356
357test_that("str_replace and str_replace_all", {
358  df <- tibble(x = c("Foo", "bar"))
359
360  compare_dplyr_binding(
361    .input %>%
362      transmute(x = str_replace_all(x, "^F", "baz")) %>%
363      collect(),
364    df
365  )
366
367  compare_dplyr_binding(
368    .input %>%
369      transmute(x = str_replace_all(x, regex("^F"), "baz")) %>%
370      collect(),
371    df
372  )
373
374  compare_dplyr_binding(
375    .input %>%
376      mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>%
377      collect(),
378    df
379  )
380
381  compare_dplyr_binding(
382    .input %>%
383      transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>%
384      collect(),
385    df
386  )
387  compare_dplyr_binding(
388    .input %>%
389      transmute(x = str_replace_all(x, fixed("o"), "u")) %>%
390      collect(),
391    df
392  )
393  compare_dplyr_binding(
394    .input %>%
395      transmute(x = str_replace(x, fixed("O"), "u")) %>%
396      collect(),
397    df
398  )
399  compare_dplyr_binding(
400    .input %>%
401      transmute(x = str_replace(x, fixed("O", ignore_case = TRUE), "u")) %>%
402      collect(),
403    df
404  )
405})
406
407test_that("strsplit and str_split", {
408  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
409
410  compare_dplyr_binding(
411    .input %>%
412      mutate(x = strsplit(x, "and")) %>%
413      collect(),
414    df,
415    # `ignore_attr = TRUE` because the vctr coming back from arrow (ListArray)
416    # has type information in it, but it's just a bare list from R/dplyr.
417    ignore_attr = TRUE
418  )
419  compare_dplyr_binding(
420    .input %>%
421      mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
422      collect(),
423    df,
424    ignore_attr = TRUE
425  )
426  compare_dplyr_binding(
427    .input %>%
428      mutate(x = strsplit(x, " +and +")) %>%
429      collect(),
430    df,
431    ignore_attr = TRUE
432  )
433  compare_dplyr_binding(
434    .input %>%
435      mutate(x = str_split(x, "and")) %>%
436      collect(),
437    df,
438    ignore_attr = TRUE
439  )
440  compare_dplyr_binding(
441    .input %>%
442      mutate(x = str_split(x, "and", n = 2)) %>%
443      collect(),
444    df,
445    ignore_attr = TRUE
446  )
447  compare_dplyr_binding(
448    .input %>%
449      mutate(x = str_split(x, fixed("and"), n = 2)) %>%
450      collect(),
451    df,
452    ignore_attr = TRUE
453  )
454  compare_dplyr_binding(
455    .input %>%
456      mutate(x = str_split(x, regex("and"), n = 2)) %>%
457      collect(),
458    df,
459    ignore_attr = TRUE
460  )
461  compare_dplyr_binding(
462    .input %>%
463      mutate(x = str_split(x, "Foo|bar", n = 2)) %>%
464      collect(),
465    df,
466    ignore_attr = TRUE
467  )
468})
469
470test_that("str_to_lower, str_to_upper, and str_to_title", {
471  df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!"))
472  compare_dplyr_binding(
473    .input %>%
474      transmute(
475        x_lower = str_to_lower(x),
476        x_upper = str_to_upper(x),
477        x_title = str_to_title(x)
478      ) %>%
479      collect(),
480    df
481  )
482
483  # Error checking a single function because they all use the same code path.
484  expect_error(
485    nse_funcs$str_to_lower("Apache Arrow", locale = "sp"),
486    "Providing a value for 'locale' other than the default ('en') is not supported by Arrow",
487    fixed = TRUE
488  )
489})
490
491test_that("arrow_*_split_whitespace functions", {
492  # use only ASCII whitespace characters
493  df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
494
495  # use only non-ASCII whitespace characters
496  df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux"))
497
498  df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux")))
499
500  # use default option values
501  expect_equal(
502    df_ascii %>%
503      Table$create() %>%
504      mutate(x = arrow_ascii_split_whitespace(x)) %>%
505      collect(),
506    df_split,
507    ignore_attr = TRUE
508  )
509  expect_equal(
510    df_utf8 %>%
511      Table$create() %>%
512      mutate(x = arrow_utf8_split_whitespace(x)) %>%
513      collect(),
514    df_split,
515    ignore_attr = TRUE
516  )
517
518  # specify non-default option values
519  expect_equal(
520    df_ascii %>%
521      Table$create() %>%
522      mutate(
523        x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
524      ) %>%
525      collect(),
526    tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))),
527    ignore_attr = TRUE
528  )
529  expect_equal(
530    df_utf8 %>%
531      Table$create() %>%
532      mutate(
533        x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
534      ) %>%
535      collect(),
536    tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))),
537    ignore_attr = TRUE
538  )
539})
540
541test_that("errors and warnings in string splitting", {
542  # These conditions generate an error, but abandon_ship() catches the error,
543  # issues a warning, and pulls the data into R (if computing on InMemoryDataset)
544  # Elsewhere we test that abandon_ship() works,
545  # so here we can just call the functions directly
546
547  x <- Expression$field_ref("x")
548  expect_error(
549    nse_funcs$str_split(x, fixed("and", ignore_case = TRUE)),
550    "Case-insensitive string splitting not supported by Arrow"
551  )
552  expect_error(
553    nse_funcs$str_split(x, coll("and.?")),
554    "Pattern modifier `coll()` not supported by Arrow",
555    fixed = TRUE
556  )
557  expect_error(
558    nse_funcs$str_split(x, boundary(type = "word")),
559    "Pattern modifier `boundary()` not supported by Arrow",
560    fixed = TRUE
561  )
562  expect_error(
563    nse_funcs$str_split(x, "and", n = 0),
564    "Splitting strings into zero parts not supported by Arrow"
565  )
566
567  # This condition generates a warning
568  expect_warning(
569    nse_funcs$str_split(x, fixed("and"), simplify = TRUE),
570    "Argument 'simplify = TRUE' will be ignored"
571  )
572})
573
574test_that("errors and warnings in string detection and replacement", {
575  x <- Expression$field_ref("x")
576
577  expect_error(
578    nse_funcs$str_detect(x, boundary(type = "character")),
579    "Pattern modifier `boundary()` not supported by Arrow",
580    fixed = TRUE
581  )
582  expect_error(
583    nse_funcs$str_replace_all(x, coll("o", locale = "en"), "ó"),
584    "Pattern modifier `coll()` not supported by Arrow",
585    fixed = TRUE
586  )
587
588  # This condition generates a warning
589  expect_warning(
590    nse_funcs$str_replace_all(x, regex("o", multiline = TRUE), "u"),
591    "Ignoring pattern modifier argument not supported in Arrow: \"multiline\""
592  )
593})
594
595test_that("backreferences in pattern in string detection", {
596  skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)")
597  df <- tibble(x = c("Foo", "bar"))
598
599  compare_dplyr_binding(
600    .input %>%
601      filter(str_detect(x, regex("F([aeiou])\\1"))) %>%
602      collect(),
603    df
604  )
605})
606
607test_that("backreferences (substitutions) in string replacement", {
608  df <- tibble(x = c("Foo", "bar"))
609
610  compare_dplyr_binding(
611    .input %>%
612      transmute(desc = sub(
613        "(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?",
614        "path `\\2` on server `\\1`",
615        url
616      )) %>%
617      collect(),
618    tibble(url = "https://arrow.apache.org/docs/r/")
619  )
620  compare_dplyr_binding(
621    .input %>%
622      transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>%
623      collect(),
624    df
625  )
626  compare_dplyr_binding(
627    .input %>%
628      transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
629      collect(),
630    df
631  )
632  compare_dplyr_binding(
633    .input %>%
634      transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
635      collect(),
636    df
637  )
638})
639
640test_that("edge cases in string detection and replacement", {
641  # in case-insensitive fixed match/replace, test that "\\E" in the search
642  # string and backslashes in the replacement string are interpreted literally.
643  # this test does not use compare_dplyr_binding() because base::sub() and
644  # base::grepl() do not support ignore.case = TRUE when fixed = TRUE.
645  expect_equal(
646    tibble(x = c("\\Q\\e\\D")) %>%
647      Table$create() %>%
648      filter(grepl("\\E", x, ignore.case = TRUE, fixed = TRUE)) %>%
649      collect(),
650    tibble(x = c("\\Q\\e\\D"))
651  )
652  expect_equal(
653    tibble(x = c("\\Q\\e\\D")) %>%
654      Table$create() %>%
655      transmute(x = sub("\\E", "\\L", x, ignore.case = TRUE, fixed = TRUE)) %>%
656      collect(),
657    tibble(x = c("\\Q\\L\\D"))
658  )
659
660  # test that a user's "(?i)" prefix does not break the "(?i)" prefix that's
661  # added in case-insensitive regex match/replace
662  compare_dplyr_binding(
663    .input %>%
664      filter(grepl("(?i)^[abc]{3}$", x, ignore.case = TRUE, fixed = FALSE)) %>%
665      collect(),
666    tibble(x = c("ABC"))
667  )
668  compare_dplyr_binding(
669    .input %>%
670      transmute(x = sub("(?i)^[abc]{3}$", "123", x, ignore.case = TRUE, fixed = FALSE)) %>%
671      collect(),
672    tibble(x = c("ABC"))
673  )
674})
675
676test_that("strptime", {
677  # base::strptime() defaults to local timezone
678  # but arrow's strptime defaults to UTC.
679  # So that tests are consistent, set the local timezone to UTC
680  # TODO: consider reevaluating this workaround after ARROW-12980
681  withr::local_timezone("UTC")
682
683  t_string <- tibble(x = c("2018-10-07 19:04:05", NA))
684  t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA))
685
686  expect_equal(
687    t_string %>%
688      Table$create() %>%
689      mutate(
690        x = strptime(x)
691      ) %>%
692      collect(),
693    t_stamp,
694    ignore_attr = "tzone"
695  )
696
697  expect_equal(
698    t_string %>%
699      Table$create() %>%
700      mutate(
701        x = strptime(x, format = "%Y-%m-%d %H:%M:%S")
702      ) %>%
703      collect(),
704    t_stamp,
705    ignore_attr = "tzone"
706  )
707
708  expect_equal(
709    t_string %>%
710      Table$create() %>%
711      mutate(
712        x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns")
713      ) %>%
714      collect(),
715    t_stamp,
716    ignore_attr = "tzone"
717  )
718
719  expect_equal(
720    t_string %>%
721      Table$create() %>%
722      mutate(
723        x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s")
724      ) %>%
725      collect(),
726    t_stamp,
727    ignore_attr = "tzone"
728  )
729
730  tstring <- tibble(x = c("08-05-2008", NA))
731  tstamp <- strptime(c("08-05-2008", NA), format = "%m-%d-%Y")
732
733  expect_equal(
734    tstring %>%
735      Table$create() %>%
736      mutate(
737        x = strptime(x, format = "%m-%d-%Y")
738      ) %>%
739      pull(),
740    # R's strptime returns POSIXlt (list type)
741    as.POSIXct(tstamp),
742    ignore_attr = "tzone"
743  )
744})
745
746test_that("errors in strptime", {
747  # Error when tz is passed
748  x <- Expression$field_ref("x")
749  expect_error(
750    nse_funcs$strptime(x, tz = "PDT"),
751    "Time zone argument not supported by Arrow"
752  )
753})
754
755test_that("strftime", {
756  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
757
758  times <- tibble(
759    datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA),
760    date = c(as.Date("2021-01-01"), NA)
761  )
762  formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u"
763  formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u"
764
765  compare_dplyr_binding(
766    .input %>%
767      mutate(x = strftime(datetime, format = formats)) %>%
768      collect(),
769    times
770  )
771
772  compare_dplyr_binding(
773    .input %>%
774      mutate(x = strftime(date, format = formats_date)) %>%
775      collect(),
776    times
777  )
778
779  compare_dplyr_binding(
780    .input %>%
781      mutate(x = strftime(datetime, format = formats, tz = "Pacific/Marquesas")) %>%
782      collect(),
783    times
784  )
785
786  compare_dplyr_binding(
787    .input %>%
788      mutate(x = strftime(datetime, format = formats, tz = "EST", usetz = TRUE)) %>%
789      collect(),
790    times
791  )
792
793  withr::with_timezone(
794    "Pacific/Marquesas",
795    {
796      compare_dplyr_binding(
797        .input %>%
798          mutate(
799            x = strftime(datetime, format = formats, tz = "EST"),
800            x_date = strftime(date, format = formats_date, tz = "EST")
801          ) %>%
802          collect(),
803        times
804      )
805
806      compare_dplyr_binding(
807        .input %>%
808          mutate(
809            x = strftime(datetime, format = formats),
810            x_date = strftime(date, format = formats_date)
811          ) %>%
812          collect(),
813        times
814      )
815    }
816  )
817
818  # This check is due to differences in the way %c currently works in Arrow and R's strftime.
819  # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved.
820  expect_error(
821    times %>%
822      Table$create() %>%
823      mutate(x = strftime(datetime, format = "%c")) %>%
824      collect(),
825    "%c flag is not supported in non-C locales."
826  )
827
828  # Output precision of %S depends on the input timestamp precision.
829  # Timestamps with second precision are represented as integers while
830  # milliseconds, microsecond and nanoseconds are represented as fixed floating
831  # point numbers with 3, 6 and 9 decimal places respectively.
832  compare_dplyr_binding(
833    .input %>%
834      mutate(x = strftime(datetime, format = "%S")) %>%
835      transmute(as.double(substr(x, 1, 2))) %>%
836      collect(),
837    times,
838    tolerance = 1e-6
839  )
840})
841
842test_that("format_ISO8601", {
843  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
844  times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA))
845
846  compare_dplyr_binding(
847    .input %>%
848      mutate(x = format_ISO8601(x, precision = "ymd", usetz = FALSE)) %>%
849      collect(),
850    times
851  )
852
853  if (getRversion() < "3.5") {
854    # before 3.5, times$x will have no timezone attribute, so Arrow faithfully
855    # errors that there is no timezone to format:
856    expect_error(
857      times %>%
858        Table$create() %>%
859        mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>%
860        collect(),
861      "Timezone not present, cannot convert to string with timezone: %Y-%m-%d%z"
862    )
863
864    # See comment regarding %S flag in strftime tests
865    expect_error(
866      times %>%
867        Table$create() %>%
868        mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>%
869        mutate(x = gsub("\\.0*", "", x)) %>%
870        collect(),
871      "Timezone not present, cannot convert to string with timezone: %Y-%m-%dT%H:%M:%S%z"
872    )
873  } else {
874    compare_dplyr_binding(
875      .input %>%
876        mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>%
877        collect(),
878      times
879    )
880
881    # See comment regarding %S flag in strftime tests
882    compare_dplyr_binding(
883      .input %>%
884        mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>%
885        mutate(x = gsub("\\.0*", "", x)) %>%
886        collect(),
887      times
888    )
889  }
890
891
892  # See comment regarding %S flag in strftime tests
893  compare_dplyr_binding(
894    .input %>%
895      mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = FALSE)) %>%
896      mutate(x = gsub("\\.0*", "", x)) %>%
897      collect(),
898    times
899  )
900})
901
902test_that("arrow_find_substring and arrow_find_substring_regex", {
903  df <- tibble(x = c("Foo and Bar", "baz and qux and quux"))
904
905  expect_equal(
906    df %>%
907      Table$create() %>%
908      mutate(x = arrow_find_substring(x, options = list(pattern = "b"))) %>%
909      collect(),
910    tibble(x = c(-1, 0))
911  )
912  expect_equal(
913    df %>%
914      Table$create() %>%
915      mutate(x = arrow_find_substring(
916        x,
917        options = list(pattern = "b", ignore_case = TRUE)
918      )) %>%
919      collect(),
920    tibble(x = c(8, 0))
921  )
922  expect_equal(
923    df %>%
924      Table$create() %>%
925      mutate(x = arrow_find_substring_regex(
926        x,
927        options = list(pattern = "^[fb]")
928      )) %>%
929      collect(),
930    tibble(x = c(-1, 0))
931  )
932  expect_equal(
933    df %>%
934      Table$create() %>%
935      mutate(x = arrow_find_substring_regex(
936        x,
937        options = list(pattern = "[AEIOU]", ignore_case = TRUE)
938      )) %>%
939      collect(),
940    tibble(x = c(1, 1))
941  )
942})
943
944test_that("stri_reverse and arrow_ascii_reverse functions", {
945  df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
946
947  df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux"))
948
949  compare_dplyr_binding(
950    .input %>%
951      mutate(x = stri_reverse(x)) %>%
952      collect(),
953    df_utf8
954  )
955
956  compare_dplyr_binding(
957    .input %>%
958      mutate(x = stri_reverse(x)) %>%
959      collect(),
960    df_ascii
961  )
962
963  expect_equal(
964    df_ascii %>%
965      Table$create() %>%
966      mutate(x = arrow_ascii_reverse(x)) %>%
967      collect(),
968    tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab"))
969  )
970
971  expect_error(
972    df_utf8 %>%
973      Table$create() %>%
974      mutate(x = arrow_ascii_reverse(x)) %>%
975      collect(),
976    "Invalid: Non-ASCII sequence in input"
977  )
978})
979
980test_that("str_like", {
981  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
982
983  # TODO: After new version of stringr with str_like has been released, update all
984  # these tests to use compare_dplyr_binding
985
986  # No match - entire string
987  expect_equal(
988    df %>%
989      Table$create() %>%
990      mutate(x = str_like(x, "baz")) %>%
991      collect(),
992    tibble(x = c(FALSE, FALSE))
993  )
994
995  # Match - entire string
996  expect_equal(
997    df %>%
998      Table$create() %>%
999      mutate(x = str_like(x, "Foo and bar")) %>%
1000      collect(),
1001    tibble(x = c(TRUE, FALSE))
1002  )
1003
1004  # Wildcard
1005  expect_equal(
1006    df %>%
1007      Table$create() %>%
1008      mutate(x = str_like(x, "f%", ignore_case = TRUE)) %>%
1009      collect(),
1010    tibble(x = c(TRUE, FALSE))
1011  )
1012
1013  # Ignore case
1014  expect_equal(
1015    df %>%
1016      Table$create() %>%
1017      mutate(x = str_like(x, "f%", ignore_case = FALSE)) %>%
1018      collect(),
1019    tibble(x = c(FALSE, FALSE))
1020  )
1021
1022  # Single character
1023  expect_equal(
1024    df %>%
1025      Table$create() %>%
1026      mutate(x = str_like(x, "_a%")) %>%
1027      collect(),
1028    tibble(x = c(FALSE, TRUE))
1029  )
1030
1031  # This will give an error until a new version of stringr with str_like has been released
1032  skip_if_not(packageVersion("stringr") > "1.4.0")
1033  compare_dplyr_binding(
1034    .input %>%
1035      mutate(x = str_like(x, "%baz%")) %>%
1036      collect(),
1037    df
1038  )
1039})
1040
1041test_that("str_pad", {
1042  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
1043
1044  compare_dplyr_binding(
1045    .input %>%
1046      mutate(x = str_pad(x, width = 31)) %>%
1047      collect(),
1048    df
1049  )
1050
1051  compare_dplyr_binding(
1052    .input %>%
1053      mutate(x = str_pad(x, width = 30, side = "right")) %>%
1054      collect(),
1055    df
1056  )
1057
1058  compare_dplyr_binding(
1059    .input %>%
1060      mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>%
1061      collect(),
1062    df
1063  )
1064
1065  compare_dplyr_binding(
1066    .input %>%
1067      mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>%
1068      collect(),
1069    df
1070  )
1071
1072  compare_dplyr_binding(
1073    .input %>%
1074      mutate(x = str_pad(x, width = 31, side = "both")) %>%
1075      collect(),
1076    df
1077  )
1078})
1079
1080test_that("substr", {
1081  df <- tibble(x = "Apache Arrow")
1082
1083  compare_dplyr_binding(
1084    .input %>%
1085      mutate(y = substr(x, 1, 6)) %>%
1086      collect(),
1087    df
1088  )
1089
1090  compare_dplyr_binding(
1091    .input %>%
1092      mutate(y = substr(x, 0, 6)) %>%
1093      collect(),
1094    df
1095  )
1096
1097  compare_dplyr_binding(
1098    .input %>%
1099      mutate(y = substr(x, -1, 6)) %>%
1100      collect(),
1101    df
1102  )
1103
1104  compare_dplyr_binding(
1105    .input %>%
1106      mutate(y = substr(x, 6, 1)) %>%
1107      collect(),
1108    df
1109  )
1110
1111  compare_dplyr_binding(
1112    .input %>%
1113      mutate(y = substr(x, -1, -2)) %>%
1114      collect(),
1115    df
1116  )
1117
1118  compare_dplyr_binding(
1119    .input %>%
1120      mutate(y = substr(x, 9, 6)) %>%
1121      collect(),
1122    df
1123  )
1124
1125  compare_dplyr_binding(
1126    .input %>%
1127      mutate(y = substr(x, 1, 6)) %>%
1128      collect(),
1129    df
1130  )
1131
1132  compare_dplyr_binding(
1133    .input %>%
1134      mutate(y = substr(x, 8, 12)) %>%
1135      collect(),
1136    df
1137  )
1138
1139  compare_dplyr_binding(
1140    .input %>%
1141      mutate(y = substr(x, -5, -1)) %>%
1142      collect(),
1143    df
1144  )
1145
1146  expect_error(
1147    nse_funcs$substr("Apache Arrow", c(1, 2), 3),
1148    "`start` must be length 1 - other lengths are not supported in Arrow"
1149  )
1150
1151  expect_error(
1152    nse_funcs$substr("Apache Arrow", 1, c(2, 3)),
1153    "`stop` must be length 1 - other lengths are not supported in Arrow"
1154  )
1155})
1156
1157test_that("substring", {
1158  # nse_funcs$substring just calls nse_funcs$substr, tested extensively above
1159  df <- tibble(x = "Apache Arrow")
1160
1161  compare_dplyr_binding(
1162    .input %>%
1163      mutate(y = substring(x, 1, 6)) %>%
1164      collect(),
1165    df
1166  )
1167})
1168
1169test_that("str_sub", {
1170  df <- tibble(x = "Apache Arrow")
1171
1172  compare_dplyr_binding(
1173    .input %>%
1174      mutate(y = str_sub(x, 1, 6)) %>%
1175      collect(),
1176    df
1177  )
1178
1179  compare_dplyr_binding(
1180    .input %>%
1181      mutate(y = str_sub(x, 0, 6)) %>%
1182      collect(),
1183    df
1184  )
1185
1186  compare_dplyr_binding(
1187    .input %>%
1188      mutate(y = str_sub(x, -1, 6)) %>%
1189      collect(),
1190    df
1191  )
1192
1193  compare_dplyr_binding(
1194    .input %>%
1195      mutate(y = str_sub(x, 6, 1)) %>%
1196      collect(),
1197    df
1198  )
1199
1200  compare_dplyr_binding(
1201    .input %>%
1202      mutate(y = str_sub(x, -1, -2)) %>%
1203      collect(),
1204    df
1205  )
1206
1207  compare_dplyr_binding(
1208    .input %>%
1209      mutate(y = str_sub(x, -1, 3)) %>%
1210      collect(),
1211    df
1212  )
1213
1214  compare_dplyr_binding(
1215    .input %>%
1216      mutate(y = str_sub(x, 9, 6)) %>%
1217      collect(),
1218    df
1219  )
1220
1221  compare_dplyr_binding(
1222    .input %>%
1223      mutate(y = str_sub(x, 1, 6)) %>%
1224      collect(),
1225    df
1226  )
1227
1228  compare_dplyr_binding(
1229    .input %>%
1230      mutate(y = str_sub(x, 8, 12)) %>%
1231      collect(),
1232    df
1233  )
1234
1235  compare_dplyr_binding(
1236    .input %>%
1237      mutate(y = str_sub(x, -5, -1)) %>%
1238      collect(),
1239    df
1240  )
1241
1242  expect_error(
1243    nse_funcs$str_sub("Apache Arrow", c(1, 2), 3),
1244    "`start` must be length 1 - other lengths are not supported in Arrow"
1245  )
1246
1247  expect_error(
1248    nse_funcs$str_sub("Apache Arrow", 1, c(2, 3)),
1249    "`end` must be length 1 - other lengths are not supported in Arrow"
1250  )
1251})
1252
1253test_that("str_starts, str_ends, startsWith, endsWith", {
1254  df <- tibble(x = c("Foo", "bar", "baz", "qux"))
1255
1256  compare_dplyr_binding(
1257    .input %>%
1258      filter(str_starts(x, "b.*")) %>%
1259      collect(),
1260    df
1261  )
1262
1263  compare_dplyr_binding(
1264    .input %>%
1265      filter(str_starts(x, "b.*", negate = TRUE)) %>%
1266      collect(),
1267    df
1268  )
1269
1270  compare_dplyr_binding(
1271    .input %>%
1272      filter(str_starts(x, fixed("b.*"))) %>%
1273      collect(),
1274    df
1275  )
1276
1277  compare_dplyr_binding(
1278    .input %>%
1279      filter(str_starts(x, fixed("b"))) %>%
1280      collect(),
1281    df
1282  )
1283
1284  compare_dplyr_binding(
1285    .input %>%
1286      filter(str_ends(x, "r")) %>%
1287      collect(),
1288    df
1289  )
1290
1291  compare_dplyr_binding(
1292    .input %>%
1293      filter(str_ends(x, "r", negate = TRUE)) %>%
1294      collect(),
1295    df
1296  )
1297
1298  compare_dplyr_binding(
1299    .input %>%
1300      filter(str_ends(x, fixed("r$"))) %>%
1301      collect(),
1302    df
1303  )
1304
1305  compare_dplyr_binding(
1306    .input %>%
1307      filter(str_ends(x, fixed("r"))) %>%
1308      collect(),
1309    df
1310  )
1311
1312  compare_dplyr_binding(
1313    .input %>%
1314      filter(startsWith(x, "b")) %>%
1315      collect(),
1316    df
1317  )
1318
1319  compare_dplyr_binding(
1320    .input %>%
1321      filter(endsWith(x, "r")) %>%
1322      collect(),
1323    df
1324  )
1325
1326  compare_dplyr_binding(
1327    .input %>%
1328      filter(startsWith(x, "b.*")) %>%
1329      collect(),
1330    df
1331  )
1332
1333  compare_dplyr_binding(
1334    .input %>%
1335      filter(endsWith(x, "r$")) %>%
1336      collect(),
1337    df
1338  )
1339})
1340
1341test_that("str_count", {
1342  df <- tibble(
1343    cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"),
1344    dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..")
1345  )
1346
1347  compare_dplyr_binding(
1348    .input %>%
1349      mutate(a_count = str_count(cities, pattern = "a")) %>%
1350      collect(),
1351    df
1352  )
1353
1354  compare_dplyr_binding(
1355    .input %>%
1356      mutate(p_count = str_count(cities, pattern = "d")) %>%
1357      collect(),
1358    df
1359  )
1360
1361  compare_dplyr_binding(
1362    .input %>%
1363      mutate(p_count = str_count(cities,
1364        pattern = regex("d", ignore_case = TRUE)
1365      )) %>%
1366      collect(),
1367    df
1368  )
1369
1370  compare_dplyr_binding(
1371    .input %>%
1372      mutate(e_count = str_count(cities, pattern = "u")) %>%
1373      collect(),
1374    df
1375  )
1376
1377  # nse_funcs$str_count() is not vectorised over pattern
1378  compare_dplyr_binding(
1379    .input %>%
1380      mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>%
1381      collect(),
1382    df,
1383    warning = TRUE
1384  )
1385
1386  compare_dplyr_binding(
1387    .input %>%
1388      mutate(dots_count = str_count(dots, ".")) %>%
1389      collect(),
1390    df
1391  )
1392
1393  compare_dplyr_binding(
1394    .input %>%
1395      mutate(dots_count = str_count(dots, fixed("."))) %>%
1396      collect(),
1397    df
1398  )
1399})
1400