1# Licensed to the Apache Software Foundation (ASF) under one 2# or more contributor license agreements. See the NOTICE file 3# distributed with this work for additional information 4# regarding copyright ownership. The ASF licenses this file 5# to you under the Apache License, Version 2.0 (the 6# "License"); you may not use this file except in compliance 7# with the License. You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, 12# software distributed under the License is distributed on an 13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14# KIND, either express or implied. See the License for the 15# specific language governing permissions and limitations 16# under the License. 17 18skip_if_not_available("dataset") 19skip_if_not_available("utf8proc") 20 21library(dplyr, warn.conflicts = FALSE) 22library(lubridate) 23library(stringr) 24library(stringi) 25 26test_that("paste, paste0, and str_c", { 27 df <- tibble( 28 v = c("A", "B", "C"), 29 w = c("a", "b", "c"), 30 x = c("d", NA_character_, "f"), 31 y = c(NA_character_, "h", "i"), 32 z = c(1.1, 2.2, NA) 33 ) 34 x <- Expression$field_ref("x") 35 y <- Expression$field_ref("y") 36 37 # no NAs in data 38 compare_dplyr_binding( 39 .input %>% 40 transmute(paste(v, w)) %>% 41 collect(), 42 df 43 ) 44 compare_dplyr_binding( 45 .input %>% 46 transmute(paste(v, w, sep = "-")) %>% 47 collect(), 48 df 49 ) 50 compare_dplyr_binding( 51 .input %>% 52 transmute(paste0(v, w)) %>% 53 collect(), 54 df 55 ) 56 compare_dplyr_binding( 57 .input %>% 58 transmute(str_c(v, w)) %>% 59 collect(), 60 df 61 ) 62 compare_dplyr_binding( 63 .input %>% 64 transmute(str_c(v, w, sep = "+")) %>% 65 collect(), 66 df 67 ) 68 69 # NAs in data 70 compare_dplyr_binding( 71 .input %>% 72 transmute(paste(x, y)) %>% 73 collect(), 74 df 75 ) 76 compare_dplyr_binding( 77 .input %>% 78 transmute(paste(x, y, sep = "-")) %>% 79 collect(), 80 df 81 ) 82 compare_dplyr_binding( 83 .input %>% 84 transmute(str_c(x, y)) %>% 85 collect(), 86 df 87 ) 88 89 # non-character column in dots 90 compare_dplyr_binding( 91 .input %>% 92 transmute(paste0(x, y, z)) %>% 93 collect(), 94 df 95 ) 96 97 # literal string in dots 98 compare_dplyr_binding( 99 .input %>% 100 transmute(paste(x, "foo", y)) %>% 101 collect(), 102 df 103 ) 104 105 # literal NA in dots 106 compare_dplyr_binding( 107 .input %>% 108 transmute(paste(x, NA, y)) %>% 109 collect(), 110 df 111 ) 112 113 # expressions in dots 114 compare_dplyr_binding( 115 .input %>% 116 transmute(paste0(x, toupper(y), as.character(z))) %>% 117 collect(), 118 df 119 ) 120 121 # sep is literal NA 122 # errors in paste() (consistent with base::paste()) 123 expect_error( 124 nse_funcs$paste(x, y, sep = NA_character_), 125 "Invalid separator" 126 ) 127 # emits null in str_c() (consistent with stringr::str_c()) 128 compare_dplyr_binding( 129 .input %>% 130 transmute(str_c(x, y, sep = NA_character_)) %>% 131 collect(), 132 df 133 ) 134 135 # sep passed in dots to paste0 (which doesn't take a sep argument) 136 compare_dplyr_binding( 137 .input %>% 138 transmute(paste0(x, y, sep = "-")) %>% 139 collect(), 140 df 141 ) 142 143 # known differences 144 145 # arrow allows the separator to be an array 146 expect_equal( 147 df %>% 148 Table$create() %>% 149 transmute(result = paste(x, y, sep = w)) %>% 150 collect(), 151 df %>% 152 transmute(result = paste(x, w, y, sep = "")) 153 ) 154 155 # expected errors 156 157 # collapse argument not supported 158 expect_error( 159 nse_funcs$paste(x, y, collapse = ""), 160 "collapse" 161 ) 162 expect_error( 163 nse_funcs$paste0(x, y, collapse = ""), 164 "collapse" 165 ) 166 expect_error( 167 nse_funcs$str_c(x, y, collapse = ""), 168 "collapse" 169 ) 170 171 # literal vectors of length != 1 not supported 172 expect_error( 173 nse_funcs$paste(x, character(0), y), 174 "Literal vectors of length != 1 not supported in string concatenation" 175 ) 176 expect_error( 177 nse_funcs$paste(x, c(",", ";"), y), 178 "Literal vectors of length != 1 not supported in string concatenation" 179 ) 180}) 181 182test_that("grepl with ignore.case = FALSE and fixed = TRUE", { 183 df <- tibble(x = c("Foo", "bar")) 184 compare_dplyr_binding( 185 .input %>% 186 filter(grepl("o", x, fixed = TRUE)) %>% 187 collect(), 188 df 189 ) 190}) 191 192test_that("sub and gsub with ignore.case = FALSE and fixed = TRUE", { 193 df <- tibble(x = c("Foo", "bar")) 194 compare_dplyr_binding( 195 .input %>% 196 transmute(x = sub("Foo", "baz", x, fixed = TRUE)) %>% 197 collect(), 198 df 199 ) 200 compare_dplyr_binding( 201 .input %>% 202 transmute(x = gsub("o", "u", x, fixed = TRUE)) %>% 203 collect(), 204 df 205 ) 206}) 207 208# many of the remainder of these tests require RE2 209skip_if_not_available("re2") 210 211test_that("grepl", { 212 df <- tibble(x = c("Foo", "bar")) 213 214 for (fixed in c(TRUE, FALSE)) { 215 compare_dplyr_binding( 216 .input %>% 217 filter(grepl("Foo", x, fixed = fixed)) %>% 218 collect(), 219 df 220 ) 221 compare_dplyr_binding( 222 .input %>% 223 transmute(x = grepl("^B.+", x, ignore.case = FALSE, fixed = fixed)) %>% 224 collect(), 225 df 226 ) 227 compare_dplyr_binding( 228 .input %>% 229 filter(grepl("Foo", x, ignore.case = FALSE, fixed = fixed)) %>% 230 collect(), 231 df 232 ) 233 } 234}) 235 236test_that("grepl with ignore.case = TRUE and fixed = TRUE", { 237 df <- tibble(x = c("Foo", "bar")) 238 239 # base::grepl() ignores ignore.case = TRUE with a warning when fixed = TRUE, 240 # so we can't use compare_dplyr_binding() for these tests 241 expect_equal( 242 df %>% 243 Table$create() %>% 244 filter(grepl("O", x, ignore.case = TRUE, fixed = TRUE)) %>% 245 collect(), 246 tibble(x = "Foo") 247 ) 248 expect_equal( 249 df %>% 250 Table$create() %>% 251 filter(x = grepl("^B.+", x, ignore.case = TRUE, fixed = TRUE)) %>% 252 collect(), 253 tibble(x = character(0)) 254 ) 255}) 256 257test_that("str_detect", { 258 df <- tibble(x = c("Foo", "bar")) 259 260 compare_dplyr_binding( 261 .input %>% 262 filter(str_detect(x, regex("^F"))) %>% 263 collect(), 264 df 265 ) 266 compare_dplyr_binding( 267 .input %>% 268 transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE))) %>% 269 collect(), 270 df 271 ) 272 compare_dplyr_binding( 273 .input %>% 274 transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE), negate = TRUE)) %>% 275 collect(), 276 df 277 ) 278 compare_dplyr_binding( 279 .input %>% 280 filter(str_detect(x, fixed("o"))) %>% 281 collect(), 282 df 283 ) 284 compare_dplyr_binding( 285 .input %>% 286 filter(str_detect(x, fixed("O"))) %>% 287 collect(), 288 df 289 ) 290 compare_dplyr_binding( 291 .input %>% 292 filter(str_detect(x, fixed("O", ignore_case = TRUE))) %>% 293 collect(), 294 df 295 ) 296 compare_dplyr_binding( 297 .input %>% 298 filter(str_detect(x, fixed("O", ignore_case = TRUE), negate = TRUE)) %>% 299 collect(), 300 df 301 ) 302}) 303 304test_that("sub and gsub", { 305 df <- tibble(x = c("Foo", "bar")) 306 307 for (fixed in c(TRUE, FALSE)) { 308 compare_dplyr_binding( 309 .input %>% 310 transmute(x = sub("Foo", "baz", x, fixed = fixed)) %>% 311 collect(), 312 df 313 ) 314 compare_dplyr_binding( 315 .input %>% 316 transmute(x = sub("^B.+", "baz", x, ignore.case = FALSE, fixed = fixed)) %>% 317 collect(), 318 df 319 ) 320 compare_dplyr_binding( 321 .input %>% 322 transmute(x = sub("Foo", "baz", x, ignore.case = FALSE, fixed = fixed)) %>% 323 collect(), 324 df 325 ) 326 } 327}) 328 329test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", { 330 df <- tibble(x = c("Foo", "bar")) 331 332 # base::sub() and base::gsub() ignore ignore.case = TRUE with a warning when 333 # fixed = TRUE, so we can't use compare_dplyr_binding() for these tests 334 expect_equal( 335 df %>% 336 Table$create() %>% 337 transmute(x = sub("O", "u", x, ignore.case = TRUE, fixed = TRUE)) %>% 338 collect(), 339 tibble(x = c("Fuo", "bar")) 340 ) 341 expect_equal( 342 df %>% 343 Table$create() %>% 344 transmute(x = gsub("o", "u", x, ignore.case = TRUE, fixed = TRUE)) %>% 345 collect(), 346 tibble(x = c("Fuu", "bar")) 347 ) 348 expect_equal( 349 df %>% 350 Table$create() %>% 351 transmute(x = sub("^B.+", "baz", x, ignore.case = TRUE, fixed = TRUE)) %>% 352 collect(), 353 df # unchanged 354 ) 355}) 356 357test_that("str_replace and str_replace_all", { 358 df <- tibble(x = c("Foo", "bar")) 359 360 compare_dplyr_binding( 361 .input %>% 362 transmute(x = str_replace_all(x, "^F", "baz")) %>% 363 collect(), 364 df 365 ) 366 367 compare_dplyr_binding( 368 .input %>% 369 transmute(x = str_replace_all(x, regex("^F"), "baz")) %>% 370 collect(), 371 df 372 ) 373 374 compare_dplyr_binding( 375 .input %>% 376 mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>% 377 collect(), 378 df 379 ) 380 381 compare_dplyr_binding( 382 .input %>% 383 transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>% 384 collect(), 385 df 386 ) 387 compare_dplyr_binding( 388 .input %>% 389 transmute(x = str_replace_all(x, fixed("o"), "u")) %>% 390 collect(), 391 df 392 ) 393 compare_dplyr_binding( 394 .input %>% 395 transmute(x = str_replace(x, fixed("O"), "u")) %>% 396 collect(), 397 df 398 ) 399 compare_dplyr_binding( 400 .input %>% 401 transmute(x = str_replace(x, fixed("O", ignore_case = TRUE), "u")) %>% 402 collect(), 403 df 404 ) 405}) 406 407test_that("strsplit and str_split", { 408 df <- tibble(x = c("Foo and bar", "baz and qux and quux")) 409 410 compare_dplyr_binding( 411 .input %>% 412 mutate(x = strsplit(x, "and")) %>% 413 collect(), 414 df, 415 # `ignore_attr = TRUE` because the vctr coming back from arrow (ListArray) 416 # has type information in it, but it's just a bare list from R/dplyr. 417 ignore_attr = TRUE 418 ) 419 compare_dplyr_binding( 420 .input %>% 421 mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>% 422 collect(), 423 df, 424 ignore_attr = TRUE 425 ) 426 compare_dplyr_binding( 427 .input %>% 428 mutate(x = strsplit(x, " +and +")) %>% 429 collect(), 430 df, 431 ignore_attr = TRUE 432 ) 433 compare_dplyr_binding( 434 .input %>% 435 mutate(x = str_split(x, "and")) %>% 436 collect(), 437 df, 438 ignore_attr = TRUE 439 ) 440 compare_dplyr_binding( 441 .input %>% 442 mutate(x = str_split(x, "and", n = 2)) %>% 443 collect(), 444 df, 445 ignore_attr = TRUE 446 ) 447 compare_dplyr_binding( 448 .input %>% 449 mutate(x = str_split(x, fixed("and"), n = 2)) %>% 450 collect(), 451 df, 452 ignore_attr = TRUE 453 ) 454 compare_dplyr_binding( 455 .input %>% 456 mutate(x = str_split(x, regex("and"), n = 2)) %>% 457 collect(), 458 df, 459 ignore_attr = TRUE 460 ) 461 compare_dplyr_binding( 462 .input %>% 463 mutate(x = str_split(x, "Foo|bar", n = 2)) %>% 464 collect(), 465 df, 466 ignore_attr = TRUE 467 ) 468}) 469 470test_that("str_to_lower, str_to_upper, and str_to_title", { 471 df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!")) 472 compare_dplyr_binding( 473 .input %>% 474 transmute( 475 x_lower = str_to_lower(x), 476 x_upper = str_to_upper(x), 477 x_title = str_to_title(x) 478 ) %>% 479 collect(), 480 df 481 ) 482 483 # Error checking a single function because they all use the same code path. 484 expect_error( 485 nse_funcs$str_to_lower("Apache Arrow", locale = "sp"), 486 "Providing a value for 'locale' other than the default ('en') is not supported by Arrow", 487 fixed = TRUE 488 ) 489}) 490 491test_that("arrow_*_split_whitespace functions", { 492 # use only ASCII whitespace characters 493 df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux")) 494 495 # use only non-ASCII whitespace characters 496 df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux")) 497 498 df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux"))) 499 500 # use default option values 501 expect_equal( 502 df_ascii %>% 503 Table$create() %>% 504 mutate(x = arrow_ascii_split_whitespace(x)) %>% 505 collect(), 506 df_split, 507 ignore_attr = TRUE 508 ) 509 expect_equal( 510 df_utf8 %>% 511 Table$create() %>% 512 mutate(x = arrow_utf8_split_whitespace(x)) %>% 513 collect(), 514 df_split, 515 ignore_attr = TRUE 516 ) 517 518 # specify non-default option values 519 expect_equal( 520 df_ascii %>% 521 Table$create() %>% 522 mutate( 523 x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) 524 ) %>% 525 collect(), 526 tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))), 527 ignore_attr = TRUE 528 ) 529 expect_equal( 530 df_utf8 %>% 531 Table$create() %>% 532 mutate( 533 x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) 534 ) %>% 535 collect(), 536 tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))), 537 ignore_attr = TRUE 538 ) 539}) 540 541test_that("errors and warnings in string splitting", { 542 # These conditions generate an error, but abandon_ship() catches the error, 543 # issues a warning, and pulls the data into R (if computing on InMemoryDataset) 544 # Elsewhere we test that abandon_ship() works, 545 # so here we can just call the functions directly 546 547 x <- Expression$field_ref("x") 548 expect_error( 549 nse_funcs$str_split(x, fixed("and", ignore_case = TRUE)), 550 "Case-insensitive string splitting not supported by Arrow" 551 ) 552 expect_error( 553 nse_funcs$str_split(x, coll("and.?")), 554 "Pattern modifier `coll()` not supported by Arrow", 555 fixed = TRUE 556 ) 557 expect_error( 558 nse_funcs$str_split(x, boundary(type = "word")), 559 "Pattern modifier `boundary()` not supported by Arrow", 560 fixed = TRUE 561 ) 562 expect_error( 563 nse_funcs$str_split(x, "and", n = 0), 564 "Splitting strings into zero parts not supported by Arrow" 565 ) 566 567 # This condition generates a warning 568 expect_warning( 569 nse_funcs$str_split(x, fixed("and"), simplify = TRUE), 570 "Argument 'simplify = TRUE' will be ignored" 571 ) 572}) 573 574test_that("errors and warnings in string detection and replacement", { 575 x <- Expression$field_ref("x") 576 577 expect_error( 578 nse_funcs$str_detect(x, boundary(type = "character")), 579 "Pattern modifier `boundary()` not supported by Arrow", 580 fixed = TRUE 581 ) 582 expect_error( 583 nse_funcs$str_replace_all(x, coll("o", locale = "en"), "ó"), 584 "Pattern modifier `coll()` not supported by Arrow", 585 fixed = TRUE 586 ) 587 588 # This condition generates a warning 589 expect_warning( 590 nse_funcs$str_replace_all(x, regex("o", multiline = TRUE), "u"), 591 "Ignoring pattern modifier argument not supported in Arrow: \"multiline\"" 592 ) 593}) 594 595test_that("backreferences in pattern in string detection", { 596 skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)") 597 df <- tibble(x = c("Foo", "bar")) 598 599 compare_dplyr_binding( 600 .input %>% 601 filter(str_detect(x, regex("F([aeiou])\\1"))) %>% 602 collect(), 603 df 604 ) 605}) 606 607test_that("backreferences (substitutions) in string replacement", { 608 df <- tibble(x = c("Foo", "bar")) 609 610 compare_dplyr_binding( 611 .input %>% 612 transmute(desc = sub( 613 "(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?", 614 "path `\\2` on server `\\1`", 615 url 616 )) %>% 617 collect(), 618 tibble(url = "https://arrow.apache.org/docs/r/") 619 ) 620 compare_dplyr_binding( 621 .input %>% 622 transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>% 623 collect(), 624 df 625 ) 626 compare_dplyr_binding( 627 .input %>% 628 transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>% 629 collect(), 630 df 631 ) 632 compare_dplyr_binding( 633 .input %>% 634 transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>% 635 collect(), 636 df 637 ) 638}) 639 640test_that("edge cases in string detection and replacement", { 641 # in case-insensitive fixed match/replace, test that "\\E" in the search 642 # string and backslashes in the replacement string are interpreted literally. 643 # this test does not use compare_dplyr_binding() because base::sub() and 644 # base::grepl() do not support ignore.case = TRUE when fixed = TRUE. 645 expect_equal( 646 tibble(x = c("\\Q\\e\\D")) %>% 647 Table$create() %>% 648 filter(grepl("\\E", x, ignore.case = TRUE, fixed = TRUE)) %>% 649 collect(), 650 tibble(x = c("\\Q\\e\\D")) 651 ) 652 expect_equal( 653 tibble(x = c("\\Q\\e\\D")) %>% 654 Table$create() %>% 655 transmute(x = sub("\\E", "\\L", x, ignore.case = TRUE, fixed = TRUE)) %>% 656 collect(), 657 tibble(x = c("\\Q\\L\\D")) 658 ) 659 660 # test that a user's "(?i)" prefix does not break the "(?i)" prefix that's 661 # added in case-insensitive regex match/replace 662 compare_dplyr_binding( 663 .input %>% 664 filter(grepl("(?i)^[abc]{3}$", x, ignore.case = TRUE, fixed = FALSE)) %>% 665 collect(), 666 tibble(x = c("ABC")) 667 ) 668 compare_dplyr_binding( 669 .input %>% 670 transmute(x = sub("(?i)^[abc]{3}$", "123", x, ignore.case = TRUE, fixed = FALSE)) %>% 671 collect(), 672 tibble(x = c("ABC")) 673 ) 674}) 675 676test_that("strptime", { 677 # base::strptime() defaults to local timezone 678 # but arrow's strptime defaults to UTC. 679 # So that tests are consistent, set the local timezone to UTC 680 # TODO: consider reevaluating this workaround after ARROW-12980 681 withr::local_timezone("UTC") 682 683 t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) 684 t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) 685 686 expect_equal( 687 t_string %>% 688 Table$create() %>% 689 mutate( 690 x = strptime(x) 691 ) %>% 692 collect(), 693 t_stamp, 694 ignore_attr = "tzone" 695 ) 696 697 expect_equal( 698 t_string %>% 699 Table$create() %>% 700 mutate( 701 x = strptime(x, format = "%Y-%m-%d %H:%M:%S") 702 ) %>% 703 collect(), 704 t_stamp, 705 ignore_attr = "tzone" 706 ) 707 708 expect_equal( 709 t_string %>% 710 Table$create() %>% 711 mutate( 712 x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns") 713 ) %>% 714 collect(), 715 t_stamp, 716 ignore_attr = "tzone" 717 ) 718 719 expect_equal( 720 t_string %>% 721 Table$create() %>% 722 mutate( 723 x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s") 724 ) %>% 725 collect(), 726 t_stamp, 727 ignore_attr = "tzone" 728 ) 729 730 tstring <- tibble(x = c("08-05-2008", NA)) 731 tstamp <- strptime(c("08-05-2008", NA), format = "%m-%d-%Y") 732 733 expect_equal( 734 tstring %>% 735 Table$create() %>% 736 mutate( 737 x = strptime(x, format = "%m-%d-%Y") 738 ) %>% 739 pull(), 740 # R's strptime returns POSIXlt (list type) 741 as.POSIXct(tstamp), 742 ignore_attr = "tzone" 743 ) 744}) 745 746test_that("errors in strptime", { 747 # Error when tz is passed 748 x <- Expression$field_ref("x") 749 expect_error( 750 nse_funcs$strptime(x, tz = "PDT"), 751 "Time zone argument not supported by Arrow" 752 ) 753}) 754 755test_that("strftime", { 756 skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 757 758 times <- tibble( 759 datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA), 760 date = c(as.Date("2021-01-01"), NA) 761 ) 762 formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u" 763 formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u" 764 765 compare_dplyr_binding( 766 .input %>% 767 mutate(x = strftime(datetime, format = formats)) %>% 768 collect(), 769 times 770 ) 771 772 compare_dplyr_binding( 773 .input %>% 774 mutate(x = strftime(date, format = formats_date)) %>% 775 collect(), 776 times 777 ) 778 779 compare_dplyr_binding( 780 .input %>% 781 mutate(x = strftime(datetime, format = formats, tz = "Pacific/Marquesas")) %>% 782 collect(), 783 times 784 ) 785 786 compare_dplyr_binding( 787 .input %>% 788 mutate(x = strftime(datetime, format = formats, tz = "EST", usetz = TRUE)) %>% 789 collect(), 790 times 791 ) 792 793 withr::with_timezone( 794 "Pacific/Marquesas", 795 { 796 compare_dplyr_binding( 797 .input %>% 798 mutate( 799 x = strftime(datetime, format = formats, tz = "EST"), 800 x_date = strftime(date, format = formats_date, tz = "EST") 801 ) %>% 802 collect(), 803 times 804 ) 805 806 compare_dplyr_binding( 807 .input %>% 808 mutate( 809 x = strftime(datetime, format = formats), 810 x_date = strftime(date, format = formats_date) 811 ) %>% 812 collect(), 813 times 814 ) 815 } 816 ) 817 818 # This check is due to differences in the way %c currently works in Arrow and R's strftime. 819 # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved. 820 expect_error( 821 times %>% 822 Table$create() %>% 823 mutate(x = strftime(datetime, format = "%c")) %>% 824 collect(), 825 "%c flag is not supported in non-C locales." 826 ) 827 828 # Output precision of %S depends on the input timestamp precision. 829 # Timestamps with second precision are represented as integers while 830 # milliseconds, microsecond and nanoseconds are represented as fixed floating 831 # point numbers with 3, 6 and 9 decimal places respectively. 832 compare_dplyr_binding( 833 .input %>% 834 mutate(x = strftime(datetime, format = "%S")) %>% 835 transmute(as.double(substr(x, 1, 2))) %>% 836 collect(), 837 times, 838 tolerance = 1e-6 839 ) 840}) 841 842test_that("format_ISO8601", { 843 skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 844 times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA)) 845 846 compare_dplyr_binding( 847 .input %>% 848 mutate(x = format_ISO8601(x, precision = "ymd", usetz = FALSE)) %>% 849 collect(), 850 times 851 ) 852 853 if (getRversion() < "3.5") { 854 # before 3.5, times$x will have no timezone attribute, so Arrow faithfully 855 # errors that there is no timezone to format: 856 expect_error( 857 times %>% 858 Table$create() %>% 859 mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>% 860 collect(), 861 "Timezone not present, cannot convert to string with timezone: %Y-%m-%d%z" 862 ) 863 864 # See comment regarding %S flag in strftime tests 865 expect_error( 866 times %>% 867 Table$create() %>% 868 mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>% 869 mutate(x = gsub("\\.0*", "", x)) %>% 870 collect(), 871 "Timezone not present, cannot convert to string with timezone: %Y-%m-%dT%H:%M:%S%z" 872 ) 873 } else { 874 compare_dplyr_binding( 875 .input %>% 876 mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>% 877 collect(), 878 times 879 ) 880 881 # See comment regarding %S flag in strftime tests 882 compare_dplyr_binding( 883 .input %>% 884 mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>% 885 mutate(x = gsub("\\.0*", "", x)) %>% 886 collect(), 887 times 888 ) 889 } 890 891 892 # See comment regarding %S flag in strftime tests 893 compare_dplyr_binding( 894 .input %>% 895 mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = FALSE)) %>% 896 mutate(x = gsub("\\.0*", "", x)) %>% 897 collect(), 898 times 899 ) 900}) 901 902test_that("arrow_find_substring and arrow_find_substring_regex", { 903 df <- tibble(x = c("Foo and Bar", "baz and qux and quux")) 904 905 expect_equal( 906 df %>% 907 Table$create() %>% 908 mutate(x = arrow_find_substring(x, options = list(pattern = "b"))) %>% 909 collect(), 910 tibble(x = c(-1, 0)) 911 ) 912 expect_equal( 913 df %>% 914 Table$create() %>% 915 mutate(x = arrow_find_substring( 916 x, 917 options = list(pattern = "b", ignore_case = TRUE) 918 )) %>% 919 collect(), 920 tibble(x = c(8, 0)) 921 ) 922 expect_equal( 923 df %>% 924 Table$create() %>% 925 mutate(x = arrow_find_substring_regex( 926 x, 927 options = list(pattern = "^[fb]") 928 )) %>% 929 collect(), 930 tibble(x = c(-1, 0)) 931 ) 932 expect_equal( 933 df %>% 934 Table$create() %>% 935 mutate(x = arrow_find_substring_regex( 936 x, 937 options = list(pattern = "[AEIOU]", ignore_case = TRUE) 938 )) %>% 939 collect(), 940 tibble(x = c(1, 1)) 941 ) 942}) 943 944test_that("stri_reverse and arrow_ascii_reverse functions", { 945 df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux")) 946 947 df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux")) 948 949 compare_dplyr_binding( 950 .input %>% 951 mutate(x = stri_reverse(x)) %>% 952 collect(), 953 df_utf8 954 ) 955 956 compare_dplyr_binding( 957 .input %>% 958 mutate(x = stri_reverse(x)) %>% 959 collect(), 960 df_ascii 961 ) 962 963 expect_equal( 964 df_ascii %>% 965 Table$create() %>% 966 mutate(x = arrow_ascii_reverse(x)) %>% 967 collect(), 968 tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab")) 969 ) 970 971 expect_error( 972 df_utf8 %>% 973 Table$create() %>% 974 mutate(x = arrow_ascii_reverse(x)) %>% 975 collect(), 976 "Invalid: Non-ASCII sequence in input" 977 ) 978}) 979 980test_that("str_like", { 981 df <- tibble(x = c("Foo and bar", "baz and qux and quux")) 982 983 # TODO: After new version of stringr with str_like has been released, update all 984 # these tests to use compare_dplyr_binding 985 986 # No match - entire string 987 expect_equal( 988 df %>% 989 Table$create() %>% 990 mutate(x = str_like(x, "baz")) %>% 991 collect(), 992 tibble(x = c(FALSE, FALSE)) 993 ) 994 995 # Match - entire string 996 expect_equal( 997 df %>% 998 Table$create() %>% 999 mutate(x = str_like(x, "Foo and bar")) %>% 1000 collect(), 1001 tibble(x = c(TRUE, FALSE)) 1002 ) 1003 1004 # Wildcard 1005 expect_equal( 1006 df %>% 1007 Table$create() %>% 1008 mutate(x = str_like(x, "f%", ignore_case = TRUE)) %>% 1009 collect(), 1010 tibble(x = c(TRUE, FALSE)) 1011 ) 1012 1013 # Ignore case 1014 expect_equal( 1015 df %>% 1016 Table$create() %>% 1017 mutate(x = str_like(x, "f%", ignore_case = FALSE)) %>% 1018 collect(), 1019 tibble(x = c(FALSE, FALSE)) 1020 ) 1021 1022 # Single character 1023 expect_equal( 1024 df %>% 1025 Table$create() %>% 1026 mutate(x = str_like(x, "_a%")) %>% 1027 collect(), 1028 tibble(x = c(FALSE, TRUE)) 1029 ) 1030 1031 # This will give an error until a new version of stringr with str_like has been released 1032 skip_if_not(packageVersion("stringr") > "1.4.0") 1033 compare_dplyr_binding( 1034 .input %>% 1035 mutate(x = str_like(x, "%baz%")) %>% 1036 collect(), 1037 df 1038 ) 1039}) 1040 1041test_that("str_pad", { 1042 df <- tibble(x = c("Foo and bar", "baz and qux and quux")) 1043 1044 compare_dplyr_binding( 1045 .input %>% 1046 mutate(x = str_pad(x, width = 31)) %>% 1047 collect(), 1048 df 1049 ) 1050 1051 compare_dplyr_binding( 1052 .input %>% 1053 mutate(x = str_pad(x, width = 30, side = "right")) %>% 1054 collect(), 1055 df 1056 ) 1057 1058 compare_dplyr_binding( 1059 .input %>% 1060 mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>% 1061 collect(), 1062 df 1063 ) 1064 1065 compare_dplyr_binding( 1066 .input %>% 1067 mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>% 1068 collect(), 1069 df 1070 ) 1071 1072 compare_dplyr_binding( 1073 .input %>% 1074 mutate(x = str_pad(x, width = 31, side = "both")) %>% 1075 collect(), 1076 df 1077 ) 1078}) 1079 1080test_that("substr", { 1081 df <- tibble(x = "Apache Arrow") 1082 1083 compare_dplyr_binding( 1084 .input %>% 1085 mutate(y = substr(x, 1, 6)) %>% 1086 collect(), 1087 df 1088 ) 1089 1090 compare_dplyr_binding( 1091 .input %>% 1092 mutate(y = substr(x, 0, 6)) %>% 1093 collect(), 1094 df 1095 ) 1096 1097 compare_dplyr_binding( 1098 .input %>% 1099 mutate(y = substr(x, -1, 6)) %>% 1100 collect(), 1101 df 1102 ) 1103 1104 compare_dplyr_binding( 1105 .input %>% 1106 mutate(y = substr(x, 6, 1)) %>% 1107 collect(), 1108 df 1109 ) 1110 1111 compare_dplyr_binding( 1112 .input %>% 1113 mutate(y = substr(x, -1, -2)) %>% 1114 collect(), 1115 df 1116 ) 1117 1118 compare_dplyr_binding( 1119 .input %>% 1120 mutate(y = substr(x, 9, 6)) %>% 1121 collect(), 1122 df 1123 ) 1124 1125 compare_dplyr_binding( 1126 .input %>% 1127 mutate(y = substr(x, 1, 6)) %>% 1128 collect(), 1129 df 1130 ) 1131 1132 compare_dplyr_binding( 1133 .input %>% 1134 mutate(y = substr(x, 8, 12)) %>% 1135 collect(), 1136 df 1137 ) 1138 1139 compare_dplyr_binding( 1140 .input %>% 1141 mutate(y = substr(x, -5, -1)) %>% 1142 collect(), 1143 df 1144 ) 1145 1146 expect_error( 1147 nse_funcs$substr("Apache Arrow", c(1, 2), 3), 1148 "`start` must be length 1 - other lengths are not supported in Arrow" 1149 ) 1150 1151 expect_error( 1152 nse_funcs$substr("Apache Arrow", 1, c(2, 3)), 1153 "`stop` must be length 1 - other lengths are not supported in Arrow" 1154 ) 1155}) 1156 1157test_that("substring", { 1158 # nse_funcs$substring just calls nse_funcs$substr, tested extensively above 1159 df <- tibble(x = "Apache Arrow") 1160 1161 compare_dplyr_binding( 1162 .input %>% 1163 mutate(y = substring(x, 1, 6)) %>% 1164 collect(), 1165 df 1166 ) 1167}) 1168 1169test_that("str_sub", { 1170 df <- tibble(x = "Apache Arrow") 1171 1172 compare_dplyr_binding( 1173 .input %>% 1174 mutate(y = str_sub(x, 1, 6)) %>% 1175 collect(), 1176 df 1177 ) 1178 1179 compare_dplyr_binding( 1180 .input %>% 1181 mutate(y = str_sub(x, 0, 6)) %>% 1182 collect(), 1183 df 1184 ) 1185 1186 compare_dplyr_binding( 1187 .input %>% 1188 mutate(y = str_sub(x, -1, 6)) %>% 1189 collect(), 1190 df 1191 ) 1192 1193 compare_dplyr_binding( 1194 .input %>% 1195 mutate(y = str_sub(x, 6, 1)) %>% 1196 collect(), 1197 df 1198 ) 1199 1200 compare_dplyr_binding( 1201 .input %>% 1202 mutate(y = str_sub(x, -1, -2)) %>% 1203 collect(), 1204 df 1205 ) 1206 1207 compare_dplyr_binding( 1208 .input %>% 1209 mutate(y = str_sub(x, -1, 3)) %>% 1210 collect(), 1211 df 1212 ) 1213 1214 compare_dplyr_binding( 1215 .input %>% 1216 mutate(y = str_sub(x, 9, 6)) %>% 1217 collect(), 1218 df 1219 ) 1220 1221 compare_dplyr_binding( 1222 .input %>% 1223 mutate(y = str_sub(x, 1, 6)) %>% 1224 collect(), 1225 df 1226 ) 1227 1228 compare_dplyr_binding( 1229 .input %>% 1230 mutate(y = str_sub(x, 8, 12)) %>% 1231 collect(), 1232 df 1233 ) 1234 1235 compare_dplyr_binding( 1236 .input %>% 1237 mutate(y = str_sub(x, -5, -1)) %>% 1238 collect(), 1239 df 1240 ) 1241 1242 expect_error( 1243 nse_funcs$str_sub("Apache Arrow", c(1, 2), 3), 1244 "`start` must be length 1 - other lengths are not supported in Arrow" 1245 ) 1246 1247 expect_error( 1248 nse_funcs$str_sub("Apache Arrow", 1, c(2, 3)), 1249 "`end` must be length 1 - other lengths are not supported in Arrow" 1250 ) 1251}) 1252 1253test_that("str_starts, str_ends, startsWith, endsWith", { 1254 df <- tibble(x = c("Foo", "bar", "baz", "qux")) 1255 1256 compare_dplyr_binding( 1257 .input %>% 1258 filter(str_starts(x, "b.*")) %>% 1259 collect(), 1260 df 1261 ) 1262 1263 compare_dplyr_binding( 1264 .input %>% 1265 filter(str_starts(x, "b.*", negate = TRUE)) %>% 1266 collect(), 1267 df 1268 ) 1269 1270 compare_dplyr_binding( 1271 .input %>% 1272 filter(str_starts(x, fixed("b.*"))) %>% 1273 collect(), 1274 df 1275 ) 1276 1277 compare_dplyr_binding( 1278 .input %>% 1279 filter(str_starts(x, fixed("b"))) %>% 1280 collect(), 1281 df 1282 ) 1283 1284 compare_dplyr_binding( 1285 .input %>% 1286 filter(str_ends(x, "r")) %>% 1287 collect(), 1288 df 1289 ) 1290 1291 compare_dplyr_binding( 1292 .input %>% 1293 filter(str_ends(x, "r", negate = TRUE)) %>% 1294 collect(), 1295 df 1296 ) 1297 1298 compare_dplyr_binding( 1299 .input %>% 1300 filter(str_ends(x, fixed("r$"))) %>% 1301 collect(), 1302 df 1303 ) 1304 1305 compare_dplyr_binding( 1306 .input %>% 1307 filter(str_ends(x, fixed("r"))) %>% 1308 collect(), 1309 df 1310 ) 1311 1312 compare_dplyr_binding( 1313 .input %>% 1314 filter(startsWith(x, "b")) %>% 1315 collect(), 1316 df 1317 ) 1318 1319 compare_dplyr_binding( 1320 .input %>% 1321 filter(endsWith(x, "r")) %>% 1322 collect(), 1323 df 1324 ) 1325 1326 compare_dplyr_binding( 1327 .input %>% 1328 filter(startsWith(x, "b.*")) %>% 1329 collect(), 1330 df 1331 ) 1332 1333 compare_dplyr_binding( 1334 .input %>% 1335 filter(endsWith(x, "r$")) %>% 1336 collect(), 1337 df 1338 ) 1339}) 1340 1341test_that("str_count", { 1342 df <- tibble( 1343 cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"), 1344 dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..") 1345 ) 1346 1347 compare_dplyr_binding( 1348 .input %>% 1349 mutate(a_count = str_count(cities, pattern = "a")) %>% 1350 collect(), 1351 df 1352 ) 1353 1354 compare_dplyr_binding( 1355 .input %>% 1356 mutate(p_count = str_count(cities, pattern = "d")) %>% 1357 collect(), 1358 df 1359 ) 1360 1361 compare_dplyr_binding( 1362 .input %>% 1363 mutate(p_count = str_count(cities, 1364 pattern = regex("d", ignore_case = TRUE) 1365 )) %>% 1366 collect(), 1367 df 1368 ) 1369 1370 compare_dplyr_binding( 1371 .input %>% 1372 mutate(e_count = str_count(cities, pattern = "u")) %>% 1373 collect(), 1374 df 1375 ) 1376 1377 # nse_funcs$str_count() is not vectorised over pattern 1378 compare_dplyr_binding( 1379 .input %>% 1380 mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>% 1381 collect(), 1382 df, 1383 warning = TRUE 1384 ) 1385 1386 compare_dplyr_binding( 1387 .input %>% 1388 mutate(dots_count = str_count(dots, ".")) %>% 1389 collect(), 1390 df 1391 ) 1392 1393 compare_dplyr_binding( 1394 .input %>% 1395 mutate(dots_count = str_count(dots, fixed("."))) %>% 1396 collect(), 1397 df 1398 ) 1399}) 1400