1## This can only be done in a locale that extends Latin-1 2(inf <- l10n_info()) 3if(!(inf$`UTF-8` || inf$`Latin-1`)) { 4 warning("this test must be done in a Latin-1 or UTF-8 locale") 5 q() 6} 7 8inp <- readLines(n = 2) 9�Latin-1 accented chars�: �� �� �<� �<� � �� 10�� 11 12inp 13(txt <- iconv(inp[1], "latin1", "")) 14(pat <- iconv(inp[2], "latin1", "")) 15if(any(is.na(c(txt, pat)))) { 16 ## backup test 17 warning("this test must be done in a Latin-1 or UTF-8 locale") 18 q() 19} 20 21testit <- function(x) {print(x); stopifnot(identical(x, 1L))} 22testit(grep(pat, txt)) 23testit(grep(pat, txt, ignore.case = TRUE)) 24testit(grep(pat, txt, useBytes = TRUE)) 25testit(grep(pat, txt, ignore.case = TRUE, useBytes = TRUE)) 26testit(grep(pat, txt, fixed = TRUE)) 27testit(grep(pat, txt, fixed = TRUE, useBytes = TRUE)) 28testit(grep(pat, txt, perl = TRUE)) 29testit(grep(pat, txt, ignore.case = TRUE, perl = TRUE)) 30testit(grep(pat, txt, perl = TRUE, useBytes = TRUE)) 31testit(grep(pat, txt, ignore.case = TRUE, perl = TRUE, useBytes = TRUE)) 32testit(grep(toupper(pat), txt, ignore.case = TRUE)) 33testit(grep(toupper(pat), txt, ignore.case = TRUE, perl = TRUE)) 34## matches in Latin-1 but not in UTF-8 35grep(toupper(pat), txt, ignore.case = TRUE, perl = TRUE, useBytes = TRUE) 36 37(r1 <- regexpr("en", txt, fixed=TRUE)) 38(r2 <- regexpr("en", txt, fixed=TRUE, useBytes=TRUE)) 39stopifnot(identical(r1, regexpr("en", txt))) 40stopifnot(identical(r2, regexpr("en", txt, useBytes = TRUE))) 41stopifnot(identical(r1, regexpr("en", txt, perl=TRUE))) 42stopifnot(identical(r2, regexpr("en", txt, perl=TRUE, useBytes=TRUE))) 43stopifnot(identical(r1, regexpr("EN", txt, ignore.case=TRUE))) 44stopifnot(identical(r2, regexpr("EN", txt, ignore.case=TRUE, useBytes=TRUE))) 45stopifnot(identical(r1, regexpr("EN", txt, ignore.case=TRUE, perl=TRUE))) 46stopifnot(identical(r2, regexpr("EN", txt, ignore.case=TRUE, perl=TRUE, 47 useBytes=TRUE))) 48 49(r1 <- regexpr(pat, txt, fixed=TRUE)) 50(r2 <- regexpr(pat, txt, fixed=TRUE, useBytes=TRUE)) 51stopifnot(identical(r1, regexpr(pat, txt))) 52stopifnot(identical(r2, regexpr(pat, txt, useBytes=TRUE))) 53stopifnot(identical(r1, regexpr(pat, txt, perl=TRUE))) 54stopifnot(identical(r2, regexpr(pat, txt, perl=TRUE, useBytes=TRUE))) 55stopifnot(identical(r1, regexpr(pat, txt, ignore.case=TRUE))) 56stopifnot(identical(r2, regexpr(pat, txt, ignore.case=TRUE, useBytes=TRUE))) 57stopifnot(identical(r1, regexpr(pat, txt, ignore.case=TRUE, perl=TRUE))) 58stopifnot(identical(r2, regexpr(pat, txt, ignore.case=TRUE, perl=TRUE, 59 useBytes=TRUE))) 60pat2 <- toupper(pat) 61stopifnot(identical(r1, regexpr(pat2, txt, ignore.case=TRUE))) 62stopifnot(identical(r1, regexpr(pat2, txt, ignore.case=TRUE, perl=TRUE))) 63## will not match in a UTF-8 locale 64regexpr(pat2, txt, ignore.case=TRUE, perl=TRUE, useBytes=TRUE) 65 66 67(r1 <- gregexpr(pat, txt, fixed=TRUE)) 68(r2 <- gregexpr(pat, txt, fixed=TRUE, useBytes=TRUE)) 69stopifnot(identical(r1, gregexpr(pat, txt))) 70stopifnot(identical(r2, gregexpr(pat, txt, useBytes=TRUE))) 71stopifnot(identical(r1, gregexpr(pat, txt, perl=TRUE))) 72stopifnot(identical(r2, gregexpr(pat, txt, perl=TRUE, useBytes=TRUE))) 73stopifnot(identical(r1, gregexpr(pat, txt, ignore.case=TRUE))) 74stopifnot(identical(r2, gregexpr(pat, txt, ignore.case=TRUE, useByte=TRUE))) 75stopifnot(identical(r1, gregexpr(pat, txt, ignore.case=TRUE, perl=TRUE))) 76stopifnot(identical(r2, gregexpr(pat, txt, ignore.case=TRUE, perl=TRUE, 77 useBytes=TRUE))) 78 79txt2 <- c("The", "licenses", "for", "most", "software", "are", 80 "designed", "to", "take", "away", "your", "freedom", 81 "to", "share", "and", "change", "it.", 82 "", "By", "contrast,", "the", "GNU", "General", "Public", "License", 83 "is", "intended", "to", "guarantee", "your", "freedom", "to", 84 "share", "and", "change", "free", "software", "--", 85 "to", "make", "sure", "the", "software", "is", 86 "free", "for", "all", "its", "users") 87( i <- grep("[gu]", txt2, perl = TRUE) ) 88stopifnot(identical(i, grep("[gu]", txt2))) 89## results depend on the locale 90(ot <- sub("[b-e]",".", txt2, perl = TRUE)) 91txt2[ot != sub("[b-e]",".", txt2)] 92(ot <- sub("[b-e]",".", txt2, ignore.case = TRUE, perl = TRUE)) 93txt2[ot != sub("[b-e]",".", txt2, ignore.case = TRUE)] 94 95 96## These may end up with different encodings: == copes, identical does not 97 98eq <- function(a, b) a == b 99(r1 <- gsub(pat, "ef", txt)) 100stopifnot(eq(r1, gsub(pat, "ef", txt, useBytes = TRUE))) 101stopifnot(eq(r1, gsub(pat, "ef", txt, fixed = TRUE))) 102stopifnot(eq(r1, gsub(pat, "ef", txt, fixed = TRUE, useBytes = TRUE))) 103stopifnot(eq(r1, gsub(pat, "ef", txt, perl = TRUE))) 104stopifnot(eq(r1, gsub(pat, "ef", txt, perl = TRUE, useBytes = TRUE))) 105 106pat <- substr(pat, 1, 1) 107(r1 <- gsub(pat, "gh", txt)) 108stopifnot(eq(r1, gsub(pat, "gh", txt, useBytes = TRUE))) 109stopifnot(eq(r1, gsub(pat, "gh", txt, fixed = TRUE))) 110stopifnot(eq(r1, gsub(pat, "gh", txt, fixed = TRUE, useBytes = TRUE))) 111stopifnot(eq(r1, gsub(pat, "gh", txt, perl = TRUE))) 112stopifnot(eq(r1, gsub(pat, "gh", txt, perl = TRUE, useBytes = TRUE))) 113 114 115stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx")) 116stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx"), perl = TRUE) 117stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx"), useBytes = TRUE) 118stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx"), perl = TRUE, useBytes = TRUE) 119 120## this one seems system-dependent 121(x <- gsub("\\b", "|", "The quick brown \ue8\ue9", perl = TRUE)) 122# stopifnot(identical(x, "|The| |quick| |brown| |\ue8\ue9|")) 123(x <- gsub("\\b", "|", "The quick brown fox", perl = TRUE)) 124stopifnot(identical(x, "|The| |quick| |brown| |fox|")) 125## The following is warned against in the help page, but worked in some versions 126gsub("\\b", "|", "The quick brown fox") 127 128(z <- strsplit(txt, pat)[[1]]) 129stopifnot(eq(z, strsplit(txt, pat, useBytes = TRUE)[[1]])) 130stopifnot(eq(z, strsplit(txt, pat, fixed = TRUE)[[1]])) 131stopifnot(eq(z, strsplit(txt, pat, fixed = TRUE, useBytes = TRUE)[[1]])) 132stopifnot(eq(z, strsplit(txt, pat, perl = TRUE)[[1]])) 133stopifnot(eq(z, strsplit(txt, pat, perl = TRUE, useBytes = TRUE)[[1]])) 134 135(z <- strsplit(txt, "[a-c]")[[1]]) 136stopifnot(eq(z, strsplit(txt, "[a-c]", useBytes = TRUE)[[1]])) 137stopifnot(eq(z, strsplit(txt, "[a-c]", perl = TRUE)[[1]])) 138stopifnot(eq(z, strsplit(txt, "[a-c]", perl = TRUE, useBytes = TRUE)[[1]])) 139 140## from strsplit.Rd 141z <- strsplit("A text I want to display with spaces", NULL)[[1]] 142stopifnot(identical(z, 143 strsplit("A text I want to display with spaces", "")[[1]])) 144 145x <- c(as = "asfef", qu = "qwerty", "yuiop[", "b", "stuff.blah.yech") 146(z <- strsplit(x, "e")) 147stopifnot(identical(z, strsplit(x, "e", useBytes = TRUE))) 148stopifnot(identical(z, strsplit(x, "e", fixed = TRUE))) 149stopifnot(identical(z, strsplit(x, "e", fixed = TRUE, useBytes = TRUE))) 150stopifnot(identical(z, strsplit(x, "e", perl = TRUE))) 151stopifnot(identical(z, strsplit(x, "e", perl = TRUE, useBytes = TRUE))) 152 153## moved from reg-tests-1b.R. 154## fails to match on Cygwin, Mar 2011 155## regexpr(fixed = TRUE) with a single-byte pattern matching to a MBCS string 156x <- iconv("fa\xE7ile a ", "latin1", "UTF-8") 157stopifnot(identical(regexpr(" ", x), regexpr(" ", x, fixed=TRUE))) 158# fixed=TRUE reported match position in bytes in R <= 2.10.0 159stopifnot(identical(regexpr(" a", x), regexpr(" a", x, fixed=TRUE))) 160## always worked. 161 162## this broke and segfaulted in 2.13.1 and earlier (PR#14627) 163x <- paste(rep("a ", 600), collapse="") 164testit(agrep(x, x)) 165testit(agrep(x, x, max.distance=0.5)) 166 167## this is used in QC to check dependencies and was broken intermittently by TRE changes 168stopifnot(isTRUE(grepl('^[[:space:]]*(R|[[:alpha:]][[:alnum:].]*[[:alnum:]])([[:space:]]*\\(([^) ]+)[[:space:]]+([^) ]+)\\))?[[:space:]]*$', ' R (>= 2.13.0) '))) 169 170## Bad sub() and gsub() with some regexprs PR#16009 171x <- c(NA, " abc", "a b c ", "a b c") 172(y <- gsub("\\s{2,}", " ", x)) 173stopifnot(y[-1] == c(" abc", "a b c ", "a b c")) 174x <- c("\ue4", " abc", "a b c ", "a b c") 175(y <- gsub("\\s{2,}", " ", x)) 176stopifnot(y == c(x[1], " abc", "a b c ", "a b c")) 177## results were c(x[1], " ", " ", " ") in both cases in R 3.1.1 178 179## Bad mapping of code points to characters with surrogate pairs (in R 4.0) 180stopifnot(regexpr("b", "\U0001F937b", perl = TRUE) == 2) 181 182## Mixed MBCS and "bytes" encoded, regression in r73569 (Bugzilla 18021) 183x <- rep("\u00e9ab", 2) 184Encoding(x[2]) <- "bytes" 185res <- c("a", "a") 186stopifnot(identical(regmatches(x, regexpr("a", x, perl=TRUE)), res), 187 identical(regmatches(x, regexpr("a", x)), res), 188 identical(unlist(regmatches(x, regexpr("a", x, perl=TRUE))), res), 189 identical(unlist(regmatches(x, regexpr("a", x))), res), 190 identical(unlist(regmatches(x, regexec("a", x, perl=TRUE))), res), 191 identical(unlist(regmatches(x, regexec("a", x))), res), 192 ## Fixed = TRUE 193 identical(regmatches(x, regexpr("a", x, fixed=TRUE)), res), 194 identical(unlist(regmatches(x, regexpr("a", x, fixed=TRUE))), res), 195 identical(unlist(regmatches(x, regexec("a", x, fixed=TRUE))), res)) 196 197## Bytes index computation on ASCII used as "character" on non-ASCII 198## Identical itself produces error if we end up with byte encoded 199## values, which is what we're trying to avoid. 200 201x <- rep("eab", 2) 202y <- c("eab", "e\u03b1b") 203res <- c("a", "\u03b1") 204stopifnot(identical(regmatches(y, regexpr("a", x)), res), 205 identical(regmatches(y, regexpr("a", x, perl=TRUE)), res), 206 identical(unlist(regmatches(y, gregexpr("a", x))), res), 207 identical(unlist(regmatches(y, gregexpr("a", x, perl=TRUE))), res), 208 identical(unlist(regmatches(y, regexec("a", x))), res), 209 identical(unlist(regmatches(y, regexec("a", x, perl=TRUE))), res), 210 ## Fixed = TRUE 211 identical(regmatches(y, regexpr("a", x, fixed=TRUE)), res), 212 identical(unlist(regmatches(y, gregexpr("a", x, fixed=TRUE))), res), 213 identical(unlist(regmatches(y, regexec("a", x, fixed=TRUE))), res)) 214 215## This is an adapted `gregexec` implementation from the example of `?grep`. 216## We will use it to test `gregexec`. 217ex_fn <- function(pattern, text, useBytes = FALSE, perl = FALSE) { 218 lapply( 219 regmatches( 220 text, 221 gregexpr(pattern, text, useBytes = useBytes, perl = perl) 222 ), 223 function(e) { 224 pos <- regexec(pattern, e, useBytes = useBytes, perl = perl) 225 res <- regmatches(e, pos) 226 if(length(res)) do.call(cbind, res) else character() 227 } 228 ) 229} 230 231## Captures patterns like LETTERS123 (plus a couple of Unicode chars). 232p.1.raw <- "(?:.* )?(%s[[:alpha:]\u00e9\u00d6]+)(%s[[:digit:]]+)(?: .*)?" 233p.1 <- sprintf(p.1.raw, "", "") 234p.1n <- sprintf(p.1.raw, "?<a>", "?<b>") ## named capture groups 235s.utf8 <- "H\u00e9320+W\u00d641" 236s.1 <- c( 237 "Test: A1-BC23 boo", ## matches and extra 238 "DE35", ## one full match 239 "boo", ## nomatch 240 NA, ## NA 241 s.utf8 ## UTF8 string 242) 243gr <- gregexec(p.1, s.1, perl=FALSE) 244gr.ub <- gregexec(p.1, s.1, perl=FALSE, useBytes=TRUE) 245gr.perl <- gregexec(p.1n, s.1, perl=TRUE) 246gr.perl.ub <- gregexec(p.1n, s.1, perl=TRUE, useBytes=TRUE) 247 248m.gr <- regmatches(s.1, gr) 249m.gr # inspect visually 250 251m.gr.ub <- regmatches(s.1, gr.ub) 252Encoding(m.gr.ub[[5L]]) <- "UTF-8" 253m.gr.ub.ex <- ex_fn(p.1, s.1, perl=FALSE, useBytes=TRUE) 254Encoding(m.gr.ub.ex[[5L]]) <- "UTF-8" 255 256## Named captures 257m.by.name <- do.call(cbind, regmatches(s.1, gr.perl)) 258m.by.name.1 <- do.call(cbind, regmatches(s.1, regexec(p.1n, s.1, perl=TRUE))) 259 260stopifnot( 261 ## Compare to ?grep example function 262 identical(m.gr, ex_fn(p.1, s.1, perl=FALSE)), 263 identical(m.gr.ub, m.gr.ub.ex), 264 identical(regmatches(s.1, gr.perl), ex_fn(p.1n, s.1, perl=TRUE)), 265 identical(regmatches(s.1, gr.perl.ub), 266 ex_fn(p.1n, s.1, perl=TRUE, useBytes=TRUE)), 267 ## Byte matching increments faster, but matches the same 268 all(gr.ub[[5L]] - gr[[5L]] == c(0L, 0L, 1L, 1L, 1L, 2L)), 269 identical(m.gr, m.gr.ub), 270 ## Perl and non-Perl match the same (in this case) 271 identical(m.gr, regmatches(s.1, gregexec(p.1, s.1, perl=TRUE))), 272 ## Check perl actually using TRE (no named capture support) 273 inherits(try(gregexec(p.1n, s.1), silent=TRUE), "try-error"), 274 ## Named groups work 275 identical(gr.perl[[1]]["b",], c(8L, 12L)), 276 ## Corner cases 277 identical(gregexec(p.1, character()), list()), 278 identical(gregexec(p.1n, character(), perl=TRUE), list()), 279 identical(gregexec(p.1, NULL), list()), 280 identical(gregexec(p.1n, NULL, perl=TRUE), list()), 281 ## Named capture carry over to matches 282 identical(m.by.name["a",], c("A", "BC", "DE", "H\u00e9", "W\u00d6")), 283 identical(m.by.name["b",], c("1", "23", "35", "320", "41")), 284 identical(m.by.name.1["a",], c("A", "DE", "H\u00e9")), 285 identical(m.by.name.1["b",], c("1", "35", "320")) 286) 287 288## Invert and `regmatches<-` do not work with overlapping captures, 289## but should work if we drop the full match from our data. 290drop_first_capt <- function(x) { 291 ml <- attr(x, 'match.length')[-1L,] 292 x <- x[-1L,] 293 attr(x, 'match.length') <- ml 294 x 295} 296 297## Replace with lower case and multiply nums by 100 298s.2 <- s.2a <- s.1[c(1L,5L)] 299gr.2 <- lapply(gregexec(p.1, s.2), drop_first_capt) 300m.gr.2 <- regmatches(s.2, gr.2) 301replacement <- lapply(m.gr.2, tolower) 302replacement[[1]][2,] <- as.numeric(replacement[[1]][2,]) * 100 303replacement[[2]][2,] <- as.numeric(replacement[[2]][2,]) * 100 304s.2a <- s.2 305regmatches(s.2a, gr.2) <- replacement 306 307## Replace with `invert=TRUE` 308s.2b <- s.2 309regmatches(s.2b, gr.2, invert=TRUE) <- 310 replicate(2L, c("~", "#", "~", "@", "~"), simplify=FALSE) 311 312stopifnot( 313 identical(regmatches(s.2, gr.2, invert=TRUE), 314 list(c("Test: ", "", "-", "", " boo"), c("", "", "+", "", ""))), 315 identical(s.2a, c("Test: a100-bc2300 boo", "h\u00e932000+w\u00f64100")), 316 identical(s.2b, c("~A#1~BC@23~", "~H\u00e9#320~W\u00d6@41~"))) 317 318## Check that the perl switch is working fully (h/t Michael Chirico) 319pat <- "(?<first>\\d+)" 320gregexec(pat, "123 456", perl=TRUE) 321## TRE does not support name capts 322stopifnot(inherits(try(gregexec(pat, "123 456", perl=FALSE)), "try-error")) 323local({ 324 old.warn <- options(warn = 2) 325 on.exit(options(old.warn)) 326 gregexec("123", "123 456", fixed=TRUE) # No warning with perl=FALSE 327}) 328 329