1## This can only be done in a locale that extends Latin-1
2(inf <- l10n_info())
3if(!(inf$`UTF-8` || inf$`Latin-1`)) {
4    warning("this test must be done in a Latin-1 or UTF-8 locale")
5    q()
6}
7
8inp <- readLines(n = 2)
9Latin-1 accented chars�: �� �� �<� �<� � ��
10��
11
12inp
13(txt <- iconv(inp[1], "latin1", ""))
14(pat <- iconv(inp[2], "latin1", ""))
15if(any(is.na(c(txt, pat)))) {
16    ## backup test
17    warning("this test must be done in a Latin-1 or UTF-8 locale")
18    q()
19}
20
21testit <- function(x) {print(x); stopifnot(identical(x, 1L))}
22testit(grep(pat, txt))
23testit(grep(pat, txt, ignore.case = TRUE))
24testit(grep(pat, txt, useBytes = TRUE))
25testit(grep(pat, txt, ignore.case = TRUE, useBytes = TRUE))
26testit(grep(pat, txt, fixed = TRUE))
27testit(grep(pat, txt, fixed = TRUE, useBytes = TRUE))
28testit(grep(pat, txt, perl = TRUE))
29testit(grep(pat, txt, ignore.case = TRUE, perl = TRUE))
30testit(grep(pat, txt, perl = TRUE, useBytes = TRUE))
31testit(grep(pat, txt, ignore.case = TRUE, perl = TRUE, useBytes = TRUE))
32testit(grep(toupper(pat), txt, ignore.case = TRUE))
33testit(grep(toupper(pat), txt, ignore.case = TRUE, perl = TRUE))
34## matches in Latin-1 but not in UTF-8
35grep(toupper(pat), txt, ignore.case = TRUE, perl = TRUE, useBytes = TRUE)
36
37(r1 <- regexpr("en", txt, fixed=TRUE))
38(r2 <- regexpr("en", txt, fixed=TRUE, useBytes=TRUE))
39stopifnot(identical(r1, regexpr("en", txt)))
40stopifnot(identical(r2, regexpr("en", txt, useBytes = TRUE)))
41stopifnot(identical(r1, regexpr("en", txt, perl=TRUE)))
42stopifnot(identical(r2, regexpr("en", txt, perl=TRUE, useBytes=TRUE)))
43stopifnot(identical(r1, regexpr("EN", txt, ignore.case=TRUE)))
44stopifnot(identical(r2, regexpr("EN", txt, ignore.case=TRUE, useBytes=TRUE)))
45stopifnot(identical(r1, regexpr("EN", txt, ignore.case=TRUE, perl=TRUE)))
46stopifnot(identical(r2, regexpr("EN", txt, ignore.case=TRUE, perl=TRUE,
47                                useBytes=TRUE)))
48
49(r1 <- regexpr(pat, txt, fixed=TRUE))
50(r2 <- regexpr(pat, txt, fixed=TRUE, useBytes=TRUE))
51stopifnot(identical(r1, regexpr(pat, txt)))
52stopifnot(identical(r2, regexpr(pat, txt, useBytes=TRUE)))
53stopifnot(identical(r1, regexpr(pat, txt, perl=TRUE)))
54stopifnot(identical(r2, regexpr(pat, txt, perl=TRUE, useBytes=TRUE)))
55stopifnot(identical(r1, regexpr(pat, txt, ignore.case=TRUE)))
56stopifnot(identical(r2, regexpr(pat, txt, ignore.case=TRUE, useBytes=TRUE)))
57stopifnot(identical(r1, regexpr(pat, txt, ignore.case=TRUE, perl=TRUE)))
58stopifnot(identical(r2, regexpr(pat, txt, ignore.case=TRUE, perl=TRUE,
59                                useBytes=TRUE)))
60pat2 <- toupper(pat)
61stopifnot(identical(r1, regexpr(pat2, txt, ignore.case=TRUE)))
62stopifnot(identical(r1, regexpr(pat2, txt, ignore.case=TRUE, perl=TRUE)))
63## will not match in a UTF-8 locale
64regexpr(pat2, txt, ignore.case=TRUE, perl=TRUE, useBytes=TRUE)
65
66
67(r1 <- gregexpr(pat, txt, fixed=TRUE))
68(r2 <- gregexpr(pat, txt, fixed=TRUE, useBytes=TRUE))
69stopifnot(identical(r1, gregexpr(pat, txt)))
70stopifnot(identical(r2, gregexpr(pat, txt, useBytes=TRUE)))
71stopifnot(identical(r1, gregexpr(pat, txt, perl=TRUE)))
72stopifnot(identical(r2, gregexpr(pat, txt, perl=TRUE, useBytes=TRUE)))
73stopifnot(identical(r1, gregexpr(pat, txt, ignore.case=TRUE)))
74stopifnot(identical(r2, gregexpr(pat, txt, ignore.case=TRUE, useByte=TRUE)))
75stopifnot(identical(r1, gregexpr(pat, txt, ignore.case=TRUE, perl=TRUE)))
76stopifnot(identical(r2, gregexpr(pat, txt, ignore.case=TRUE, perl=TRUE,
77                                 useBytes=TRUE)))
78
79txt2 <- c("The", "licenses", "for", "most", "software", "are",
80  "designed", "to", "take", "away", "your", "freedom",
81  "to", "share", "and", "change", "it.",
82   "", "By", "contrast,", "the", "GNU", "General", "Public", "License",
83   "is", "intended", "to", "guarantee", "your", "freedom", "to",
84   "share", "and", "change", "free", "software", "--",
85   "to", "make", "sure", "the", "software", "is",
86   "free", "for", "all", "its", "users")
87( i <- grep("[gu]", txt2, perl = TRUE) )
88stopifnot(identical(i, grep("[gu]", txt2)))
89## results depend on the locale
90(ot <- sub("[b-e]",".", txt2, perl = TRUE))
91txt2[ot != sub("[b-e]",".", txt2)]
92(ot <- sub("[b-e]",".", txt2, ignore.case = TRUE, perl = TRUE))
93txt2[ot != sub("[b-e]",".", txt2, ignore.case = TRUE)]
94
95
96## These may end up with different encodings: == copes, identical does not
97
98eq <- function(a, b) a == b
99(r1 <- gsub(pat, "ef", txt))
100stopifnot(eq(r1, gsub(pat, "ef", txt, useBytes = TRUE)))
101stopifnot(eq(r1, gsub(pat, "ef", txt, fixed = TRUE)))
102stopifnot(eq(r1, gsub(pat, "ef", txt, fixed = TRUE, useBytes = TRUE)))
103stopifnot(eq(r1, gsub(pat, "ef", txt, perl = TRUE)))
104stopifnot(eq(r1, gsub(pat, "ef", txt, perl = TRUE, useBytes = TRUE)))
105
106pat <- substr(pat, 1, 1)
107(r1 <- gsub(pat, "gh", txt))
108stopifnot(eq(r1, gsub(pat, "gh", txt, useBytes = TRUE)))
109stopifnot(eq(r1, gsub(pat, "gh", txt, fixed = TRUE)))
110stopifnot(eq(r1, gsub(pat, "gh", txt, fixed = TRUE, useBytes = TRUE)))
111stopifnot(eq(r1, gsub(pat, "gh", txt, perl = TRUE)))
112stopifnot(eq(r1, gsub(pat, "gh", txt, perl = TRUE, useBytes = TRUE)))
113
114
115stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx"))
116stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx"), perl = TRUE)
117stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx"), useBytes = TRUE)
118stopifnot(identical(gsub("a*", "x", "baaac"), "xbxcx"), perl = TRUE, useBytes = TRUE)
119
120## this one seems system-dependent
121(x <- gsub("\\b", "|", "The quick brown \ue8\ue9", perl = TRUE))
122# stopifnot(identical(x, "|The| |quick| |brown| |\ue8\ue9|"))
123(x <- gsub("\\b", "|", "The quick brown fox", perl = TRUE))
124stopifnot(identical(x, "|The| |quick| |brown| |fox|"))
125## The following is warned against in the help page, but worked in some versions
126gsub("\\b", "|", "The quick brown fox")
127
128(z <- strsplit(txt, pat)[[1]])
129stopifnot(eq(z, strsplit(txt, pat, useBytes = TRUE)[[1]]))
130stopifnot(eq(z, strsplit(txt, pat, fixed = TRUE)[[1]]))
131stopifnot(eq(z, strsplit(txt, pat, fixed = TRUE, useBytes = TRUE)[[1]]))
132stopifnot(eq(z, strsplit(txt, pat, perl = TRUE)[[1]]))
133stopifnot(eq(z, strsplit(txt, pat, perl = TRUE, useBytes = TRUE)[[1]]))
134
135(z <- strsplit(txt, "[a-c]")[[1]])
136stopifnot(eq(z, strsplit(txt, "[a-c]", useBytes = TRUE)[[1]]))
137stopifnot(eq(z, strsplit(txt, "[a-c]", perl = TRUE)[[1]]))
138stopifnot(eq(z, strsplit(txt, "[a-c]", perl = TRUE, useBytes = TRUE)[[1]]))
139
140## from strsplit.Rd
141z <- strsplit("A text I want to display with spaces", NULL)[[1]]
142stopifnot(identical(z,
143                    strsplit("A text I want to display with spaces", "")[[1]]))
144
145x <- c(as = "asfef", qu = "qwerty", "yuiop[", "b", "stuff.blah.yech")
146(z <- strsplit(x, "e"))
147stopifnot(identical(z, strsplit(x, "e", useBytes = TRUE)))
148stopifnot(identical(z, strsplit(x, "e", fixed = TRUE)))
149stopifnot(identical(z, strsplit(x, "e", fixed = TRUE, useBytes = TRUE)))
150stopifnot(identical(z, strsplit(x, "e", perl = TRUE)))
151stopifnot(identical(z, strsplit(x, "e", perl = TRUE, useBytes = TRUE)))
152
153## moved from reg-tests-1b.R.
154## fails to match on Cygwin, Mar 2011
155## regexpr(fixed = TRUE) with a single-byte pattern matching to a MBCS string
156x <- iconv("fa\xE7ile a ", "latin1", "UTF-8")
157stopifnot(identical(regexpr(" ", x), regexpr(" ", x, fixed=TRUE)))
158# fixed=TRUE reported match position in bytes in R <= 2.10.0
159stopifnot(identical(regexpr(" a", x), regexpr(" a", x, fixed=TRUE)))
160## always worked.
161
162## this broke and segfaulted in 2.13.1 and earlier (PR#14627)
163x <- paste(rep("a ", 600), collapse="")
164testit(agrep(x, x))
165testit(agrep(x, x, max.distance=0.5))
166
167## this is used in QC to check dependencies and was broken intermittently by TRE changes
168stopifnot(isTRUE(grepl('^[[:space:]]*(R|[[:alpha:]][[:alnum:].]*[[:alnum:]])([[:space:]]*\\(([^) ]+)[[:space:]]+([^) ]+)\\))?[[:space:]]*$', ' R (>= 2.13.0) ')))
169
170## Bad sub() and gsub() with some regexprs PR#16009
171x <- c(NA, "  abc", "a b c    ", "a  b c")
172(y <- gsub("\\s{2,}", " ", x))
173stopifnot(y[-1] == c(" abc", "a b c ", "a b c"))
174x <- c("\ue4", "  abc", "a b c    ", "a  b c")
175(y <- gsub("\\s{2,}", " ", x))
176stopifnot(y == c(x[1], " abc", "a b c ", "a b c"))
177## results were c(x[1], " ", " ", " ") in both cases in R 3.1.1
178
179## Bad mapping of code points to characters with surrogate pairs (in R 4.0)
180stopifnot(regexpr("b", "\U0001F937b", perl = TRUE) == 2)
181
182## Mixed MBCS and "bytes" encoded, regression in r73569 (Bugzilla 18021)
183x <- rep("\u00e9ab", 2)
184Encoding(x[2]) <- "bytes"
185res <- c("a", "a")
186stopifnot(identical(regmatches(x, regexpr("a", x, perl=TRUE)), res),
187          identical(regmatches(x, regexpr("a", x)), res),
188          identical(unlist(regmatches(x, regexpr("a", x, perl=TRUE))), res),
189          identical(unlist(regmatches(x, regexpr("a", x))), res),
190          identical(unlist(regmatches(x, regexec("a", x, perl=TRUE))), res),
191          identical(unlist(regmatches(x, regexec("a", x))), res),
192          ## Fixed = TRUE
193          identical(regmatches(x, regexpr("a", x, fixed=TRUE)), res),
194          identical(unlist(regmatches(x, regexpr("a", x, fixed=TRUE))), res),
195          identical(unlist(regmatches(x, regexec("a", x, fixed=TRUE))), res))
196
197## Bytes index computation on ASCII used as "character" on non-ASCII
198## Identical itself produces error if we end up with byte encoded
199## values, which is what we're trying to avoid.
200
201x <- rep("eab", 2)
202y <- c("eab", "e\u03b1b")
203res <- c("a", "\u03b1")
204stopifnot(identical(regmatches(y, regexpr("a", x)), res),
205          identical(regmatches(y, regexpr("a", x, perl=TRUE)), res),
206          identical(unlist(regmatches(y, gregexpr("a", x))), res),
207          identical(unlist(regmatches(y, gregexpr("a", x, perl=TRUE))), res),
208          identical(unlist(regmatches(y, regexec("a", x))), res),
209          identical(unlist(regmatches(y, regexec("a", x, perl=TRUE))), res),
210          ## Fixed = TRUE
211          identical(regmatches(y, regexpr("a", x, fixed=TRUE)), res),
212          identical(unlist(regmatches(y, gregexpr("a", x, fixed=TRUE))), res),
213          identical(unlist(regmatches(y, regexec("a", x, fixed=TRUE))), res))
214
215## This is an adapted `gregexec` implementation from the example of `?grep`.
216## We will use it to test `gregexec`.
217ex_fn <- function(pattern, text, useBytes = FALSE, perl = FALSE) {
218    lapply(
219        regmatches(
220            text,
221            gregexpr(pattern, text, useBytes = useBytes, perl = perl)
222        ),
223        function(e) {
224            pos <- regexec(pattern, e, useBytes = useBytes, perl = perl)
225            res <- regmatches(e, pos)
226            if(length(res)) do.call(cbind, res) else character()
227        }
228    )
229}
230
231## Captures patterns like LETTERS123 (plus a couple of Unicode chars).
232p.1.raw <- "(?:.* )?(%s[[:alpha:]\u00e9\u00d6]+)(%s[[:digit:]]+)(?: .*)?"
233p.1 <- sprintf(p.1.raw, "", "")
234p.1n <- sprintf(p.1.raw, "?<a>", "?<b>")   ## named capture groups
235s.utf8 <-  "H\u00e9320+W\u00d641"
236s.1 <- c(
237    "Test: A1-BC23 boo", ## matches and extra
238    "DE35",              ## one full match
239    "boo",               ## nomatch
240    NA,                  ## NA
241    s.utf8               ## UTF8 string
242)
243gr         <- gregexec(p.1, s.1, perl=FALSE)
244gr.ub      <- gregexec(p.1, s.1, perl=FALSE, useBytes=TRUE)
245gr.perl    <- gregexec(p.1n, s.1, perl=TRUE)
246gr.perl.ub <- gregexec(p.1n, s.1, perl=TRUE, useBytes=TRUE)
247
248m.gr       <- regmatches(s.1, gr)
249m.gr       # inspect visually
250
251m.gr.ub    <- regmatches(s.1, gr.ub)
252Encoding(m.gr.ub[[5L]]) <- "UTF-8"
253m.gr.ub.ex <- ex_fn(p.1, s.1, perl=FALSE, useBytes=TRUE)
254Encoding(m.gr.ub.ex[[5L]]) <- "UTF-8"
255
256## Named captures
257m.by.name <- do.call(cbind, regmatches(s.1, gr.perl))
258m.by.name.1 <- do.call(cbind, regmatches(s.1, regexec(p.1n, s.1, perl=TRUE)))
259
260stopifnot(
261    ## Compare to ?grep example function
262    identical(m.gr, ex_fn(p.1, s.1, perl=FALSE)),
263    identical(m.gr.ub, m.gr.ub.ex),
264    identical(regmatches(s.1, gr.perl), ex_fn(p.1n, s.1, perl=TRUE)),
265    identical(regmatches(s.1, gr.perl.ub),
266              ex_fn(p.1n, s.1, perl=TRUE, useBytes=TRUE)),
267    ## Byte matching increments faster, but matches the same
268    all(gr.ub[[5L]] - gr[[5L]] == c(0L, 0L, 1L, 1L, 1L, 2L)),
269    identical(m.gr, m.gr.ub),
270    ## Perl and non-Perl match the same (in this case)
271    identical(m.gr, regmatches(s.1, gregexec(p.1, s.1, perl=TRUE))),
272    ## Check perl actually using TRE (no named capture support)
273    inherits(try(gregexec(p.1n, s.1), silent=TRUE), "try-error"),
274    ## Named groups work
275    identical(gr.perl[[1]]["b",], c(8L, 12L)),
276    ## Corner cases
277    identical(gregexec(p.1, character()), list()),
278    identical(gregexec(p.1n, character(), perl=TRUE), list()),
279    identical(gregexec(p.1, NULL), list()),
280    identical(gregexec(p.1n, NULL, perl=TRUE), list()),
281    ## Named capture carry over to matches
282    identical(m.by.name["a",], c("A", "BC", "DE", "H\u00e9", "W\u00d6")),
283    identical(m.by.name["b",], c("1", "23", "35", "320", "41")),
284    identical(m.by.name.1["a",], c("A", "DE", "H\u00e9")),
285    identical(m.by.name.1["b",], c("1", "35", "320"))
286)
287
288## Invert and `regmatches<-` do not work with overlapping captures,
289## but should work if we drop the full match from our data.
290drop_first_capt <- function(x) {
291    ml <- attr(x, 'match.length')[-1L,]
292    x <- x[-1L,]
293    attr(x, 'match.length') <- ml
294    x
295}
296
297## Replace with lower case and multiply nums by 100
298s.2 <- s.2a <- s.1[c(1L,5L)]
299gr.2 <- lapply(gregexec(p.1, s.2), drop_first_capt)
300m.gr.2 <- regmatches(s.2, gr.2)
301replacement <- lapply(m.gr.2, tolower)
302replacement[[1]][2,] <- as.numeric(replacement[[1]][2,]) * 100
303replacement[[2]][2,] <- as.numeric(replacement[[2]][2,]) * 100
304s.2a <- s.2
305regmatches(s.2a, gr.2) <- replacement
306
307## Replace with `invert=TRUE`
308s.2b <- s.2
309regmatches(s.2b, gr.2, invert=TRUE) <-
310    replicate(2L, c("~", "#", "~", "@", "~"), simplify=FALSE)
311
312stopifnot(
313    identical(regmatches(s.2, gr.2, invert=TRUE),
314              list(c("Test: ", "", "-", "", " boo"), c("", "", "+", "", ""))),
315    identical(s.2a, c("Test: a100-bc2300 boo", "h\u00e932000+w\u00f64100")),
316    identical(s.2b, c("~A#1~BC@23~", "~H\u00e9#320~W\u00d6@41~")))
317
318## Check that the perl switch is working fully (h/t Michael Chirico)
319pat <- "(?<first>\\d+)"
320gregexec(pat, "123 456", perl=TRUE)
321## TRE does not support name capts
322stopifnot(inherits(try(gregexec(pat, "123 456", perl=FALSE)), "try-error"))
323local({
324    old.warn <- options(warn = 2)
325    on.exit(options(old.warn))
326    gregexec("123", "123 456", fixed=TRUE) # No warning with perl=FALSE
327})
328
329