1#' Match function
2#'
3#' @param data character vector to match against
4#' @param pattern regular expression to use for matching
5#' @param global use global matching
6#' @param options regular expression options
7#' @param locations rather than returning the values of the matched (or
8#' captured) string, return a \code{data.frame} of the match locations in the
9#' string.
10#' @param ... options passed to regexpr or gregexpr
11#' @return if no captures, returns a logical vector the same length as the
12#' input character vector specifying if the relevant value matched or not.  If
13#' there are captures in the regular expression, returns a \code{data.frame} with a
14#' column for each capture group.  If \code{global} is \code{TRUE}, returns a
15#' list of \code{data.frame}s.
16#' @seealso \code{\link{regexp}} Section "Perl-like Regular Expressions" for a
17#' discussion of the supported options
18#' @examples
19#' string <- c("this is a", "test string")
20#' re_matches(string, rex("test")) # FALSE FALSE
21#'
22#' # named capture
23#' re_matches(string, rex(capture(alphas, name = "first_word"), space,
24#'   capture(alphas, name = "second_word")))
25#' #   first_word second_word
26#' # 1       this          is
27#' # 2       test      string
28#'
29#' # capture returns NA when it fails to match
30#' re_matches(string, rex(capture("test")))
31#' #      1
32#' # 1 test
33#' # 2 <NA>
34#' @aliases matches m
35#' @export re_matches matches m
36re_matches <- matches <- m <- function(data, pattern, global = FALSE, options = NULL, locations = FALSE, ...) {
37
38  pattern <- add_options(pattern, options)
39
40  process_matches <- function(match, string) {
41
42    if(no_capture(match)) {
43
44      # if no capture and no location just return if the regex matched
45      if(!locations) {
46        return(match != -1L)
47      }
48
49      # else return a data frame of the start and end locations
50      match[ match == -1L ] <- NA_integer_
51      starts <- match
52      attributes(starts) <- NULL
53
54      lengths <- attr(match, "match.length")
55      ends <- starts + lengths - 1L
56
57      return(data.frame(start = starts, end = ends))
58    }
59
60    # if a capture return a data frame with the capture results for each string
61    starts <- attr(match, "capture.start")
62    lengths <- attr(match, "capture.length")
63    ends <- starts + lengths - 1L
64
65    not_matched <- starts == -1L
66
67    strings <- substring(string, starts, ends)
68
69    strings[not_matched] <- NA_character_
70
71    res <- matrix(ncol = ncol(starts), strings)
72
73    nms <- auto_name(attr(match, "capture.names"))
74
75    if (!locations) {
76      colnames(res) <- nms
77      return(as.data.frame(res, stringsAsFactors = FALSE, check.names = FALSE))
78    }
79
80    starts[not_matched] <- NA_integer_
81
82    ends[not_matched] <- NA_integer_
83
84    indexes <- unlist(lapply(seq_len(ncol(res)), function(x) {
85        seq(x, by = ncol(res), length.out = 3)
86    }))
87
88    full <- data.frame(res, starts, ends, stringsAsFactors = FALSE, check.names = FALSE)[, indexes, drop = FALSE]
89    full_names <- unlist(Map(function(name) c(name, paste(sep=".", name, c("start", "end"))), nms, USE.NAMES=F))
90    colnames(full) <- full_names
91
92    full
93  }
94
95  if(global %==% TRUE) {
96    mapply(process_matches, gregexpr(pattern = pattern, data, perl = TRUE, ...), data, SIMPLIFY = FALSE)
97  }
98  else {
99    process_matches(regexpr(pattern = pattern, data, perl = TRUE, ...), data)
100  }
101}
102
103#' Substitute regular expressions in a string with another string.
104#'
105#' @param data character vector to substitute
106#' @param pattern regular expression to match
107#' @param replacement replacement text to use
108#' @param global substitute all occurrences
109#' @param options option flags
110#' @param ... options passed to sub or gsub
111#' @seealso \code{\link{regexp}} Section "Perl-like Regular Expressions" for a
112#' discussion of the supported options
113#' @examples
114#' string <- c("this is a Test", "string")
115#' re_substitutes(string, "test", "not a test", options = "insensitive")
116#' re_substitutes(string, "i", "x", global = TRUE)
117#' re_substitutes(string, "(test)", "not a \\1", options = "insensitive")
118#' @aliases substitutes s
119#' @export re_substitutes substitutes s
120re_substitutes <- substitutes <- s <- function(data, pattern, replacement, global = FALSE, options = NULL, ...) {
121  pattern <- add_options(pattern, options)
122  method <- if (isTRUE(global)) gsub else sub
123  method(x = data, pattern = pattern, replacement = replacement, perl = TRUE, ...)
124}
125
126add_options <- function(pattern, options) {
127  if (!is.null(options)) {
128    options <- match_args(options, names(option_map))
129    p("(?", p(option_map[options]), ")", pattern)
130  }
131  else {
132    pattern
133  }
134}
135
136match_args <- function(arg, choices) {
137  matches <- pmatch(arg, choices)
138  if (any(is.na(matches))) {
139    stop(gettextf("'arg' should be one of %s", paste(dQuote(choices),
140          collapse = ", ")), domain = NA)
141  }
142  choices[matches]
143}
144
145option_map <- c(
146  "insensitive" = "i",
147  "multi-line" = "m",
148  "single-line" = "s",
149  "extended" = "x",
150  "ungreedy" = "U"
151  )
152
153no_capture <- function(match) {
154  is.null(attr(match, "capture.start", exact = TRUE))
155}
156
157auto_name <- function(names) {
158  missing <- names == ""
159  if (all(!missing)) {
160    return(names)
161  }
162  names[missing] <- seq_along(names)[missing]
163  names
164}
165