1#' Extract Data From All Regular Expression Matches Into a Data Frame
2#'
3#' @inherit re_exec
4#'
5#' @description
6#'
7#' Match a regular expression to a string, and return matches, match positions,
8#' and capture groups.  This function is like its
9#' \code{\link[=re_match_all]{match}} counterpart, except it returns
10#' match/capture group start and end positions in addition to the matched
11#' values.
12#'
13#' @seealso \code{\link[base]{gregexpr}}, which this function wraps
14#' @return A tidy data frame (see Section \dQuote{Tidy Data}).  The entries
15#'   within the match records within the list columns will be one vectors
16#'   as long as there are matches for the corresponding text element.
17#' @family tidy regular expression matching
18#' @export
19#' @examples
20#' name_rex <- paste0(
21#'   "(?<first>[[:upper:]][[:lower:]]+) ",
22#'   "(?<last>[[:upper:]][[:lower:]]+)"
23#' )
24#' notables <- c(
25#'   "  Ben Franklin and Jefferson Davis",
26#'   "\tMillard Fillmore"
27#' )
28#' # All occurrences
29#' allpos <- re_exec_all(notables, name_rex)
30#' allpos
31#'
32#' # Custom $ to extract matches and positions
33#' allpos$first$match
34#' allpos$first$start
35#' allpos$first$end
36re_exec_all <- function(text, pattern, perl = TRUE, ...) {
37
38  text <- as.character(text)
39  stopifnot(is.character(pattern), length(pattern) == 1, !is.na(pattern))
40
41  if (length(text) == 0) {
42    res <- empty_result(text, pattern, perl = perl, ...)
43    for (i in seq_along(res)) {
44      if (is.list(res[[i]])) {
45        res[[i]] <- new_rematch_allrecords(res[[i]])
46      }
47    }
48    return(res)
49  }
50
51  match <- gregexpr(pattern, text, perl = perl, ...)
52
53  rec_names <- c("match", "start", "end")
54  colnames <- c(attr(match[[1]], "capture.names"), ".match")
55  num_groups <- length(colnames) - 1L
56  non_rec <- structure(
57    list(character(0), integer(0), integer(0)),
58    names = rec_names
59  )
60
61  ## Non-matching strings have a rather strange special form,
62  ## so we just treat them differently
63  non <- vapply(match, function(m) m[1] == -1, TRUE)
64  yes <- !non
65  res <- replicate(length(text), list(), simplify = FALSE)
66  if (any(non)) {
67    res[non] <- list(replicate(num_groups + 1, non_rec, simplify = FALSE))
68  }
69  if (any(yes)) {
70    res[yes] <- mapply(exec1, text[yes], match[yes], SIMPLIFY = FALSE)
71  }
72
73  res <- lapply(seq_along(res[[1]]), function(i) {
74    new_rematch_allrecords(lapply(res, "[[", i))
75  })
76
77  res <- structure(
78    res,
79    names = colnames,
80    row.names = seq_along(text),
81    class = c("tbl_df", "tbl", "data.frame")
82  )
83
84  res$.text <- text
85  nc <- ncol(res)
86  res[, c(seq_len(nc - 2), nc, nc - 1)]
87}
88
89exec1 <- function(text1, match1) {
90
91  start    <- as.vector(match1)
92  length   <- attr(match1, "match.length")
93  end      <- start + length - 1L
94  matchstr <- substring(text1, start, end)
95  matchrec <- list(match = matchstr, start = start, end = end)
96  colnames <- c(attr(match1, "capture.names"), ".match")
97
98  ## substring fails if the index is length zero,
99  ## need to handle special case
100  res <- if (is.null(attr(match1, "capture.start"))) {
101    replicate(length(colnames), matchrec, simplify = FALSE)
102
103  } else {
104    gstart  <- unname(attr(match1, "capture.start"))
105    glength <- unname(attr(match1, "capture.length"))
106    gend    <- gstart + glength - 1L
107
108    groupstr <- substring(text1, gstart, gend)
109    dim(groupstr) <- dim(gstart)
110
111    c(
112      lapply(
113        seq_len(ncol(groupstr)),
114        function(i) {
115          list(match = groupstr[, i], start = gstart[, i], end = gend[, i])
116        }
117      ),
118      list(.match = matchrec)
119    )
120  }
121
122  res
123}
124
125new_rematch_allrecords <- function(x) {
126  structure(x, class = c("rematch_allrecords", "list"))
127}
128