1#' Extract Data From First Regular Expression Match Into a Data Frame
2#'
3#' @description
4#'
5#' Match a regular expression to a string, and return matches, match positions,
6#' and capture groups.  This function is like its
7#' \code{\link[=re_match]{match}} counterpart, except it returns match/capture
8#' group start and end positions in addition to the matched values.
9#'
10#' @section Tidy Data:
11#'
12#' The return value is a tidy data frame where each row
13#' corresponds to an element of the input character vector \code{text}.  The
14#' values from \code{text} appear for reference in the \code{.text} character
15#' column.  All other columns are list columns containing the match data.  The
16#' \code{.match} column contains the match information for full regular
17#' expression matches while other columns correspond to capture groups if there
18#' are any, and PCRE matches are enabled with \code{perl = TRUE} (this is on by
19#' default).  If capture groups are named the corresponding columns will bear
20#' those names.
21#'
22#' Each match data column list contains match records, one for each element in
23#' \code{text}.  A match record is a named list, with entries \code{match},
24#' \code{start} and \code{end} that are respectively the matching (sub) string,
25#' the start, and the end positions (using one based indexing).
26#'
27#' @section Extracting Match Data:
28#'
29#' To make it easier to extract matching substrings or positions, a special
30#' \code{$} operator is defined on match columns, both for the \code{.match}
31#' column and the columns corresponding to the capture groups.  See examples
32#' below.
33#'
34#' @inheritParams re_match_all
35#' @seealso \code{\link[base]{regexpr}}, which this function wraps
36#' @param x Object returned by \code{re_exec} or \code{re_exec_all}.
37#' @param name \code{match}, \code{start} or \code{end}.
38#' @return A tidy data frame (see Section \dQuote{Tidy Data}).  Match record
39#'   entries are one length vectors that are set to NA if there is no match.
40#' @family tidy regular expression matching
41#' @export
42#' @examples
43#' name_rex <- paste0(
44#'   "(?<first>[[:upper:]][[:lower:]]+) ",
45#'   "(?<last>[[:upper:]][[:lower:]]+)"
46#' )
47#' notables <- c(
48#'   "  Ben Franklin and Jefferson Davis",
49#'   "\tMillard Fillmore"
50#' )
51#' # Match first occurrence
52#' pos <- re_exec(notables, name_rex)
53#' pos
54#'
55#' # Custom $ to extract matches and positions
56#' pos$first$match
57#' pos$first$start
58#' pos$first$end
59
60re_exec <- function(text, pattern, perl=TRUE, ...) {
61
62  stopifnot(is.character(pattern), length(pattern) == 1, !is.na(pattern))
63  text <- as.character(text)
64
65  match <- regexpr(pattern, text, perl = perl, ...)
66
67  start  <- as.vector(match)
68  length <- attr(match, "match.length")
69  end    <- start + length - 1L
70
71  matchstr <- substring(text, start, end)
72  matchstr[ start == -1 ] <- NA_character_
73  end     [ start == -1 ] <- NA_integer_
74  start   [ start == -1 ] <- NA_integer_
75
76  names <- c("match", "start", "end")
77
78  matchlist <- new_rematch_records(
79    lapply(seq_along(text), function(i) {
80      structure(list(matchstr[i], start[i], end[i]), names = names)
81    })
82  )
83
84  res <- new_tibble(
85    list(text, matchlist),
86    names = c(".text", ".match"),
87    nrow = length(text)
88  )
89
90  if (!is.null(attr(match, "capture.start"))) {
91
92    gstart  <- unname(attr(match, "capture.start"))
93    glength <- unname(attr(match, "capture.length"))
94    gend    <- gstart + glength - 1L
95
96    groupstr <- substring(text, gstart, gend)
97    groupstr[ gstart == -1 ] <- NA_character_
98    gend    [ gstart == -1 ] <- NA_integer_
99    gstart  [ gstart == -1 ] <- NA_integer_
100    dim(groupstr) <- dim(gstart)
101
102    grouplists <- lapply(
103      seq_along(attr(match, "capture.names")),
104      function(g) {
105        new_rematch_records(
106          lapply(seq_along(text), function(i) {
107            structure(
108              list(groupstr[i, g], gstart[i, g], gend[i, g]),
109              names = names
110            )
111          })
112        )
113      }
114    )
115
116    res <- new_tibble(
117      c(grouplists, res),
118      names = c(attr(match, "capture.names"), ".text", ".match"),
119      nrow = length(res[[1]])
120    )
121  }
122
123  res
124}
125
126new_rematch_records <- function(x) {
127  structure(x, class = c("rematch_records", "list"))
128}
129