1#' Extract Data From First Regular Expression Match Into a Data Frame 2#' 3#' @description 4#' 5#' Match a regular expression to a string, and return matches, match positions, 6#' and capture groups. This function is like its 7#' \code{\link[=re_match]{match}} counterpart, except it returns match/capture 8#' group start and end positions in addition to the matched values. 9#' 10#' @section Tidy Data: 11#' 12#' The return value is a tidy data frame where each row 13#' corresponds to an element of the input character vector \code{text}. The 14#' values from \code{text} appear for reference in the \code{.text} character 15#' column. All other columns are list columns containing the match data. The 16#' \code{.match} column contains the match information for full regular 17#' expression matches while other columns correspond to capture groups if there 18#' are any, and PCRE matches are enabled with \code{perl = TRUE} (this is on by 19#' default). If capture groups are named the corresponding columns will bear 20#' those names. 21#' 22#' Each match data column list contains match records, one for each element in 23#' \code{text}. A match record is a named list, with entries \code{match}, 24#' \code{start} and \code{end} that are respectively the matching (sub) string, 25#' the start, and the end positions (using one based indexing). 26#' 27#' @section Extracting Match Data: 28#' 29#' To make it easier to extract matching substrings or positions, a special 30#' \code{$} operator is defined on match columns, both for the \code{.match} 31#' column and the columns corresponding to the capture groups. See examples 32#' below. 33#' 34#' @inheritParams re_match_all 35#' @seealso \code{\link[base]{regexpr}}, which this function wraps 36#' @param x Object returned by \code{re_exec} or \code{re_exec_all}. 37#' @param name \code{match}, \code{start} or \code{end}. 38#' @return A tidy data frame (see Section \dQuote{Tidy Data}). Match record 39#' entries are one length vectors that are set to NA if there is no match. 40#' @family tidy regular expression matching 41#' @export 42#' @examples 43#' name_rex <- paste0( 44#' "(?<first>[[:upper:]][[:lower:]]+) ", 45#' "(?<last>[[:upper:]][[:lower:]]+)" 46#' ) 47#' notables <- c( 48#' " Ben Franklin and Jefferson Davis", 49#' "\tMillard Fillmore" 50#' ) 51#' # Match first occurrence 52#' pos <- re_exec(notables, name_rex) 53#' pos 54#' 55#' # Custom $ to extract matches and positions 56#' pos$first$match 57#' pos$first$start 58#' pos$first$end 59 60re_exec <- function(text, pattern, perl=TRUE, ...) { 61 62 stopifnot(is.character(pattern), length(pattern) == 1, !is.na(pattern)) 63 text <- as.character(text) 64 65 match <- regexpr(pattern, text, perl = perl, ...) 66 67 start <- as.vector(match) 68 length <- attr(match, "match.length") 69 end <- start + length - 1L 70 71 matchstr <- substring(text, start, end) 72 matchstr[ start == -1 ] <- NA_character_ 73 end [ start == -1 ] <- NA_integer_ 74 start [ start == -1 ] <- NA_integer_ 75 76 names <- c("match", "start", "end") 77 78 matchlist <- new_rematch_records( 79 lapply(seq_along(text), function(i) { 80 structure(list(matchstr[i], start[i], end[i]), names = names) 81 }) 82 ) 83 84 res <- new_tibble( 85 list(text, matchlist), 86 names = c(".text", ".match"), 87 nrow = length(text) 88 ) 89 90 if (!is.null(attr(match, "capture.start"))) { 91 92 gstart <- unname(attr(match, "capture.start")) 93 glength <- unname(attr(match, "capture.length")) 94 gend <- gstart + glength - 1L 95 96 groupstr <- substring(text, gstart, gend) 97 groupstr[ gstart == -1 ] <- NA_character_ 98 gend [ gstart == -1 ] <- NA_integer_ 99 gstart [ gstart == -1 ] <- NA_integer_ 100 dim(groupstr) <- dim(gstart) 101 102 grouplists <- lapply( 103 seq_along(attr(match, "capture.names")), 104 function(g) { 105 new_rematch_records( 106 lapply(seq_along(text), function(i) { 107 structure( 108 list(groupstr[i, g], gstart[i, g], gend[i, g]), 109 names = names 110 ) 111 }) 112 ) 113 } 114 ) 115 116 res <- new_tibble( 117 c(grouplists, res), 118 names = c(attr(match, "capture.names"), ".text", ".match"), 119 nrow = length(res[[1]]) 120 ) 121 } 122 123 res 124} 125 126new_rematch_records <- function(x) { 127 structure(x, class = c("rematch_records", "list")) 128} 129