1#' Extract Data From All Regular Expression Matches Into a Data Frame 2#' 3#' @inherit re_exec 4#' 5#' @description 6#' 7#' Match a regular expression to a string, and return matches, match positions, 8#' and capture groups. This function is like its 9#' \code{\link[=re_match_all]{match}} counterpart, except it returns 10#' match/capture group start and end positions in addition to the matched 11#' values. 12#' 13#' @seealso \code{\link[base]{gregexpr}}, which this function wraps 14#' @return A tidy data frame (see Section \dQuote{Tidy Data}). The entries 15#' within the match records within the list columns will be one vectors 16#' as long as there are matches for the corresponding text element. 17#' @family tidy regular expression matching 18#' @export 19#' @examples 20#' name_rex <- paste0( 21#' "(?<first>[[:upper:]][[:lower:]]+) ", 22#' "(?<last>[[:upper:]][[:lower:]]+)" 23#' ) 24#' notables <- c( 25#' " Ben Franklin and Jefferson Davis", 26#' "\tMillard Fillmore" 27#' ) 28#' # All occurrences 29#' allpos <- re_exec_all(notables, name_rex) 30#' allpos 31#' 32#' # Custom $ to extract matches and positions 33#' allpos$first$match 34#' allpos$first$start 35#' allpos$first$end 36re_exec_all <- function(text, pattern, perl = TRUE, ...) { 37 38 text <- as.character(text) 39 stopifnot(is.character(pattern), length(pattern) == 1, !is.na(pattern)) 40 41 if (length(text) == 0) { 42 res <- empty_result(text, pattern, perl = perl, ...) 43 for (i in seq_along(res)) { 44 if (is.list(res[[i]])) { 45 res[[i]] <- new_rematch_allrecords(res[[i]]) 46 } 47 } 48 return(res) 49 } 50 51 match <- gregexpr(pattern, text, perl = perl, ...) 52 53 rec_names <- c("match", "start", "end") 54 colnames <- c(attr(match[[1]], "capture.names"), ".match") 55 num_groups <- length(colnames) - 1L 56 non_rec <- structure( 57 list(character(0), integer(0), integer(0)), 58 names = rec_names 59 ) 60 61 ## Non-matching strings have a rather strange special form, 62 ## so we just treat them differently 63 non <- vapply(match, function(m) m[1] == -1, TRUE) 64 yes <- !non 65 res <- replicate(length(text), list(), simplify = FALSE) 66 if (any(non)) { 67 res[non] <- list(replicate(num_groups + 1, non_rec, simplify = FALSE)) 68 } 69 if (any(yes)) { 70 res[yes] <- mapply(exec1, text[yes], match[yes], SIMPLIFY = FALSE) 71 } 72 73 res <- lapply(seq_along(res[[1]]), function(i) { 74 new_rematch_allrecords(lapply(res, "[[", i)) 75 }) 76 77 res <- structure( 78 res, 79 names = colnames, 80 row.names = seq_along(text), 81 class = c("tbl_df", "tbl", "data.frame") 82 ) 83 84 res$.text <- text 85 nc <- ncol(res) 86 res[, c(seq_len(nc - 2), nc, nc - 1)] 87} 88 89exec1 <- function(text1, match1) { 90 91 start <- as.vector(match1) 92 length <- attr(match1, "match.length") 93 end <- start + length - 1L 94 matchstr <- substring(text1, start, end) 95 matchrec <- list(match = matchstr, start = start, end = end) 96 colnames <- c(attr(match1, "capture.names"), ".match") 97 98 ## substring fails if the index is length zero, 99 ## need to handle special case 100 res <- if (is.null(attr(match1, "capture.start"))) { 101 replicate(length(colnames), matchrec, simplify = FALSE) 102 103 } else { 104 gstart <- unname(attr(match1, "capture.start")) 105 glength <- unname(attr(match1, "capture.length")) 106 gend <- gstart + glength - 1L 107 108 groupstr <- substring(text1, gstart, gend) 109 dim(groupstr) <- dim(gstart) 110 111 c( 112 lapply( 113 seq_len(ncol(groupstr)), 114 function(i) { 115 list(match = groupstr[, i], start = gstart[, i], end = gend[, i]) 116 } 117 ), 118 list(.match = matchrec) 119 ) 120 } 121 122 res 123} 124 125new_rematch_allrecords <- function(x) { 126 structure(x, class = c("rematch_allrecords", "list")) 127} 128