#' Find nodes that match an XPath expression.
#'
#' XPath is like regular expressions for trees - it's worth learning if
#' you're trying to extract nodes from arbitrary locations in a document.
#' Use `xml_find_all` to find all matches - if there's no match you'll
#' get an empty result. Use `xml_find_first` to find a specific match -
#' if there's no match you'll get an `xml_missing` node.
#'
#' @section Deprecated functions:
#' `xml_find_one()` has been deprecated. Instead use
#' `xml_find_first()`.
#'
#' @param xpath A string containing an xpath (1.0) expression.
#' @inheritParams xml_name
#' @param ... Further arguments passed to or from other methods.
#' @return `xml_find_all` returns a nodeset if applied to a node, and a nodeset
#'   or a list of nodesets if applied to a nodeset. If there are no matches,
#'   the nodeset(s) will be empty. Within each nodeset, the result will always
#'   be unique; repeated nodes are automatically de-duplicated.
#'
#'   `xml_find_first` returns a node if applied to a node, and a nodeset
#'   if applied to a nodeset. The output is *always* the same size as
#'   the input. If there are no matches, `xml_find_first` will return a
#'   missing node; if there are multiple matches, it will return the first
#'   only.
#'
#'   `xml_find_num`, `xml_find_chr`, `xml_find_lgl` return
#'   numeric, character and logical results respectively.
#' @export
#' @seealso [xml_ns_strip()] to remove the default namespaces
#' @examples
#' x <- read_xml("<foo><bar><baz/></bar><baz/></foo>")
#' xml_find_all(x, ".//baz")
#' xml_path(xml_find_all(x, ".//baz"))
#'
#' # Note the difference between .// and //
#' # //  finds anywhere in the document (ignoring the current node)
#' # .// finds anywhere beneath the current node
#' (bar <- xml_find_all(x, ".//bar"))
#' xml_find_all(bar, ".//baz")
#' xml_find_all(bar, "//baz")
#'
#' # Find all vs find first ---------------------------------------------------
#' x <- read_xml("<body>
#'   <p>Some <b>text</b>.</p>
#'   <p>Some <b>other</b> <b>text</b>.</p>
#'   <p>No bold here!</p>
#' </body>")
#' para <- xml_find_all(x, ".//p")
#'
#' # By default, if you apply xml_find_all to a nodeset, it finds all matches,
#' # de-duplicates them, and returns as a single nodeset. This means you
#' # never know how many results you'll get
#' xml_find_all(para, ".//b")
#'
#' # If you set flatten to FALSE, though, xml_find_all will return a list of
#' # nodesets, where each nodeset contains the matches for the corresponding
#' # node in the original nodeset.
#' xml_find_all(para, ".//b", flatten = FALSE)
#'
#' # xml_find_first only returns the first match per input node. If there are 0
#' # matches it will return a missing node
#' xml_find_first(para, ".//b")
#' xml_text(xml_find_first(para, ".//b"))
#'
#' # Namespaces ---------------------------------------------------------------
#' # If the document uses namespaces, you'll need to use xml_ns to form
#' # a unique mapping between full namespace url and a short prefix
#' x <- read_xml('
#'  <root xmlns:f = "http://foo.com" xmlns:g = "http://bar.com">
#'    <f:doc><g:baz /></f:doc>
#'    <f:doc><g:baz /></f:doc>
#'  </root>
#' ')
#' xml_find_all(x, ".//f:doc")
#' xml_find_all(x, ".//f:doc", xml_ns(x))
xml_find_all <- function(x, xpath, ns = xml_ns(x), ...) {
  UseMethod("xml_find_all")
}

# A missing node has nothing to search, so the result is an empty nodeset.
#' @export
xml_find_all.xml_missing <- function(x, xpath, ns = xml_ns(x), ...) {
  xml_nodeset()
}

# Single node: run the native XPath search and wrap the result in a nodeset.
# The final `Inf` argument presumably means "no cap on the number of results"
# (compare the `1` passed by xml_find_first) -- confirm against the native
# implementation of `xpath_search`.
#' @export
xml_find_all.xml_node <- function(x, xpath, ns = xml_ns(x), ...) {
  nodes <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf)
  xml_nodeset(nodes)
}

#' @param flatten A logical indicating whether to return a single, flattened
#'   nodeset or a list of nodesets.
#' @export
#' @rdname xml_find_all
xml_find_all.xml_nodeset <- function(x, xpath, ns = xml_ns(x), flatten = TRUE, ...) {
  if (length(x) == 0) {
    return(xml_nodeset())
  }

  # One raw search result per input node.
  res <- lapply(x, function(x) .Call(xpath_search, x$node, x$doc, xpath, ns, Inf))

  if (isTRUE(flatten)) {
    # Splice the per-node results together one level deep and build a single
    # nodeset from them.
    return(xml_nodeset(unlist(res, recursive = FALSE)))
  }

  # Not flattening: convert each per-node result into its own nodeset while
  # keeping the shape (and any names) of `res`.
  res[] <- lapply(res, xml_nodeset)
  res
}

#' @export
#' @rdname xml_find_all
xml_find_first <- function(x, xpath, ns = xml_ns(x)) {
  UseMethod("xml_find_first")
}

# Searching from a missing node yields a missing node.
#' @export
xml_find_first.xml_missing <- function(x, xpath, ns = xml_ns(x)) {
  xml_missing()
}

#' @export
xml_find_first.xml_node <- function(x, xpath, ns = xml_ns(x)) {
  # The final `1` presumably asks the native search for at most one result --
  # confirm against the native implementation of `xpath_search`.
  res <- .Call(xpath_search, x$node, x$doc, xpath, ns, 1)
  if (length(res) == 1) {
    # Exactly one element came back: unwrap it.
    res[[1]]
  } else {
    # Otherwise hand back whatever the native search returned, unchanged.
    res
  }
}

#' @export
xml_find_first.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) {
  if (length(x) == 0) {
    return(xml_nodeset())
  }

  # One result per input node; de-duplication is switched off so the output
  # stays the same length as the input.
  xml_nodeset(
    lapply(x, function(x) xml_find_first(x, xpath = xpath, ns = ns)),
    deduplicate = FALSE
  )
}


#' @export
#' @rdname xml_find_all
xml_find_num <- function(x, xpath, ns = xml_ns(x)) {
  UseMethod("xml_find_num")
}

#' @export
xml_find_num.xml_node <- function(x, xpath, ns = xml_ns(x)) {
  res <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf)
  # Reject expressions that did not evaluate to a numeric value.
  if (!is.numeric(res)) {
    stop("result of type: ", sQuote(class(res)), ", not numeric", call. = FALSE)
  }
  res
}

# NOTE(review): the xml_missing method below returns a length-0 vector while
# this method demands length-1 values from vapply(), so a nodeset that contains
# missing nodes would make vapply() error -- confirm whether that is intended.
#' @export
xml_find_num.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) {
  if (length(x) == 0) {
    return(numeric())
  }

  vapply(x, function(x) xml_find_num(x, xpath = xpath, ns = ns), numeric(1))
}

#' @export
xml_find_num.xml_missing <- function(x, xpath, ns = xml_ns(x)) {
  numeric(0)
}

#' @export
#' @rdname xml_find_all
xml_find_chr <- function(x, xpath, ns = xml_ns(x)) {
  UseMethod("xml_find_chr")
}

#' @export
xml_find_chr.xml_node <- function(x, xpath, ns = xml_ns(x)) {
  res <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf)
  # Reject expressions that did not evaluate to a character value.
  if (!is.character(res)) {
    stop("result of type: ", sQuote(class(res)), ", not character", call. = FALSE)
  }
  res
}

#' @export
xml_find_chr.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) {
  if (length(x) == 0) {
    return(character())
  }

  vapply(x, function(x) xml_find_chr(x, xpath = xpath, ns = ns), character(1))
}

#' @export
xml_find_chr.xml_missing <- function(x, xpath, ns = xml_ns(x)) {
  character(0)
}

#' @export
#' @rdname xml_find_all
xml_find_lgl <- function(x, xpath, ns = xml_ns(x)) {
  UseMethod("xml_find_lgl")
}

#' @export
xml_find_lgl.xml_node <- function(x, xpath, ns = xml_ns(x)) {
  res <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf)
  # Reject expressions that did not evaluate to a logical value.
  if (!is.logical(res)) {
    stop("result of type: ", sQuote(class(res)), ", not logical", call. = FALSE)
  }
  res
}

#' @export
xml_find_lgl.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) {
  if (length(x) == 0) {
    return(logical())
  }

  vapply(x, function(x) xml_find_lgl(x, xpath = xpath, ns = ns), logical(1))
}

#' @export
xml_find_lgl.xml_missing <- function(x, xpath, ns = xml_ns(x)) {
  logical(0)
}

# Deprecated functions ----------------------------------------------------

#' @rdname xml_find_all
#' @usage NULL
#' @export
xml_find_one <- function(x, xpath, ns = xml_ns(x)) {
  .Deprecated("xml_find_first")
  # Dispatch straight to the xml_find_first methods on the class of `x`.
  UseMethod("xml_find_first")
}