1#' Find nodes that match an xpath expression.
2#'
3#' Xpath is like regular expressions for trees - it's worth learning if
4#' you're trying to extract nodes from arbitrary locations in a document.
5#' Use `xml_find_all` to find all matches - if there's no match you'll
6#' get an empty result. Use `xml_find_first` to find a specific match -
7#' if there's no match you'll get an `xml_missing` node.
8#'
9#' @section Deprecated functions:
10#' `xml_find_one()` has been deprecated. Instead use
11#' `xml_find_first()`.
#'
13#' @param xpath A string containing an xpath (1.0) expression.
14#' @inheritParams xml_name
15#' @param ... Further arguments passed to or from other methods.
16#' @return `xml_find_all` returns a nodeset if applied to a node, and a nodeset
17#'   or a list of nodesets if applied to a nodeset. If there are no matches,
18#'   the nodeset(s) will be empty. Within each nodeset, the result will always
19#'   be unique; repeated nodes are automatically de-duplicated.
20#'
21#'   `xml_find_first` returns a node if applied to a node, and a nodeset
22#'   if applied to a nodeset. The output is *always* the same size as
23#'   the input. If there are no matches, `xml_find_first` will return a
24#'   missing node; if there are multiple matches, it will return the first
25#'   only.
26#'
27#'   `xml_find_num`, `xml_find_chr`, `xml_find_lgl` return
28#'   numeric, character and logical results respectively.
29#' @export
30#' @seealso [xml_ns_strip()] to remove the default namespaces
31#' @examples
32#' x <- read_xml("<foo><bar><baz/></bar><baz/></foo>")
33#' xml_find_all(x, ".//baz")
34#' xml_path(xml_find_all(x, ".//baz"))
35#'
36#' # Note the difference between .// and //
37#' # //  finds anywhere in the document (ignoring the current node)
38#' # .// finds anywhere beneath the current node
39#' (bar <- xml_find_all(x, ".//bar"))
40#' xml_find_all(bar, ".//baz")
41#' xml_find_all(bar, "//baz")
42#'
43#' # Find all vs find one -----------------------------------------------------
44#' x <- read_xml("<body>
45#'   <p>Some <b>text</b>.</p>
46#'   <p>Some <b>other</b> <b>text</b>.</p>
47#'   <p>No bold here!</p>
48#' </body>")
49#' para <- xml_find_all(x, ".//p")
50#'
51#' # By default, if you apply xml_find_all to a nodeset, it finds all matches,
52#' # de-duplicates them, and returns as a single nodeset. This means you
53#' # never know how many results you'll get
54#' xml_find_all(para, ".//b")
55#'
56#' # If you set flatten to FALSE, though, xml_find_all will return a list of
57#' # nodesets, where each nodeset contains the matches for the corresponding
58#' # node in the original nodeset.
59#' xml_find_all(para, ".//b", flatten = FALSE)
60#'
61#' # xml_find_first only returns the first match per input node. If there are 0
62#' # matches it will return a missing node
63#' xml_find_first(para, ".//b")
64#' xml_text(xml_find_first(para, ".//b"))
65#'
66#' # Namespaces ---------------------------------------------------------------
#' # If the document uses namespaces, you'll need to use xml_ns to form
68#' # a unique mapping between full namespace url and a short prefix
69#' x <- read_xml('
70#'  <root xmlns:f = "http://foo.com" xmlns:g = "http://bar.com">
71#'    <f:doc><g:baz /></f:doc>
72#'    <f:doc><g:baz /></f:doc>
73#'  </root>
74#' ')
75#' xml_find_all(x, ".//f:doc")
76#' xml_find_all(x, ".//f:doc", xml_ns(x))
xml_find_all <- function(x, xpath, ns = xml_ns(x), ...) {
  # S3 generic: the method is selected from the class of `x`.
  UseMethod("xml_find_all", x)
}
80
81#' @export
xml_find_all.xml_missing <- function(x, xpath, ns = xml_ns(x), ...) {
  # Method for a missing node: no search is run and `xpath`/`ns` are not
  # used; the result is whatever `xml_nodeset()` builds with no arguments
  # (presumably an empty nodeset).
  xml_nodeset()
}
85
86#' @export
xml_find_all.xml_node <- function(x, xpath, ns = xml_ns(x), ...) {
  # Run the native xpath search from this node and wrap the raw result in a
  # nodeset. The trailing `Inf` is presumably the cap on the number of
  # matches (the find-first method passes 1 in the same position).
  xml_nodeset(.Call(xpath_search, x$node, x$doc, xpath, ns, Inf))
}
91
92#' @param flatten A logical indicating whether to return a single, flattened
93#'   nodeset or a list of nodesets.
94#' @export
95#' @rdname xml_find_all
xml_find_all.xml_nodeset <- function(x, xpath, ns = xml_ns(x), flatten = TRUE, ...) {
  # Nothing to search from: short-circuit with an argument-less nodeset.
  if (length(x) == 0) {
    return(xml_nodeset())
  }

  # One raw search result per input node, in input order.
  search_from <- function(node) {
    .Call(xpath_search, node$node, node$doc, xpath, ns, Inf)
  }
  matches <- lapply(x, search_from)

  if (isTRUE(flatten)) {
    # Collapse exactly one level so all matches end up in a single nodeset.
    xml_nodeset(unlist(matches, recursive = FALSE))
  } else {
    # Keep one nodeset per input node; `[]<-` preserves the list's attributes.
    matches[] <- lapply(matches, xml_nodeset)
    matches
  }
}
109
110#' @export
111#' @rdname xml_find_all
xml_find_first <- function(x, xpath, ns = xml_ns(x)) {
  # S3 generic: the method is selected from the class of `x`.
  UseMethod("xml_find_first", x)
}
115
#' @export
xml_find_first.xml_missing <- function(x, xpath, ns = xml_ns(x)) {
  # Method for a missing node: no search is run and `xpath`/`ns` are not
  # used; the result is whatever `xml_missing()` constructs.
  # The `@export` tag above registers this S3 method in NAMESPACE, matching
  # every other method in this file.
  xml_missing()
}
119
120#' @export
xml_find_first.xml_node <- function(x, xpath, ns = xml_ns(x)) {
  # The trailing `1` is presumably the cap on the number of matches (the
  # find-all methods pass `Inf` in the same position).
  found <- .Call(xpath_search, x$node, x$doc, xpath, ns, 1)

  # Anything other than a length-one result is handed back untouched.
  if (length(found) != 1) {
    return(found)
  }

  # A single hit is unwrapped from its container.
  found[[1]]
}
129
130#' @export
xml_find_first.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) {
  # Nothing to search from: short-circuit with an argument-less nodeset.
  if (length(x) == 0) {
    return(xml_nodeset())
  }

  # Re-dispatch on each element so every input node yields one result.
  first_match <- function(node) {
    xml_find_first(node, xpath = xpath, ns = ns)
  }

  # De-duplication is switched off; presumably this keeps the output aligned
  # one-to-one with the input nodes.
  xml_nodeset(lapply(x, first_match), deduplicate = FALSE)
}
138
139
140#' @export
141#' @rdname xml_find_all
xml_find_num <- function(x, xpath, ns = xml_ns(x)) {
  # S3 generic: the method is selected from the class of `x`.
  UseMethod("xml_find_num", x)
}
145
146#' @export
xml_find_num.xml_node <- function(x, xpath, ns = xml_ns(x)) {
  value <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf)

  # Accept the result only when the expression evaluated to a number.
  if (is.numeric(value)) {
    return(value)
  }

  stop("result of type: ", sQuote(class(value)), ", not numeric", call. = FALSE)
}
154
155#' @export
xml_find_num.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) {
  # An empty nodeset yields an empty numeric vector.
  if (length(x) == 0) {
    return(numeric())
  }

  # Evaluate per node; vapply() enforces exactly one number for each.
  eval_node <- function(node) {
    xml_find_num(node, xpath = xpath, ns = ns)
  }
  vapply(x, eval_node, numeric(1))
}
162
163#' @export
xml_find_num.xml_missing <- function(x, xpath, ns = xml_ns(x)) {
  # A missing node has nothing to evaluate: zero-length numeric result.
  numeric()
}
167
168#' @export
169#' @rdname xml_find_all
xml_find_chr <- function(x, xpath, ns = xml_ns(x)) {
  # S3 generic: the method is selected from the class of `x`.
  UseMethod("xml_find_chr", x)
}
173
174#' @export
xml_find_chr.xml_node <- function(x, xpath, ns = xml_ns(x)) {
  value <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf)

  # Accept the result only when the expression evaluated to a string.
  if (is.character(value)) {
    return(value)
  }

  stop("result of type: ", sQuote(class(value)), ", not character", call. = FALSE)
}
182
183#' @export
xml_find_chr.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) {
  # An empty nodeset yields an empty character vector.
  if (length(x) == 0) {
    return(character())
  }

  # Evaluate per node; vapply() enforces exactly one string for each.
  eval_node <- function(node) {
    xml_find_chr(node, xpath = xpath, ns = ns)
  }
  vapply(x, eval_node, character(1))
}
190
191#' @export
xml_find_chr.xml_missing <- function(x, xpath, ns = xml_ns(x)) {
  # A missing node has nothing to evaluate: zero-length character result.
  character()
}
195
196#' @export
197#' @rdname xml_find_all
xml_find_lgl <- function(x, xpath, ns = xml_ns(x)) {
  # S3 generic: the method is selected from the class of `x`.
  UseMethod("xml_find_lgl", x)
}
201
202#' @export
xml_find_lgl.xml_node <- function(x, xpath, ns = xml_ns(x)) {
  value <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf)

  # Accept the result only when the expression evaluated to a boolean.
  if (is.logical(value)) {
    return(value)
  }

  stop("result of type: ", sQuote(class(value)), ", not logical", call. = FALSE)
}
210
211#' @export
xml_find_lgl.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) {
  # An empty nodeset yields an empty logical vector.
  if (length(x) == 0) {
    return(logical())
  }

  # Evaluate per node; vapply() enforces exactly one logical for each.
  eval_node <- function(node) {
    xml_find_lgl(node, xpath = xpath, ns = ns)
  }
  vapply(x, eval_node, logical(1))
}
218
219#' @export
xml_find_lgl.xml_missing <- function(x, xpath, ns = xml_ns(x)) {
  # A missing node has nothing to evaluate: zero-length logical result.
  logical()
}
223
224# Deprecated functions ----------------------------------------------------
225
226#' @rdname xml_find_all
227#' @usage NULL
228#' @export
xml_find_one <- function(x, xpath, ns = xml_ns(x)) {
  # Warn that this entry point is deprecated in favour of xml_find_first(),
  # then hand the call to the xml_find_first methods for the class of `x`.
  .Deprecated(new = "xml_find_first")
  UseMethod("xml_find_first", x)
}
233