1#' Select elements from an HTML document
2#'
#' `html_element()` and `html_elements()` find HTML elements using CSS selectors
4#' or XPath expressions. CSS selectors are particularly useful in conjunction
5#' with <https://selectorgadget.com/>, which makes it very easy to discover the
6#' selector you need.
7#'
8#' @section CSS selector support:
9#'
10#' CSS selectors are translated to XPath selectors by the \pkg{selectr}
11#' package, which is a port of the python \pkg{cssselect} library,
12#' <https://pythonhosted.org/cssselect/>.
13#'
14#' It implements the majority of CSS3 selectors, as described in
15#' <http://www.w3.org/TR/2011/REC-css3-selectors-20110929/>. The
16#' exceptions are listed below:
17#'
18#' * Pseudo selectors that require interactivity are ignored:
19#'   `:hover`, `:active`, `:focus`, `:target`, `:visited`.
#' * The following pseudo classes don't work with the wildcard element, `*`:
21#'   `*:first-of-type`, `*:last-of-type`, `*:nth-of-type`,
22#'   `*:nth-last-of-type`, `*:only-of-type`
23#' * It supports `:contains(text)`
#' * You can use `!=`: `[foo!=bar]` is the same as `:not([foo=bar])`.
25#' * `:not()` accepts a sequence of simple selectors, not just a single
26#'   simple selector.
27#'
28#' @param x Either a document, a node set or a single node.
29#' @param css,xpath Elements to select. Supply one of `css` or `xpath`
30#'   depending on whether you want to use a CSS selector or XPath 1.0
31#'   expression.
32#' @returns `html_element()` returns a nodeset the same length as the input.
33#'   `html_elements()` flattens the output so there's no direct way to map
34#'   the output to the input.
35#' @export
36#' @examples
37#' html <- minimal_html("
38#'   <h1>This is a heading</h1>
39#'   <p id='first'>This is a paragraph</p>
40#'   <p class='important'>This is an important paragraph</p>
41#' ")
42#'
43#' html %>% html_element("h1")
44#' html %>% html_elements("p")
45#' html %>% html_elements(".important")
46#' html %>% html_elements("#first")
47#'
48#' # html_element() vs html_elements() --------------------------------------
49#' html <- minimal_html("
50#'   <ul>
51#'     <li><b>C-3PO</b> is a <i>droid</i> that weighs <span class='weight'>167 kg</span></li>
52#'     <li><b>R2-D2</b> is a <i>droid</i> that weighs <span class='weight'>96 kg</span></li>
53#'     <li><b>Yoda</b> weighs <span class='weight'>66 kg</span></li>
54#'     <li><b>R4-P17</b> is a <i>droid</i></li>
55#'   </ul>
56#' ")
57#' li <- html %>% html_elements("li")
58#'
59#' # When applied to a node set, html_elements() returns all matching elements
60#' # beneath any of the inputs, flattening results into a new node set.
61#' li %>% html_elements("i")
62#'
63#' # When applied to a node set, html_element() always returns a vector the
64#' # same length as the input, using a "missing" element where needed.
65#' li %>% html_element("i")
66#' # and html_text() and html_attr() will return NA
67#' li %>% html_element("i") %>% html_text2()
68#' li %>% html_element("span") %>% html_attr("class")
html_element <- function(x, css, xpath) {
  # S3 generic: the method is chosen from the class of `x`. `css` and `xpath`
  # are passed through untouched to whichever method is selected.
  UseMethod("html_element", x)
}
72
#' @export
#' @rdname html_element
html_elements <- function(x, css, xpath) {
  # S3 generic: the method is chosen from the class of `x`. `css` and `xpath`
  # are passed through untouched to whichever method is selected.
  UseMethod("html_elements", x)
}
78
#' @export
html_elements.default <- function(x, css, xpath) {
  # `make_selector()` turns whichever of `css`/`xpath` was supplied into a
  # single XPath expression; xml2 then finds all nodes matching it under `x`.
  xml2::xml_find_all(x, xpath = make_selector(css, xpath))
}
83
#' @export
html_element.default <- function(x, css, xpath) {
  # `make_selector()` turns whichever of `css`/`xpath` was supplied into a
  # single XPath expression; xml2 then finds the first matching node.
  xml2::xml_find_first(x, xpath = make_selector(css, xpath))
}
88
# Build the XPath expression used to query a document.
#
# Exactly one of `css` or `xpath` must be supplied, and it must be a single
# non-missing string.
#
# * `css`: a CSS selector; translated to XPath with `selectr::css_to_xpath()`
#   using the relative prefix ".//".
# * `xpath`: an XPath expression; returned unchanged.
#
# Returns a length-one character vector holding an XPath expression.
# Errors if neither or both arguments are supplied, or if the supplied
# argument is not a single string.
make_selector <- function(css, xpath) {
  has_css <- !missing(css)
  has_xpath <- !missing(xpath)

  if (!has_css && !has_xpath) {
    stop("Please supply one of css or xpath", call. = FALSE)
  }
  if (has_css && has_xpath) {
    stop("Please supply css or xpath, not both", call. = FALSE)
  }

  # A "string" is a character vector of length one that is not NA. The type
  # and length tests are combined *before* negation: the previous form,
  # `!is.character(x) && length(x) == 1`, only rejected non-character scalars
  # and silently accepted e.g. `c("a", "b")` or `NULL`.
  is_string <- function(value) {
    is.character(value) && length(value) == 1L && !is.na(value)
  }

  if (has_css) {
    if (!is_string(css)) {
      stop("`css` must be a string", call. = FALSE)
    }

    selectr::css_to_xpath(css, prefix = ".//")
  } else {
    if (!is_string(xpath)) {
      stop("`xpath` must be a string", call. = FALSE)
    }

    xpath
  }
}
107
108