#' Select elements from an HTML document
#'
#' `html_element()` and `html_elements()` find HTML elements using CSS selectors
#' or XPath expressions. CSS selectors are particularly useful in conjunction
#' with <https://selectorgadget.com/>, which makes it very easy to discover the
#' selector you need.
#'
#' @section CSS selector support:
#'
#' CSS selectors are translated to XPath selectors by the \pkg{selectr}
#' package, which is a port of the python \pkg{cssselect} library,
#' <https://pythonhosted.org/cssselect/>.
#'
#' It implements the majority of CSS3 selectors, as described in
#' <http://www.w3.org/TR/2011/REC-css3-selectors-20110929/>. The
#' exceptions are listed below:
#'
#' * Pseudo selectors that require interactivity are ignored:
#'   `:hover`, `:active`, `:focus`, `:target`, `:visited`.
#' * The following pseudo classes don't work with the wild card element, *:
#'   `*:first-of-type`, `*:last-of-type`, `*:nth-of-type`,
#'   `*:nth-last-of-type`, `*:only-of-type`
#' * It supports `:contains(text)`
#' * You can use !=, `[foo!=bar]` is the same as `:not([foo=bar])`
#' * `:not()` accepts a sequence of simple selectors, not just a single
#'   simple selector.
#'
#' @param x Either a document, a node set or a single node.
#' @param css,xpath Elements to select. Supply one of `css` or `xpath`
#'   depending on whether you want to use a CSS selector or XPath 1.0
#'   expression. Whichever one is supplied must be a single string.
#' @returns `html_element()` returns a nodeset the same length as the input.
#'   `html_elements()` flattens the output so there's no direct way to map
#'   the output to the input.
#' @export
#' @examples
#' html <- minimal_html("
#'   <h1>This is a heading</h1>
#'   <p id='first'>This is a paragraph</p>
#'   <p class='important'>This is an important paragraph</p>
#' ")
#'
#' html %>% html_element("h1")
#' html %>% html_elements("p")
#' html %>% html_elements(".important")
#' html %>% html_elements("#first")
#'
#' # html_element() vs html_elements() --------------------------------------
#' html <- minimal_html("
#'   <ul>
#'     <li><b>C-3PO</b> is a <i>droid</i> that weighs <span class='weight'>167 kg</span></li>
#'     <li><b>R2-D2</b> is a <i>droid</i> that weighs <span class='weight'>96 kg</span></li>
#'     <li><b>Yoda</b> weighs <span class='weight'>66 kg</span></li>
#'     <li><b>R4-P17</b> is a <i>droid</i></li>
#'   </ul>
#' ")
#' li <- html %>% html_elements("li")
#'
#' # When applied to a node set, html_elements() returns all matching elements
#' # beneath any of the inputs, flattening results into a new node set.
#' li %>% html_elements("i")
#'
#' # When applied to a node set, html_element() always returns a vector the
#' # same length as the input, using a "missing" element where needed.
#' li %>% html_element("i")
#' # and html_text() and html_attr() will return NA
#' li %>% html_element("i") %>% html_text2()
#' li %>% html_element("span") %>% html_attr("class")
html_element <- function(x, css, xpath) {
  UseMethod("html_element")
}

#' @export
#' @rdname html_element
html_elements <- function(x, css, xpath) {
  UseMethod("html_elements")
}

# Default method: hands the validated selector to xml2::xml_find_all().
#' @export
html_elements.default <- function(x, css, xpath) {
  xml2::xml_find_all(x, make_selector(css, xpath))
}

# Default method: hands the validated selector to xml2::xml_find_first().
#' @export
html_element.default <- function(x, css, xpath) {
  xml2::xml_find_first(x, make_selector(css, xpath))
}

# Turn the user-supplied `css` / `xpath` pair into a single XPath expression.
#
# Exactly one of the two arguments must be supplied, and it must be a single
# non-missing string. A `css` selector is translated with
# selectr::css_to_xpath() using the prefix ".//"; an `xpath` expression is
# returned unchanged.
#
# Errors (all raised with `call. = FALSE`):
#   * neither `css` nor `xpath` supplied
#   * both `css` and `xpath` supplied
#   * the supplied argument is not a length-1, non-NA character vector
make_selector <- function(css, xpath) {
  # missing() must be evaluated here, on the formals themselves; it also
  # reports TRUE when the calling method forwarded an argument that it did
  # not receive.
  has_css <- !missing(css)
  has_xpath <- !missing(xpath)

  if (!has_css && !has_xpath) {
    stop("Please supply one of css or xpath", call. = FALSE)
  }
  if (has_css && has_xpath) {
    stop("Please supply css or xpath, not both", call. = FALSE)
  }

  if (has_css) {
    # Reject anything that is not exactly one string. The `||` chain
    # short-circuits, so is.na() only ever sees a length-1 character vector.
    if (!is.character(css) || length(css) != 1 || is.na(css)) {
      stop("`css` must be a string", call. = FALSE)
    }

    selectr::css_to_xpath(css, prefix = ".//")
  } else {
    if (!is.character(xpath) || length(xpath) != 1 || is.na(xpath)) {
      stop("`xpath` must be a string", call. = FALSE)
    }

    xpath
  }
}