1#' Parse forms and set values 2#' 3#' Use `html_form()` to extract a form, set values with `html_form_set()`, 4#' and submit it with `html_form_submit()`. 5#' 6#' @export 7#' @inheritParams html_name 8#' @param base_url Base url of underlying HTML document. The default, `NULL`, 9#' uses the url of the HTML document underlying `x`. 10#' @seealso HTML 4.01 form specification: 11#' <http://www.w3.org/TR/html401/interact/forms.html> 12#' @return 13#' * `html_form()` returns as S3 object with class `rvest_form` when applied 14#' to a single element. It returns a list of `rvest_form` objects when 15#' applied to multiple elements or a document. 16#' 17#' * `html_form_set()` returns an `rvest_form` object. 18#' 19#' * `html_form_submit()` submits the form, returning an httr response which 20#' can be parsed with [read_html()]. 21#' @examples 22#' html <- read_html("http://www.google.com") 23#' search <- html_form(html)[[1]] 24#' 25#' search <- search %>% html_form_set(q = "My little pony", hl = "fr") 26#' 27#' # Or if you have a list of values, use !!! 28#' vals <- list(q = "web scraping", hl = "en") 29#' search <- search %>% html_form_set(!!!vals) 30#' 31#' # To submit and get result: 32#' \dontrun{ 33#' resp <- html_form_submit(search) 34#' read_html(resp) 35#' } 36html_form <- function(x, base_url = NULL) UseMethod("html_form") 37 38#' @export 39html_form.xml_document <- function(x, base_url = NULL) { 40 html_form(xml2::xml_find_all(x, ".//form"), base_url = base_url) 41} 42 43#' @export 44html_form.xml_nodeset <- function(x, base_url = NULL) { 45 lapply(x, html_form, base_url = base_url) 46} 47 48#' @export 49html_form.xml_node <- function(x, base_url = NULL) { 50 stopifnot(xml2::xml_name(x) == "form") 51 52 attr <- as.list(xml2::xml_attrs(x)) 53 name <- attr$id %||% attr$name %||% "<unnamed>" # for human readers 54 method <- toupper(attr$method %||% "GET") 55 enctype <- convert_enctype(attr$enctype) 56 57 nodes <- html_elements(x, "input, select, textarea, button") 58 fields <- lapply(nodes, function(x) { 59 switch(xml2::xml_name(x), 60 textarea = parse_textarea(x), 61 input = parse_input(x), 62 select = parse_select(x), 63 button = parse_button(x) 64 ) 65 }) 66 names(fields) <- map_chr(fields, function(x) x$name %||% "") 67 68 structure( 69 list( 70 name = name, 71 method = method, 72 action = xml2::url_absolute(attr$action, base_url %||% xml2::xml_url(x)), 73 enctype = enctype, 74 fields = fields 75 ), 76 class = "rvest_form") 77} 78 79#' @export 80print.rvest_form <- function(x, ...) { 81 cat("<form> '", x$name, "' (", x$method, " ", x$action, ")\n", sep = "") 82 cat(format_list(x$fields, indent = 1), "\n", sep = "") 83} 84 85 86# set ---------------------------------------------------------------- 87 88#' @rdname html_form 89#' @param form A form 90#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name-value pairs giving 91#' fields to modify. 92#' 93#' Provide a character vector to set multiple checkboxes in a set or 94#' select multiple values from a multi-select. 95#' @export 96html_form_set <- function(form, ...) { 97 check_form(form) 98 99 new_values <- list2(...) 100 check_fields(form, new_values) 101 102 for (field in names(new_values)) { 103 type <- form$fields[[field]]$type %||% "non-input" 104 if (type == "hidden") { 105 warn(paste0("Setting value of hidden field '", field, "'.")) 106 } else if (type == "submit") { 107 abort(paste0("Can't change value of input with type submit: '", field, "'.")) 108 } 109 110 form$fields[[field]]$value <- new_values[[field]] 111 } 112 113 form 114} 115 116# submit ------------------------------------------------------------------ 117 118#' @rdname html_form 119#' @param submit Which button should be used to submit the form? 120#' * `NULL`, the default, uses the first button. 121#' * A string selects a button by its name. 122#' * A number selects a button using its relative position. 123#' @export 124html_form_submit <- function(form, submit = NULL) { 125 check_form(form) 126 127 subm <- submission_build(form, submit) 128 submission_submit(subm) 129} 130 131submission_build <- function(form, submit) { 132 method <- form$method 133 if (!(method %in% c("POST", "GET"))) { 134 warn(paste0("Invalid method (", method, "), defaulting to GET")) 135 method <- "GET" 136 } 137 138 if (length(form$action) == 0) { 139 abort("`form` doesn't contain a `action` attribute") 140 } 141 142 list( 143 method = method, 144 enctype = form$enctype, 145 action = form$action, 146 values = submission_build_values(form, submit) 147 ) 148} 149 150submission_submit <- function(x, ...) { 151 if (x$method == "POST") { 152 httr::POST(url = x$action, body = x$values, encode = x$enctype, ...) 153 } else { 154 httr::GET(url = x$action, query = x$values, ...) 155 } 156} 157 158submission_build_values <- function(form, submit = NULL) { 159 fields <- form$fields 160 submit <- submission_find_submit(fields, submit) 161 entry_list <- c(Filter(Negate(is_button), fields), list(submit)) 162 entry_list <- Filter(function(x) !is.null(x$name), entry_list) 163 164 if (length(entry_list) == 0) { 165 return(list()) 166 } 167 168 values <- lapply(entry_list, function(x) as.character(x$value)) 169 names <- map_chr(entry_list, "[[", "name") 170 171 out <- set_names(unlist(values, use.names = FALSE), rep(names, lengths(values))) 172 as.list(out) 173} 174 175submission_find_submit <- function(fields, idx) { 176 buttons <- Filter(is_button, fields) 177 178 if (is.null(idx)) { 179 if (length(buttons) == 0) { 180 list() 181 } else { 182 if (length(buttons) > 1) { 183 inform(paste0("Submitting with '", buttons[[1]]$name, "'")) 184 } 185 buttons[[1]] 186 } 187 } else if (is.numeric(idx) && length(idx) == 1) { 188 if (idx < 1 || idx > length(buttons)) { 189 abort("Numeric `submit` out of range") 190 } 191 buttons[[idx]] 192 } else if (is.character(idx) && length(idx) == 1) { 193 if (!idx %in% names(buttons)) { 194 abort(c( 195 paste0("No <input> found with name '", idx, "'."), 196 i = paste0("Possible values: ", paste0(names(buttons), collapse = ", ")) 197 )) 198 } 199 buttons[[idx]] 200 } else { 201 abort("`submit` must be NULL, a string, or a number.") 202 } 203} 204 205is_button <- function(x) { 206 tolower(x$type) %in% c("submit", "image", "button") 207} 208 209# Field parsing ----------------------------------------------------------- 210 211rvest_field <- function(type, name, value, attr, ...) { 212 structure( 213 list( 214 type = type, 215 name = name, 216 value = value, 217 attr = attr, 218 ... 219 ), 220 class = "rvest_field" 221 ) 222} 223 224#' @export 225format.rvest_field <- function(x, ...) { 226 if (x$type == "password") { 227 value <- paste0(rep("*", nchar(x$value %||% "")), collapse = "") 228 } else { 229 value <- paste(x$value, collapse = ", ") 230 value <- str_trunc(encodeString(value), 20) 231 } 232 233 paste0("<field> (", x$type, ") ", x$name, ": ", value) 234} 235 236#' @export 237print.rvest_field <- function(x, ...) { 238 cat(format(x, ...), "\n", sep = "") 239 invisible(x) 240} 241 242parse_input <- function(x) { 243 attr <- as.list(xml2::xml_attrs(x)) 244 rvest_field( 245 type = attr$type %||% "text", 246 name = attr$name, 247 value = attr$value, 248 attr = attr 249 ) 250} 251 252parse_select <- function(x) { 253 attr <- as.list(xml2::xml_attrs(x)) 254 options <- parse_options(html_elements(x, "option")) 255 256 rvest_field( 257 type = "select", 258 name = attr$name, 259 value = options$value, 260 attr = attr, 261 options = options$options 262 ) 263} 264parse_options <- function(options) { 265 parse_option <- function(option) { 266 name <- xml2::xml_text(option) 267 list( 268 value = xml2::xml_attr(option, "value", default = name), 269 name = name, 270 selected = xml2::xml_has_attr(option, "selected") 271 ) 272 } 273 274 parsed <- lapply(options, parse_option) 275 value <- map_chr(parsed, "[[", "value") 276 name <- map_chr(parsed, "[[", "name") 277 selected <- map_lgl(parsed, "[[", "selected") 278 279 list( 280 value = value[selected], 281 options = stats::setNames(value, name) 282 ) 283} 284 285parse_textarea <- function(x) { 286 attr <- as.list(xml2::xml_attrs(x)) 287 288 rvest_field( 289 type = "textarea", 290 name = attr$name, 291 value = xml2::xml_text(x), 292 attr = attr 293 ) 294} 295 296parse_button <- function(x) { 297 attr <- as.list(xml2::xml_attrs(x)) 298 299 rvest_field( 300 type = "button", 301 name = attr$name, 302 value = attr$value, 303 attr = attr 304 ) 305} 306 307# Helpers ----------------------------------------------------------------- 308 309convert_enctype <- function(x) { 310 if (is.null(x)) { 311 "form" 312 } else if (x == "application/x-www-form-urlencoded") { 313 "form" 314 } else if (x == "multipart/form-data") { 315 "multipart" 316 } else { 317 warn(paste0("Unknown enctype (", x, "). Defaulting to form encoded.")) 318 "form" 319 } 320} 321 322format_list <- function(x, indent = 0) { 323 spaces <- paste(rep(" ", indent), collapse = "") 324 325 formatted <- vapply(x, format, character(1)) 326 paste0(spaces, formatted, collapse = "\n") 327} 328 329check_fields <- function(form, values) { 330 no_match <- setdiff(names(values), names(form$fields)) 331 if (length(no_match) > 0) { 332 str <- paste("'", no_match, "'", collapse = ", ") 333 abort(paste0("Can't set value of fields that don't exist: ", str)) 334 } 335} 336