1#' Parse forms and set values
2#'
3#' Use `html_form()` to extract a form, set values with `html_form_set()`,
4#' and submit it with `html_form_submit()`.
5#'
6#' @export
7#' @inheritParams html_name
8#' @param base_url Base url of underlying HTML document. The default, `NULL`,
9#'   uses the url of the HTML document underlying `x`.
10#' @seealso HTML 4.01 form specification:
11#'   <http://www.w3.org/TR/html401/interact/forms.html>
12#' @return
13#' * `html_form()` returns as S3 object with class `rvest_form` when applied
14#'   to a single element. It returns a list of `rvest_form` objects when
15#'   applied to multiple elements or a document.
16#'
17#' * `html_form_set()` returns an `rvest_form` object.
18#'
19#' * `html_form_submit()` submits the form, returning an httr response which
20#'   can be parsed with [read_html()].
21#' @examples
22#' html <- read_html("http://www.google.com")
23#' search <- html_form(html)[[1]]
24#'
25#' search <- search %>% html_form_set(q = "My little pony", hl = "fr")
26#'
27#' # Or if you have a list of values, use !!!
28#' vals <- list(q = "web scraping", hl = "en")
29#' search <- search %>% html_form_set(!!!vals)
30#'
31#' # To submit and get result:
32#' \dontrun{
33#' resp <- html_form_submit(search)
34#' read_html(resp)
35#' }
36html_form <- function(x, base_url = NULL) UseMethod("html_form")
37
38#' @export
39html_form.xml_document <- function(x, base_url = NULL) {
40  html_form(xml2::xml_find_all(x, ".//form"), base_url = base_url)
41}
42
43#' @export
44html_form.xml_nodeset <- function(x, base_url = NULL) {
45  lapply(x, html_form, base_url = base_url)
46}
47
48#' @export
49html_form.xml_node <- function(x, base_url = NULL) {
50  stopifnot(xml2::xml_name(x) == "form")
51
52  attr <- as.list(xml2::xml_attrs(x))
53  name <- attr$id %||% attr$name %||% "<unnamed>" # for human readers
54  method <- toupper(attr$method %||% "GET")
55  enctype <- convert_enctype(attr$enctype)
56
57  nodes <- html_elements(x, "input, select, textarea, button")
58  fields <- lapply(nodes, function(x) {
59    switch(xml2::xml_name(x),
60      textarea = parse_textarea(x),
61      input = parse_input(x),
62      select = parse_select(x),
63      button = parse_button(x)
64    )
65  })
66  names(fields) <- map_chr(fields, function(x) x$name %||% "")
67
68  structure(
69    list(
70      name = name,
71      method = method,
72      action = xml2::url_absolute(attr$action, base_url %||% xml2::xml_url(x)),
73      enctype = enctype,
74      fields = fields
75    ),
76    class = "rvest_form")
77}
78
79#' @export
80print.rvest_form <- function(x, ...) {
81  cat("<form> '", x$name, "' (", x$method, " ", x$action, ")\n", sep = "")
82  cat(format_list(x$fields, indent = 1), "\n", sep = "")
83}
84
85
86# set ----------------------------------------------------------------
87
88#' @rdname html_form
89#' @param form A form
90#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name-value pairs giving
91#'   fields to modify.
92#'
93#'   Provide a character vector to set multiple checkboxes in a set or
94#'   select multiple values from a multi-select.
95#' @export
96html_form_set <- function(form, ...) {
97  check_form(form)
98
99  new_values <- list2(...)
100  check_fields(form, new_values)
101
102  for (field in names(new_values)) {
103    type <- form$fields[[field]]$type %||% "non-input"
104    if (type == "hidden") {
105      warn(paste0("Setting value of hidden field '", field, "'."))
106    } else if (type == "submit") {
107      abort(paste0("Can't change value of input with type submit: '", field, "'."))
108    }
109
110    form$fields[[field]]$value <- new_values[[field]]
111  }
112
113  form
114}
115
116# submit ------------------------------------------------------------------
117
118#' @rdname html_form
119#' @param submit Which button should be used to submit the form?
120#'   * `NULL`, the default, uses the first button.
121#'   * A string selects a button by its name.
122#'   * A number selects a button using its relative position.
123#' @export
124html_form_submit <- function(form, submit = NULL) {
125  check_form(form)
126
127  subm <- submission_build(form, submit)
128  submission_submit(subm)
129}
130
131submission_build <- function(form, submit) {
132  method <- form$method
133  if (!(method %in% c("POST", "GET"))) {
134    warn(paste0("Invalid method (", method, "), defaulting to GET"))
135    method <- "GET"
136  }
137
138  if (length(form$action) == 0) {
139    abort("`form` doesn't contain a `action` attribute")
140  }
141
142  list(
143    method = method,
144    enctype = form$enctype,
145    action = form$action,
146    values = submission_build_values(form, submit)
147  )
148}
149
150submission_submit <- function(x, ...) {
151  if (x$method == "POST") {
152    httr::POST(url = x$action, body = x$values, encode = x$enctype, ...)
153  } else {
154    httr::GET(url = x$action, query = x$values, ...)
155  }
156}
157
158submission_build_values <- function(form, submit = NULL) {
159  fields <- form$fields
160  submit <- submission_find_submit(fields, submit)
161  entry_list <- c(Filter(Negate(is_button), fields), list(submit))
162  entry_list <- Filter(function(x) !is.null(x$name), entry_list)
163
164  if (length(entry_list) == 0) {
165    return(list())
166  }
167
168  values <- lapply(entry_list, function(x) as.character(x$value))
169  names <- map_chr(entry_list, "[[", "name")
170
171  out <- set_names(unlist(values, use.names = FALSE), rep(names, lengths(values)))
172  as.list(out)
173}
174
175submission_find_submit <- function(fields, idx) {
176  buttons <- Filter(is_button, fields)
177
178  if (is.null(idx)) {
179    if (length(buttons) == 0) {
180      list()
181    } else {
182      if (length(buttons) > 1) {
183        inform(paste0("Submitting with '", buttons[[1]]$name, "'"))
184      }
185      buttons[[1]]
186    }
187  } else if (is.numeric(idx) && length(idx) == 1) {
188    if (idx < 1 || idx > length(buttons)) {
189      abort("Numeric `submit` out of range")
190    }
191    buttons[[idx]]
192  } else if (is.character(idx) && length(idx) == 1) {
193    if (!idx %in% names(buttons)) {
194      abort(c(
195        paste0("No <input> found with name '", idx, "'."),
196        i = paste0("Possible values: ", paste0(names(buttons), collapse = ", "))
197      ))
198    }
199    buttons[[idx]]
200  } else {
201    abort("`submit` must be NULL, a string, or a number.")
202  }
203}
204
205is_button <- function(x) {
206  tolower(x$type) %in% c("submit", "image", "button")
207}
208
209# Field parsing -----------------------------------------------------------
210
211rvest_field <- function(type, name, value, attr, ...) {
212  structure(
213    list(
214      type = type,
215      name = name,
216      value = value,
217      attr = attr,
218      ...
219    ),
220    class = "rvest_field"
221  )
222}
223
224#' @export
225format.rvest_field <- function(x, ...) {
226  if (x$type == "password") {
227    value <- paste0(rep("*", nchar(x$value %||% "")), collapse = "")
228  } else {
229    value <- paste(x$value, collapse = ", ")
230    value <- str_trunc(encodeString(value), 20)
231  }
232
233  paste0("<field> (", x$type, ") ", x$name, ": ", value)
234}
235
236#' @export
237print.rvest_field <- function(x, ...) {
238  cat(format(x, ...), "\n", sep = "")
239  invisible(x)
240}
241
242parse_input <- function(x) {
243  attr <- as.list(xml2::xml_attrs(x))
244  rvest_field(
245    type = attr$type %||% "text",
246    name = attr$name,
247    value = attr$value,
248    attr = attr
249  )
250}
251
252parse_select <- function(x) {
253  attr <- as.list(xml2::xml_attrs(x))
254  options <- parse_options(html_elements(x, "option"))
255
256  rvest_field(
257    type = "select",
258    name = attr$name,
259    value = options$value,
260    attr = attr,
261    options = options$options
262  )
263}
264parse_options <- function(options) {
265  parse_option <- function(option) {
266    name <- xml2::xml_text(option)
267    list(
268      value = xml2::xml_attr(option, "value", default = name),
269      name = name,
270      selected = xml2::xml_has_attr(option, "selected")
271    )
272  }
273
274  parsed <- lapply(options, parse_option)
275  value <-  map_chr(parsed, "[[", "value")
276  name <- map_chr(parsed, "[[", "name")
277  selected <- map_lgl(parsed, "[[", "selected")
278
279  list(
280    value = value[selected],
281    options = stats::setNames(value, name)
282  )
283}
284
285parse_textarea <- function(x) {
286  attr <- as.list(xml2::xml_attrs(x))
287
288  rvest_field(
289    type = "textarea",
290    name = attr$name,
291    value = xml2::xml_text(x),
292    attr = attr
293  )
294}
295
296parse_button <- function(x) {
297  attr <- as.list(xml2::xml_attrs(x))
298
299  rvest_field(
300    type = "button",
301    name = attr$name,
302    value = attr$value,
303    attr = attr
304  )
305}
306
307# Helpers -----------------------------------------------------------------
308
309convert_enctype <- function(x) {
310  if (is.null(x)) {
311    "form"
312  } else if (x == "application/x-www-form-urlencoded") {
313    "form"
314  } else if (x == "multipart/form-data") {
315    "multipart"
316  } else {
317    warn(paste0("Unknown enctype (", x, "). Defaulting to form encoded."))
318    "form"
319  }
320}
321
322format_list <- function(x, indent = 0) {
323  spaces <- paste(rep("  ", indent), collapse = "")
324
325  formatted <- vapply(x, format, character(1))
326  paste0(spaces, formatted, collapse = "\n")
327}
328
329check_fields <- function(form, values) {
330  no_match <- setdiff(names(values), names(form$fields))
331  if (length(no_match) > 0) {
332    str <- paste("'", no_match, "'", collapse = ", ")
333    abort(paste0("Can't set value of fields that don't exist: ", str))
334  }
335}
336