1#' Parse an html table into a data frame
2#'
3#' The algorithm mimics what a browser does, but repeats the values of merged
4#' cells in every cell that cover.
5#'
6#' @inheritParams html_name
7#' @param header Use first row as header? If `NA`, will use first row
8#'   if it consists of `<th>` tags.
9#'
10#'   If `TRUE`, column names are left exactly as they are in the source
11#'   document, which may require post-processing to generate a valid data
12#'   frame.
13#' @param trim Remove leading and trailing whitespace within each cell?
14#' @param fill Deprecated - missing cells in tables are now always
15#'    automatically filled with `NA`.
16#' @param dec The character used as decimal place marker.
17#' @param na.strings Character vector of values that will be converted to `NA`
18#'    if `convert` is `TRUE`.
19#' @param convert If `TRUE`, will run [`type.convert()`] to interpret texts as
20#'    integer, double, or `NA`.
21#' @return
22#' When applied to a single element, `html_table()` returns a single tibble.
23#' When applied to multiple elements or a document, `html_table()` returns
24#' a list of tibbles.
25#' @export
26#' @examples
27#' sample1 <- minimal_html("<table>
28#'   <tr><th>Col A</th><th>Col B</th></tr>
29#'   <tr><td>1</td><td>x</td></tr>
30#'   <tr><td>4</td><td>y</td></tr>
31#'   <tr><td>10</td><td>z</td></tr>
32#' </table>")
33#' sample1 %>%
34#'   html_element("table") %>%
35#'   html_table()
36#'
37#' # Values in merged cells will be duplicated
38#' sample2 <- minimal_html("<table>
39#'   <tr><th>A</th><th>B</th><th>C</th></tr>
40#'   <tr><td>1</td><td>2</td><td>3</td></tr>
41#'   <tr><td colspan='2'>4</td><td>5</td></tr>
42#'   <tr><td>6</td><td colspan='2'>7</td></tr>
43#' </table>")
44#' sample2 %>%
45#'   html_element("table") %>%
46#'   html_table()
47#'
48#' # If a row is missing cells, they'll be filled with NAs
49#' sample3 <- minimal_html("<table>
50#'   <tr><th>A</th><th>B</th><th>C</th></tr>
51#'   <tr><td colspan='2'>1</td><td>2</td></tr>
52#'   <tr><td colspan='2'>3</td></tr>
53#'   <tr><td>4</td></tr>
54#' </table>")
55#' sample3 %>%
56#'   html_element("table") %>%
57#'   html_table()
58html_table <- function(x,
59                       header = NA,
60                       trim = TRUE,
61                       fill = deprecated(),
62                       dec = ".",
63                       na.strings = "NA",
64                       convert = TRUE
65  ) {
66
67  UseMethod("html_table")
68}
69
70#' @export
71html_table.xml_document <- function(x,
72                                    header = NA,
73                                    trim = TRUE,
74                                    fill = deprecated(),
75                                    dec = ".",
76                                    na.strings = "NA",
77                                    convert = TRUE) {
78  tables <- xml2::xml_find_all(x, ".//table")
79  html_table(
80    tables,
81    header = header,
82    trim = trim,
83    fill = fill,
84    dec = dec,
85    na.strings = na.strings,
86    convert = convert
87  )
88}
89
90#' @export
91html_table.xml_nodeset <- function(x,
92                                   header = NA,
93                                   trim = TRUE,
94                                   fill = deprecated(),
95                                   dec = ".",
96                                   na.strings = "NA",
97                                   convert = TRUE) {
98  lapply(
99    x,
100    html_table,
101    header = header,
102    trim = trim,
103    fill = fill,
104    dec = dec,
105    na.strings = na.strings,
106    convert = convert
107  )
108}
109
110#' @export
111html_table.xml_node <- function(x,
112                                header = NA,
113                                trim = TRUE,
114                                fill = deprecated(),
115                                dec = ".",
116                                na.strings = "NA",
117                                convert = TRUE) {
118
119  if (lifecycle::is_present(fill) && !isTRUE(fill)) {
120    lifecycle::deprecate_warn(
121      when = "1.0.0",
122      what = "html_table(fill = )",
123      details = "An improved algorithm fills by default so it is no longer needed."
124    )
125  }
126
127  ns <- xml2::xml_ns(x)
128  rows <- xml2::xml_find_all(x, ".//tr", ns = ns)
129  cells <- lapply(rows, xml2::xml_find_all, ".//td|.//th", ns = ns)
130
131  if (length(cells) == 0) {
132    return(tibble::tibble())
133  }
134
135  out <- table_fill(cells, trim = trim)
136
137  if (is.na(header)) {
138    header <- all(html_name(cells[[1]]) == "th")
139  }
140  if (header) {
141    col_names <- out[1, , drop = FALSE]
142    out <- out[-1, , drop = FALSE]
143  } else {
144    col_names <- paste0("X", seq_len(ncol(out)))
145  }
146
147  colnames(out) <- col_names
148  df <- tibble::as_tibble(out, .name_repair = "minimal")
149
150  if (isTRUE(convert)) {
151    df[] <- lapply(df, function(x) {
152        utils::type.convert(x, as.is = TRUE, dec = dec, na.strings = na.strings)
153    })
154  }
155
156  df
157}
158
159# Table fillng algorithm --------------------------------------------------
160# Base on https://html.spec.whatwg.org/multipage/tables.html#forming-a-table
161
162table_fill <- function(cells, trim = TRUE) {
163  width <- 0
164  height <- length(cells) # initial estimate
165  values <- vector("list", height)
166
167  # list of downward spanning cells
168  dw <- dw_init()
169
170  # https://html.spec.whatwg.org/multipage/tables.html#algorithm-for-processing-rows
171  for (i in seq_along(cells)) {
172    row <- cells[[i]]
173    if (length(row) == 0) {
174      next
175    }
176
177    rowspan <- as.integer(html_attr(row, "rowspan", default = NA_character_))
178    rowspan[is.na(rowspan)] <- 1
179    colspan <- as.integer(html_attr(row, "colspan", default = NA_character_))
180    colspan[is.na(colspan)] <- 1
181    text <- html_text(row)
182    if (isTRUE(trim)) {
183      text <- gsub("^[[:space:]\u00a0]+|[[:space:]\u00a0]+$", "", text)
184    }
185
186    vals <- rep(NA_character_, width)
187    col <- 1
188    j <- 1
189    while(j <= length(row)) {
190      if (col %in% dw$col) {
191        cell <- dw_find(dw, col)
192        cell_text <- cell$text
193        cell_colspan <- cell$colspan
194      } else {
195        cell_text <- text[[j]]
196        cell_colspan <- colspan[[j]]
197
198        if (rowspan[[j]] > 1) {
199          dw <- dw_add(dw, col, rowspan[[j]], colspan[[j]], text[[j]])
200        }
201
202        j <- j + 1
203      }
204      vals[col:(col + cell_colspan - 1L)] <- cell_text
205      col <- col + cell_colspan
206    }
207
208    # Add any downward cells after last <td>
209    for(j in seq2(col - 1L, width)) {
210      if (j %in% dw$col) {
211        cell <- dw_find(dw, j)
212        vals[j:(j + cell$colspan - 1L)] <- cell$text
213      }
214    }
215
216    dw <- dw_prune(dw)
217    values[[i]] <- vals
218
219    height <- max(height, i + max(rowspan) - 1L)
220    width <- max(width, col - 1L)
221  }
222
223  # Add any downward cells after <tr>
224  i <- length(values) + 1
225  length(values) <- height
226  while (length(dw$col) > 0) {
227    vals <- rep(NA_character_, width)
228    for (col in dw$col) {
229      cell <- dw_find(dw, col)
230      vals[col:(col + cell$colspan - 1L)] <- cell$text
231    }
232    values[[i]] <- vals
233    i <- i + 1
234    dw <- dw_prune(dw)
235  }
236
237  values <- lapply(values, `[`, seq_len(width))
238  matrix(unlist(values), ncol = width, byrow = TRUE)
239}
240
241dw_find <- function(dw, col) {
242  match <- col == dw$col
243  list(
244    col = dw$col[match],
245    rowspan = dw$rowspan[match],
246    colspan = dw$colspan[match],
247    text = dw$text[match]
248  )
249}
250
251dw_init <- function() {
252  list(
253    col = integer(),
254    rowspan = integer(),
255    colspan = integer(),
256    text = character()
257  )
258}
259
260dw_add <- function(dw, col, rowspan, colspan, text) {
261  dw$col <-     c(dw$col, col)
262  dw$text <-    c(dw$text, text)
263  dw$rowspan <- c(dw$rowspan, rowspan)
264  dw$colspan <- c(dw$colspan, colspan)
265  dw
266}
267
268dw_prune <- function(dw) {
269  dw$rowspan <- dw$rowspan - 1L
270  keep <- dw$rowspan > 0L
271
272  dw$col <-     dw$col[keep]
273  dw$text <-    dw$text[keep]
274  dw$rowspan <- dw$rowspan[keep]
275  dw$colspan <- dw$colspan[keep]
276  dw
277}
278