1#' Parse an html table into a data frame 2#' 3#' The algorithm mimics what a browser does, but repeats the values of merged 4#' cells in every cell that cover. 5#' 6#' @inheritParams html_name 7#' @param header Use first row as header? If `NA`, will use first row 8#' if it consists of `<th>` tags. 9#' 10#' If `TRUE`, column names are left exactly as they are in the source 11#' document, which may require post-processing to generate a valid data 12#' frame. 13#' @param trim Remove leading and trailing whitespace within each cell? 14#' @param fill Deprecated - missing cells in tables are now always 15#' automatically filled with `NA`. 16#' @param dec The character used as decimal place marker. 17#' @param na.strings Character vector of values that will be converted to `NA` 18#' if `convert` is `TRUE`. 19#' @param convert If `TRUE`, will run [`type.convert()`] to interpret texts as 20#' integer, double, or `NA`. 21#' @return 22#' When applied to a single element, `html_table()` returns a single tibble. 23#' When applied to multiple elements or a document, `html_table()` returns 24#' a list of tibbles. 25#' @export 26#' @examples 27#' sample1 <- minimal_html("<table> 28#' <tr><th>Col A</th><th>Col B</th></tr> 29#' <tr><td>1</td><td>x</td></tr> 30#' <tr><td>4</td><td>y</td></tr> 31#' <tr><td>10</td><td>z</td></tr> 32#' </table>") 33#' sample1 %>% 34#' html_element("table") %>% 35#' html_table() 36#' 37#' # Values in merged cells will be duplicated 38#' sample2 <- minimal_html("<table> 39#' <tr><th>A</th><th>B</th><th>C</th></tr> 40#' <tr><td>1</td><td>2</td><td>3</td></tr> 41#' <tr><td colspan='2'>4</td><td>5</td></tr> 42#' <tr><td>6</td><td colspan='2'>7</td></tr> 43#' </table>") 44#' sample2 %>% 45#' html_element("table") %>% 46#' html_table() 47#' 48#' # If a row is missing cells, they'll be filled with NAs 49#' sample3 <- minimal_html("<table> 50#' <tr><th>A</th><th>B</th><th>C</th></tr> 51#' <tr><td colspan='2'>1</td><td>2</td></tr> 52#' <tr><td colspan='2'>3</td></tr> 53#' <tr><td>4</td></tr> 54#' </table>") 55#' sample3 %>% 56#' html_element("table") %>% 57#' html_table() 58html_table <- function(x, 59 header = NA, 60 trim = TRUE, 61 fill = deprecated(), 62 dec = ".", 63 na.strings = "NA", 64 convert = TRUE 65 ) { 66 67 UseMethod("html_table") 68} 69 70#' @export 71html_table.xml_document <- function(x, 72 header = NA, 73 trim = TRUE, 74 fill = deprecated(), 75 dec = ".", 76 na.strings = "NA", 77 convert = TRUE) { 78 tables <- xml2::xml_find_all(x, ".//table") 79 html_table( 80 tables, 81 header = header, 82 trim = trim, 83 fill = fill, 84 dec = dec, 85 na.strings = na.strings, 86 convert = convert 87 ) 88} 89 90#' @export 91html_table.xml_nodeset <- function(x, 92 header = NA, 93 trim = TRUE, 94 fill = deprecated(), 95 dec = ".", 96 na.strings = "NA", 97 convert = TRUE) { 98 lapply( 99 x, 100 html_table, 101 header = header, 102 trim = trim, 103 fill = fill, 104 dec = dec, 105 na.strings = na.strings, 106 convert = convert 107 ) 108} 109 110#' @export 111html_table.xml_node <- function(x, 112 header = NA, 113 trim = TRUE, 114 fill = deprecated(), 115 dec = ".", 116 na.strings = "NA", 117 convert = TRUE) { 118 119 if (lifecycle::is_present(fill) && !isTRUE(fill)) { 120 lifecycle::deprecate_warn( 121 when = "1.0.0", 122 what = "html_table(fill = )", 123 details = "An improved algorithm fills by default so it is no longer needed." 124 ) 125 } 126 127 ns <- xml2::xml_ns(x) 128 rows <- xml2::xml_find_all(x, ".//tr", ns = ns) 129 cells <- lapply(rows, xml2::xml_find_all, ".//td|.//th", ns = ns) 130 131 if (length(cells) == 0) { 132 return(tibble::tibble()) 133 } 134 135 out <- table_fill(cells, trim = trim) 136 137 if (is.na(header)) { 138 header <- all(html_name(cells[[1]]) == "th") 139 } 140 if (header) { 141 col_names <- out[1, , drop = FALSE] 142 out <- out[-1, , drop = FALSE] 143 } else { 144 col_names <- paste0("X", seq_len(ncol(out))) 145 } 146 147 colnames(out) <- col_names 148 df <- tibble::as_tibble(out, .name_repair = "minimal") 149 150 if (isTRUE(convert)) { 151 df[] <- lapply(df, function(x) { 152 utils::type.convert(x, as.is = TRUE, dec = dec, na.strings = na.strings) 153 }) 154 } 155 156 df 157} 158 159# Table fillng algorithm -------------------------------------------------- 160# Base on https://html.spec.whatwg.org/multipage/tables.html#forming-a-table 161 162table_fill <- function(cells, trim = TRUE) { 163 width <- 0 164 height <- length(cells) # initial estimate 165 values <- vector("list", height) 166 167 # list of downward spanning cells 168 dw <- dw_init() 169 170 # https://html.spec.whatwg.org/multipage/tables.html#algorithm-for-processing-rows 171 for (i in seq_along(cells)) { 172 row <- cells[[i]] 173 if (length(row) == 0) { 174 next 175 } 176 177 rowspan <- as.integer(html_attr(row, "rowspan", default = NA_character_)) 178 rowspan[is.na(rowspan)] <- 1 179 colspan <- as.integer(html_attr(row, "colspan", default = NA_character_)) 180 colspan[is.na(colspan)] <- 1 181 text <- html_text(row) 182 if (isTRUE(trim)) { 183 text <- gsub("^[[:space:]\u00a0]+|[[:space:]\u00a0]+$", "", text) 184 } 185 186 vals <- rep(NA_character_, width) 187 col <- 1 188 j <- 1 189 while(j <= length(row)) { 190 if (col %in% dw$col) { 191 cell <- dw_find(dw, col) 192 cell_text <- cell$text 193 cell_colspan <- cell$colspan 194 } else { 195 cell_text <- text[[j]] 196 cell_colspan <- colspan[[j]] 197 198 if (rowspan[[j]] > 1) { 199 dw <- dw_add(dw, col, rowspan[[j]], colspan[[j]], text[[j]]) 200 } 201 202 j <- j + 1 203 } 204 vals[col:(col + cell_colspan - 1L)] <- cell_text 205 col <- col + cell_colspan 206 } 207 208 # Add any downward cells after last <td> 209 for(j in seq2(col - 1L, width)) { 210 if (j %in% dw$col) { 211 cell <- dw_find(dw, j) 212 vals[j:(j + cell$colspan - 1L)] <- cell$text 213 } 214 } 215 216 dw <- dw_prune(dw) 217 values[[i]] <- vals 218 219 height <- max(height, i + max(rowspan) - 1L) 220 width <- max(width, col - 1L) 221 } 222 223 # Add any downward cells after <tr> 224 i <- length(values) + 1 225 length(values) <- height 226 while (length(dw$col) > 0) { 227 vals <- rep(NA_character_, width) 228 for (col in dw$col) { 229 cell <- dw_find(dw, col) 230 vals[col:(col + cell$colspan - 1L)] <- cell$text 231 } 232 values[[i]] <- vals 233 i <- i + 1 234 dw <- dw_prune(dw) 235 } 236 237 values <- lapply(values, `[`, seq_len(width)) 238 matrix(unlist(values), ncol = width, byrow = TRUE) 239} 240 241dw_find <- function(dw, col) { 242 match <- col == dw$col 243 list( 244 col = dw$col[match], 245 rowspan = dw$rowspan[match], 246 colspan = dw$colspan[match], 247 text = dw$text[match] 248 ) 249} 250 251dw_init <- function() { 252 list( 253 col = integer(), 254 rowspan = integer(), 255 colspan = integer(), 256 text = character() 257 ) 258} 259 260dw_add <- function(dw, col, rowspan, colspan, text) { 261 dw$col <- c(dw$col, col) 262 dw$text <- c(dw$text, text) 263 dw$rowspan <- c(dw$rowspan, rowspan) 264 dw$colspan <- c(dw$colspan, colspan) 265 dw 266} 267 268dw_prune <- function(dw) { 269 dw$rowspan <- dw$rowspan - 1L 270 keep <- dw$rowspan > 0L 271 272 dw$col <- dw$col[keep] 273 dw$text <- dw$text[keep] 274 dw$rowspan <- dw$rowspan[keep] 275 dw$colspan <- dw$colspan[keep] 276 dw 277} 278