1#' Find External Resource References 2#' 3#' Given an R Markdown document or HTML file, attempt to determine the set of 4#' additional files needed in order to render and display the document. 5#' 6#' This routine applies heuristics in order to scan a document for 7#' possible resource references. 8#' 9#' In R Markdown documents, it looks for references to files implicitly 10#' referenced in Markdown (e.g. \code{![alt](img.png)}), in the document's 11#' YAML header, in raw HTML chunks, and as quoted strings in R code chunks 12#' (e.g. \code{read.csv("data.csv")}). 13#' 14#' Resources specified explicitly in the YAML header for R Markdown 15#' documents are also returned. To specify resources in YAML, use the 16#' \code{resource_files} key: 17#' 18#' \preformatted{--- 19#' title: My Document 20#' author: My Name 21#' resource_files: 22#' - data/mydata.csv 23#' - images/figure.png 24#' ---} 25#' 26#' Each item in the \code{resource_files} list can refer to: 27#' \enumerate{ 28#' \item A single file, such as \code{images/figure.png}, or 29#' \item A directory, such as \code{resources/data}, in which case all of the 30#' directory's content will be recursively included, or 31#' \item A wildcard pattern, such as \code{data/*.csv}, in which case all of 32#' the files matching the pattern will be included. No recursion is done in 33#' this case. 34#' } 35#' 36#' In HTML files (and raw HTML chunks in R Markdown documents), this routine 37#' searches for resources specified in common tag attributes, such as 38#' \code{<img src="...">}, \code{<link href="...">}, etc. 39#' 40#' In all cases, only resources that exist on disk and are contained in the 41#' document's directory (or a child thereof) are returned. 42#' @param input_file path to the R Markdown document or HTML file to process 43#' @inheritParams render 44#' @return A data frame with the following columns: 45#' \describe{ 46#' \item{path}{The relative path from the document to the resource} 47#' \item{explicit}{Whether the resource was specified explicitly 48#' (\code{TRUE}) or discovered implicitly (\code{FALSE})} 49#' \item{web}{Whether the resource is needed to display a Web page rendered 50#' from the document} 51#' } 52#' @export 53find_external_resources <- function(input_file, encoding = 'UTF-8') { 54 55 # ensure we're working with valid input 56 ext <- tolower(xfun::file_ext(input_file)) 57 if (!(ext %in% c("md", "rmd", "qmd", "html", "htm", "r", "css"))) { 58 stop("Resource discovery is only supported for R Markdown files or HTML files.") 59 } 60 61 if (!file.exists(input_file)) { 62 stop("The input file file '", input_file, "' does not exist.") 63 } 64 65 # set up the frame we'll use to report results 66 discovered_resources <- data.frame( 67 path = character(0), explicit = logical(0), web = logical(0) 68 ) 69 70 input_dir <- dirname(normalize_path(input_file)) 71 72 # discover a single resource--tests a string to see if it corresponds to a 73 # resource on disk; if so, adds it to the list of known resources and returns 74 # TRUE 75 discover_single_resource <- function(path, explicit, web) { 76 77 if (!(is.character(path) && length(path) == 1 && path != "." && path != ".." && 78 file.exists(file.path(input_dir, path)))) 79 return(FALSE) 80 81 ext <- tolower(xfun::file_ext(file.path(input_dir, path))) 82 83 if (identical(ext, "r")) { 84 # if this is a .R script, look for resources it contains, too 85 discover_r_resources(file.path(input_dir, path), discover_single_resource) 86 } else if (identical(ext, "css")) { 87 # if it's a CSS file, look for files it references (e.g. fonts/images) 88 discover_css_resources(file.path(input_dir, path), discover_single_resource) 89 } 90 # if this is an implicitly discovered resource, it needs to refer to 91 # a file rather than a directory 92 if (!explicit && dir_exists(file.path(input_dir, path))) { 93 return(FALSE) 94 } 95 96 # this looks valid; remember it 97 discovered_resources <<- rbind(discovered_resources, data.frame( 98 path = path, explicit = explicit, web = web, stringsAsFactors = FALSE 99 )) 100 TRUE 101 } 102 103 # run the main resource discovery appropriate to the file type 104 if (ext %in% c("md", "rmd", "qmd")) { 105 # discover R Markdown doc resources--scans the document itself as described 106 # in comments above, renders as Markdown, and invokes HTML discovery 107 # on the result 108 discover_rmd_resources(input_file, discover_single_resource) 109 } else if (ext %in% c("htm", "html")) { 110 # discover HTML resources 111 discover_html_resources(input_file, discover_single_resource) 112 113 # if the HTML file represents a rendered R Markdown document, it may have a 114 # sidecar _files folder; include that if it's present 115 sidecar_files_dir <- knitr_files_dir(input_file) 116 files_dir_info <- file.info(sidecar_files_dir) 117 if (isTRUE(files_dir_info$isdir)) { 118 # we probably auto-discovered some resources from _files--exclude those 119 # since they'll be covered by the directory 120 files_dir_prefix <- file.path(basename(sidecar_files_dir), "") 121 files_dir_matches <- substr( 122 discovered_resources$path, 1, nchar(files_dir_prefix) 123 ) == files_dir_prefix 124 discovered_resources <- discovered_resources[!files_dir_matches, , drop = FALSE] 125 126 # add the directory itself 127 discovered_resources <- rbind(discovered_resources, data.frame( 128 path = files_dir_prefix, explicit = FALSE, web = TRUE, 129 stringsAsFactors = FALSE) 130 ) 131 } 132 } else if (ext == "r") { 133 discover_r_resources(input_file, discover_single_resource) 134 } else if (ext == "css") { 135 discover_css_resources(input_file, discover_single_resource) 136 } 137 138 # clean row names (they're not meaningful) 139 rownames(discovered_resources) <- NULL 140 141 # convert paths from factors if necssary, and clean any redundant ./ leaders 142 discovered_resources$path <- as.character(discovered_resources$path) 143 has_prefix <- grepl("^\\./", discovered_resources$path) 144 discovered_resources$path[has_prefix] <- substring(discovered_resources$path[has_prefix], 3) 145 146 discovered_resources 147} 148 149# discovers resources in a single HTML file 150discover_html_resources <- function(html_file, discover_single_resource) { 151 # resource accumulator 152 discover_resource <- function(node, att, val, idx) { 153 res_file <- utils::URLdecode(val) 154 discover_single_resource(res_file, FALSE, TRUE) 155 } 156 157 # create a single string with all of the lines in the document 158 html_lines <- file_string(html_file) 159 160 # parse the HTML and invoke our resource discovery callbacks 161 call_resource_attrs(html_lines, discover_resource) 162} 163 164# discovers resources in a single R Markdown document 165discover_rmd_resources <- function(rmd_file, discover_single_resource) { 166 167 # create a UTF-8 encoded Markdown file to serve as the resource discovery 168 # source 169 md_file <- tempfile(fileext = ".md") 170 input_dir <- dirname(normalize_path(rmd_file)) 171 output_dir <- dirname(md_file) 172 rmd_content <- read_utf8(rmd_file) 173 if (length(i <- grep('^---\\s*$', rmd_content)) >= 2 && i[1] == 1) { 174 rmd_content <- append(rmd_content, 'citeproc: false', i[2] - 1) 175 } 176 write_utf8(rmd_content, md_file) 177 178 # create a vector of temporary files; anything in here 179 # will be cleaned up on exit 180 temp_files <- md_file 181 on.exit(unlink(temp_files, recursive = TRUE), add = TRUE) 182 183 # discovers render-time resources; if any are found, adds them to the list of 184 # discovered resources and copies them alongside the input document. 185 discover_render_resource <- function(output_render_file) { 186 if (discover_single_resource(output_render_file, FALSE, FALSE)) { 187 # mirror original directory structure so we don't need to mutate input 188 # prior to render 189 output_target_file <- file.path(output_dir, output_render_file) 190 if (!file.exists(dirname(output_target_file))) { 191 dir.create(dirname(output_target_file), showWarnings = FALSE, recursive = TRUE) 192 } 193 194 # copy the original resource to the temporary render folder 195 file.copy(file.path(input_dir, output_render_file), output_target_file) 196 197 # clean up this file when we're done 198 temp_files <<- c(temp_files, output_target_file) 199 } 200 } 201 202 # parse the YAML front matter to discover resources named there 203 front_matter <- yaml_front_matter(md_file) 204 205 # Check for content referred to by output format calls to the includes 206 # function (for generating headers/footers/etc. at render time), and for 207 # references to files in pandoc arguments. 208 # 209 # These will be needed to produce even a vanilla Markdown variant of the input 210 # document, so copy them to the temporary folder in preparation for rendering 211 # (in addition to marking them as required resources). 212 output_formats <- front_matter[["output"]] 213 if (is.list(output_formats)) { 214 for (output_format in output_formats) { 215 if (is.list(output_format)) { 216 output_render_files <- unlist(output_format[c( 217 'includes', 'pandoc_args', 'logo', 'reference_doc', 'reference_docx', 'template' 218 )]) 219 lapply(output_render_files, discover_render_resource) 220 } 221 } 222 } 223 224 # check for explicitly named resources 225 if (!is.null(front_matter$resource_files)) { 226 lapply(front_matter$resource_files, function(res) { 227 explicit_res <- if (is.character(res)) { 228 list(path = res, explicit = TRUE, web = is_web_file(res)) 229 } else if (is.list(res) && length(names(res)) > 0) { 230 # list--happens when web flag is specified explicitly in YAML. 231 list(path = names(res)[[1]], 232 explicit = TRUE, 233 web = if (is.null(res$web)) is_web_file(res) else res$web) 234 } 235 236 # check the extracted filename to see if it exists 237 if (!is.null(explicit_res)) { 238 if (grepl("*", explicit_res$path, fixed = TRUE)) { 239 # if the resource file spec includes a wildcard, list the files 240 # that match the pattern 241 files <- list.files( 242 file.path(input_dir, dirname(explicit_res$path)), 243 utils::glob2rx(basename(explicit_res$path)) 244 ) 245 lapply(files, function(f) discover_single_resource( 246 file.path(dirname(explicit_res$path), f), TRUE, web = is_web_file(f) 247 )) 248 } else { 249 # no wildcard, see whether this resource refers to a directory or to 250 # an individual file 251 info <- file.info(file.path(input_dir, explicit_res$path)) 252 if (is.na(info$isdir)) { 253 # implies that the file doesn't exist (should we warn here?) 254 NULL 255 } else if (isTRUE(info$isdir)) { 256 # if the resource file spec is a directory, include all the files in 257 # the directory, recursively 258 files <- list.files( 259 file.path(input_dir, explicit_res$path), recursive = TRUE 260 ) 261 lapply(files, function(f) discover_single_resource( 262 file.path(explicit_res$path, f), TRUE, web = is_web_file(f) 263 )) 264 } else { 265 # isdir is false--this is an individual file; return it 266 discover_single_resource(explicit_res$path, explicit_res$explicit, explicit_res$web) 267 } 268 } 269 } else { 270 discover_single_resource(explicit_res$path, explicit_res$explicit, explicit_res$web) 271 } 272 }) 273 } 274 275 # check for a 'preview' yaml metadata entry 276 if (!is.null(front_matter[["preview"]])) { 277 discover_single_resource(front_matter[["preview"]], explicit = FALSE, web = TRUE) 278 } 279 280 # check for bibliography and csl files at the top level 281 for (bibfile in c("bibliography", "csl")) { 282 lapply(front_matter[[bibfile]], discover_render_resource) 283 } 284 285 # check for parameter values that look like files. 286 if (!is.null(front_matter$params)) { 287 # This is the raw parameter information and has not had any YAML tag 288 # processing performed. See `knitr:::resolve_params`. 289 lapply(front_matter$params, function(param) { 290 if (is.list(param)) { 291 if (identical(param$input, "file")) { 292 if (!is.null(param$value)) { 293 # We treat param filenames as non-web resources. 294 discover_single_resource(param$value, TRUE, FALSE) 295 } 296 } 297 } 298 }) 299 } 300 301 # check for knitr child documents in R Markdown documents 302 if (tolower(xfun::file_ext(rmd_file)) %in% c("qmd", "rmd")) { 303 chunk_lines <- gregexpr(knitr::all_patterns$md$chunk.begin, rmd_content, perl = TRUE) 304 for (idx in seq_along(chunk_lines)) { 305 chunk_line <- chunk_lines[idx][[1]] 306 if (is.na(chunk_line) || chunk_line < 0) next 307 chunk_start <- attr(chunk_line, "capture.start", exact = TRUE) + 1 308 chunk_text <- substr( 309 rmd_content[idx], chunk_start, 310 chunk_start + attr(chunk_line, "capture.length", exact = TRUE) - 2 311 ) 312 for (child_expr in c("\\bchild\\s*=\\s*'([^']+)'", "\\bchild\\s*=\\s*\"([^\"]+)\"")) { 313 child_match <- gregexpr(child_expr, chunk_text, perl = TRUE)[[1]] 314 if (child_match > 0) { 315 child_start <- attr(child_match, "capture.start", exact = TRUE) 316 child_text <- substr( 317 chunk_text, child_start, 318 child_start + attr(child_match, "capture.length", exact = TRUE) - 1 319 ) 320 discover_render_resource(child_text) 321 } 322 } 323 } 324 } 325 326 # render "raw" markdown to HTML 327 html_file <- tempfile(fileext = ".html") 328 on.exit(unlink(html_file), add = TRUE) 329 330 # check to see what format this document is going to render as; if it's a 331 # format that produces HTML, let it render as-is, but if it isn't, render as 332 # html_document to pick up dependencies 333 output_format <- output_format_from_yaml_front_matter(rmd_content) 334 335 output_format_function <- eval(xfun::parse_only(output_format$name)) 336 337 override_output_format <- if (!is_pandoc_to_html(output_format_function()$pandoc)) "html_document" 338 339 html_file <- render( 340 md_file, override_output_format, html_file, quiet = TRUE, 341 output_options = list( 342 self_contained = FALSE, 343 pandoc_args = c("--metadata", "pagetitle=PREVIEW") 344 ) 345 ) 346 347 # clean up output file and its supporting files directory 348 temp_files <- c(temp_files, html_file, knitr_files_dir(md_file), knitr_files_dir(html_file)) 349 350 # run the HTML resource discovery mechanism on the rendered output 351 discover_html_resources(html_file, discover_single_resource) 352 353 # if this is an R Markdown file, purl the file to extract just the R code 354 if (tolower(xfun::file_ext(rmd_file)) %in% c("qmd", "rmd")) { 355 r_file <- tempfile(fileext = ".R") 356 # suppress possible try() errors https://github.com/rstudio/rmarkdown/issues/1247 357 try_file <- tempfile() 358 opts <- options(try.outFile = try_file) 359 on.exit({ 360 unlink(c(r_file, try_file)); options(opts) 361 }, add = TRUE) 362 knitr::purl(md_file, output = r_file, quiet = TRUE, documentation = 0) 363 temp_files <- c(temp_files, r_file) 364 discover_r_resources(r_file, discover_single_resource) 365 } 366} 367 368discover_r_resources <- function(r_file, discover_single_resource) { 369 370 # read the lines from the R file 371 r_lines <- read_utf8(r_file) 372 373 # clean comments from the R code (simply; consider: # inside strings) 374 r_lines <- sub("#.*$", "", r_lines) 375 376 # find quoted strings in the code and attempt to ascertain whether they are 377 # files on disk 378 r_lines <- one_string(r_lines) 379 quoted_strs <- Reduce(c, lapply(c("\"[^\"\n]*\"", "'[^'\n]*'"), function(pat) { 380 matches <- unlist(regmatches(r_lines, gregexpr(pat, r_lines))) 381 substr(matches, 2, nchar(matches) - 1) 382 })) 383 384 # consider any quoted string containing a valid relative path to a file that 385 # exists on disk to be a reference 386 for (quoted_str in quoted_strs) { 387 if (nchar(quoted_str) > 0) 388 discover_single_resource(quoted_str, FALSE, is_web_file(quoted_str)) 389 } 390} 391 392# copies the external resources needed to render original_input into 393# intermediates_dir; with skip_web, skips web resources. returns a character 394# vector containing paths to all resources copied. 395copy_render_intermediates <- function(original_input, intermediates_dir, skip_web) { 396 397 # start with an empty set of intermediates 398 intermediates <- c() 399 400 # extract all the resources used by the input file; note that this actually 401 # runs another (non-knitting) render, and that recursion is avoided because 402 # we explicitly render with self-contained = FALSE while discovering 403 # resources 404 resources <- find_external_resources(original_input) 405 dest_dir <- normalize_path(intermediates_dir) 406 source_dir <- dirname(normalize_path(original_input)) 407 408 # process each returned reosurce 409 by(resources, seq_len(nrow(resources)), function(res) { 410 # skip web resources if requested 411 if (skip_web && res$web) return() 412 dest <- copy_file_with_dir(res$path, dest_dir, source_dir) 413 intermediates <<- c(intermediates, dest) 414 }) 415 416 # return the list of files we generated 417 intermediates 418} 419 420# copy a file from a relative path to a destination dir, and preserve its 421# original dir structure, e.g., if we copy foo/bar.txt to /tmp, the destination 422# file should be /tmp/foo/bar.txt instead of /tmp/bar.txt 423copy_file_with_dir <- function(path, dest, from = '.') { 424 dest <- file.path(dest, path) 425 path <- file.path(from, path) 426 if (!file.exists(path)) return() 427 if (!dir_exists(dirname(dest))) dir.create(dirname(dest), recursive = TRUE) 428 file.copy(path, dest) 429 dest 430} 431 432discover_css_resources <- function(css_file, discover_single_resource) { 433 434 css_lines <- read_utf8(css_file) 435 436 discover_resource <- function(node, att, val, idx) { 437 res_file <- utils::URLdecode(val) 438 discover_single_resource(res_file, FALSE, TRUE) 439 } 440 441 call_css_resource_attrs(one_string(css_lines), discover_resource) 442} 443 444# given a filename, return true if the file appears to be a web file 445is_web_file <- function(filename) { 446 tolower(xfun::file_ext(filename)) %in% c( 447 "css", "gif", "htm", "html", "jpeg", "jpg", "js", "mp3", "mp4", "png", "wav" 448 ) 449} 450