1#' Find External Resource References
2#'
3#' Given an R Markdown document or HTML file, attempt to determine the set of
4#' additional files needed in order to render and display the document.
5#'
6#' This routine applies heuristics in order to scan a document for
7#' possible resource references.
8#'
9#' In R Markdown documents, it looks for references to files implicitly
10#' referenced in Markdown (e.g. \code{![alt](img.png)}), in the document's
11#' YAML header, in raw HTML chunks, and as quoted strings in R code chunks
12#' (e.g. \code{read.csv("data.csv")}).
13#'
14#' Resources specified explicitly in the YAML header for R Markdown
15#' documents are also returned. To specify resources in YAML, use the
16#' \code{resource_files} key:
17#'
18#'   \preformatted{---
19#' title: My Document
20#' author: My Name
21#' resource_files:
22#'   - data/mydata.csv
23#'   - images/figure.png
24#' ---}
25#'
26#' Each item in the \code{resource_files} list can refer to:
27#' \enumerate{
28#' \item A single file, such as \code{images/figure.png}, or
29#' \item A directory, such as \code{resources/data}, in which case all of the
30#'   directory's content will be recursively included, or
31#' \item A wildcard pattern, such as \code{data/*.csv}, in which case all of
32#'   the files matching the pattern will be included. No recursion is done in
33#'   this case.
34#' }
35#'
36#' In HTML files (and raw HTML chunks in R Markdown documents), this routine
37#' searches for resources specified in common tag attributes, such as
38#' \code{<img src="...">}, \code{<link href="...">}, etc.
39#'
40#' In all cases, only resources that exist on disk and are contained in the
41#' document's directory (or a child thereof) are returned.
42#' @param input_file path to the R Markdown document or HTML file to process
43#' @inheritParams render
44#' @return A data frame with the following columns:
45#'   \describe{
46#'    \item{path}{The relative path from the document to the resource}
47#'    \item{explicit}{Whether the resource was specified explicitly
48#'      (\code{TRUE}) or discovered implicitly (\code{FALSE})}
49#'    \item{web}{Whether the resource is needed to display a Web page rendered
50#'      from the document}
51#'   }
52#' @export
53find_external_resources <- function(input_file, encoding = 'UTF-8') {
54
55  # ensure we're working with valid input
56  ext <- tolower(xfun::file_ext(input_file))
57  if (!(ext %in% c("md", "rmd", "qmd", "html", "htm", "r", "css"))) {
58    stop("Resource discovery is only supported for R Markdown files or HTML files.")
59  }
60
61  if (!file.exists(input_file)) {
62    stop("The input file file '", input_file, "' does not exist.")
63  }
64
65  # set up the frame we'll use to report results
66  discovered_resources <- data.frame(
67    path = character(0), explicit = logical(0), web = logical(0)
68  )
69
70  input_dir <- dirname(normalize_path(input_file))
71
72  # discover a single resource--tests a string to see if it corresponds to a
73  # resource on disk; if so, adds it to the list of known resources and returns
74  # TRUE
75  discover_single_resource <- function(path, explicit, web) {
76
77    if (!(is.character(path) && length(path) == 1 && path != "." && path != ".." &&
78          file.exists(file.path(input_dir, path))))
79      return(FALSE)
80
81    ext <- tolower(xfun::file_ext(file.path(input_dir, path)))
82
83    if (identical(ext, "r")) {
84      # if this is a .R script, look for resources it contains, too
85      discover_r_resources(file.path(input_dir, path), discover_single_resource)
86    } else if (identical(ext, "css")) {
87      # if it's a CSS file, look for files it references (e.g. fonts/images)
88      discover_css_resources(file.path(input_dir, path), discover_single_resource)
89    }
90    # if this is an implicitly discovered resource, it needs to refer to
91    # a file rather than a directory
92    if (!explicit && dir_exists(file.path(input_dir, path))) {
93      return(FALSE)
94    }
95
96    # this looks valid; remember it
97    discovered_resources <<- rbind(discovered_resources, data.frame(
98      path = path, explicit = explicit, web = web, stringsAsFactors = FALSE
99    ))
100    TRUE
101  }
102
103  # run the main resource discovery appropriate to the file type
104  if (ext %in% c("md", "rmd", "qmd")) {
105    # discover R Markdown doc resources--scans the document itself as described
106    # in comments above, renders as Markdown, and invokes HTML discovery
107    # on the result
108    discover_rmd_resources(input_file, discover_single_resource)
109  } else if (ext %in% c("htm", "html")) {
110    # discover HTML resources
111    discover_html_resources(input_file, discover_single_resource)
112
113    # if the HTML file represents a rendered R Markdown document, it may have a
114    # sidecar _files folder; include that if it's present
115    sidecar_files_dir <- knitr_files_dir(input_file)
116    files_dir_info <- file.info(sidecar_files_dir)
117    if (isTRUE(files_dir_info$isdir)) {
118      # we probably auto-discovered some resources from _files--exclude those
119      # since they'll be covered by the directory
120      files_dir_prefix <- file.path(basename(sidecar_files_dir), "")
121      files_dir_matches <- substr(
122        discovered_resources$path, 1, nchar(files_dir_prefix)
123      ) == files_dir_prefix
124      discovered_resources <- discovered_resources[!files_dir_matches, , drop = FALSE]
125
126      # add the directory itself
127      discovered_resources <- rbind(discovered_resources, data.frame(
128        path = files_dir_prefix, explicit = FALSE, web = TRUE,
129        stringsAsFactors = FALSE)
130      )
131    }
132  } else if (ext == "r") {
133    discover_r_resources(input_file, discover_single_resource)
134  } else if (ext == "css") {
135    discover_css_resources(input_file, discover_single_resource)
136  }
137
138  # clean row names (they're not meaningful)
139  rownames(discovered_resources) <- NULL
140
141  # convert paths from factors if necssary, and clean any redundant ./ leaders
142  discovered_resources$path <- as.character(discovered_resources$path)
143  has_prefix <- grepl("^\\./", discovered_resources$path)
144  discovered_resources$path[has_prefix] <- substring(discovered_resources$path[has_prefix], 3)
145
146  discovered_resources
147}
148
149# discovers resources in a single HTML file
150discover_html_resources <- function(html_file, discover_single_resource) {
151  # resource accumulator
152  discover_resource <- function(node, att, val, idx) {
153    res_file <- utils::URLdecode(val)
154    discover_single_resource(res_file, FALSE, TRUE)
155  }
156
157  # create a single string with all of the lines in the document
158  html_lines <- file_string(html_file)
159
160  # parse the HTML and invoke our resource discovery callbacks
161  call_resource_attrs(html_lines, discover_resource)
162}
163
164# discovers resources in a single R Markdown document
165discover_rmd_resources <- function(rmd_file, discover_single_resource) {
166
167  # create a UTF-8 encoded Markdown file to serve as the resource discovery
168  # source
169  md_file <- tempfile(fileext = ".md")
170  input_dir <- dirname(normalize_path(rmd_file))
171  output_dir <- dirname(md_file)
172  rmd_content <- read_utf8(rmd_file)
173  if (length(i <- grep('^---\\s*$', rmd_content)) >= 2 && i[1] == 1) {
174    rmd_content <- append(rmd_content, 'citeproc: false', i[2] - 1)
175  }
176  write_utf8(rmd_content, md_file)
177
178  # create a vector of temporary files; anything in here
179  # will be cleaned up on exit
180  temp_files <- md_file
181  on.exit(unlink(temp_files, recursive = TRUE), add = TRUE)
182
183  # discovers render-time resources; if any are found, adds them to the list of
184  # discovered resources and copies them alongside the input document.
185  discover_render_resource <- function(output_render_file) {
186    if (discover_single_resource(output_render_file, FALSE, FALSE)) {
187      # mirror original directory structure so we don't need to mutate input
188      # prior to render
189      output_target_file <- file.path(output_dir, output_render_file)
190      if (!file.exists(dirname(output_target_file))) {
191        dir.create(dirname(output_target_file), showWarnings = FALSE, recursive = TRUE)
192      }
193
194      # copy the original resource to the temporary render folder
195      file.copy(file.path(input_dir, output_render_file), output_target_file)
196
197      # clean up this file when we're done
198      temp_files <<- c(temp_files, output_target_file)
199    }
200  }
201
202  # parse the YAML front matter to discover resources named there
203  front_matter <- yaml_front_matter(md_file)
204
205  # Check for content referred to by output format calls to the includes
206  # function (for generating headers/footers/etc. at render time), and for
207  # references to files in pandoc arguments.
208  #
209  # These will be needed to produce even a vanilla Markdown variant of the input
210  # document, so copy them to the temporary folder in preparation for rendering
211  # (in addition to marking them as required resources).
212  output_formats <- front_matter[["output"]]
213  if (is.list(output_formats)) {
214    for (output_format in output_formats) {
215      if (is.list(output_format)) {
216        output_render_files <- unlist(output_format[c(
217          'includes', 'pandoc_args', 'logo', 'reference_doc', 'reference_docx', 'template'
218        )])
219        lapply(output_render_files, discover_render_resource)
220      }
221    }
222  }
223
224  # check for explicitly named resources
225  if (!is.null(front_matter$resource_files)) {
226    lapply(front_matter$resource_files, function(res) {
227      explicit_res <- if (is.character(res)) {
228        list(path = res, explicit = TRUE, web = is_web_file(res))
229      } else if (is.list(res) && length(names(res)) > 0) {
230        # list--happens when web flag is specified explicitly in YAML.
231        list(path = names(res)[[1]],
232             explicit = TRUE,
233             web = if (is.null(res$web)) is_web_file(res) else res$web)
234      }
235
236      # check the extracted filename to see if it exists
237      if (!is.null(explicit_res)) {
238        if (grepl("*", explicit_res$path, fixed = TRUE)) {
239          # if the resource file spec includes a wildcard, list the files
240          # that match the pattern
241          files <- list.files(
242            file.path(input_dir, dirname(explicit_res$path)),
243            utils::glob2rx(basename(explicit_res$path))
244          )
245          lapply(files, function(f) discover_single_resource(
246            file.path(dirname(explicit_res$path), f), TRUE, web = is_web_file(f)
247          ))
248        } else {
249          # no wildcard, see whether this resource refers to a directory or to
250          # an individual file
251          info <- file.info(file.path(input_dir, explicit_res$path))
252          if (is.na(info$isdir)) {
253            # implies that the file doesn't exist (should we warn here?)
254            NULL
255          } else if (isTRUE(info$isdir)) {
256            # if the resource file spec is a directory, include all the files in
257            # the directory, recursively
258            files <- list.files(
259              file.path(input_dir, explicit_res$path), recursive = TRUE
260            )
261            lapply(files, function(f) discover_single_resource(
262              file.path(explicit_res$path, f), TRUE, web = is_web_file(f)
263            ))
264          } else {
265            # isdir is false--this is an individual file; return it
266            discover_single_resource(explicit_res$path, explicit_res$explicit, explicit_res$web)
267          }
268        }
269      } else {
270        discover_single_resource(explicit_res$path, explicit_res$explicit, explicit_res$web)
271      }
272    })
273  }
274
275  # check for a 'preview' yaml metadata entry
276  if (!is.null(front_matter[["preview"]])) {
277    discover_single_resource(front_matter[["preview"]], explicit = FALSE, web = TRUE)
278  }
279
280  # check for bibliography and csl files at the top level
281  for (bibfile in c("bibliography", "csl")) {
282    lapply(front_matter[[bibfile]], discover_render_resource)
283  }
284
285  # check for parameter values that look like files.
286  if (!is.null(front_matter$params)) {
287    # This is the raw parameter information and has not had any YAML tag
288    # processing performed. See `knitr:::resolve_params`.
289    lapply(front_matter$params, function(param) {
290      if (is.list(param)) {
291        if (identical(param$input, "file")) {
292          if (!is.null(param$value)) {
293            # We treat param filenames as non-web resources.
294            discover_single_resource(param$value, TRUE, FALSE)
295          }
296        }
297      }
298    })
299  }
300
301  # check for knitr child documents in R Markdown documents
302  if (tolower(xfun::file_ext(rmd_file)) %in% c("qmd", "rmd")) {
303    chunk_lines <- gregexpr(knitr::all_patterns$md$chunk.begin, rmd_content, perl = TRUE)
304    for (idx in seq_along(chunk_lines)) {
305      chunk_line <- chunk_lines[idx][[1]]
306      if (is.na(chunk_line) || chunk_line < 0) next
307      chunk_start <- attr(chunk_line, "capture.start", exact = TRUE) + 1
308      chunk_text <- substr(
309        rmd_content[idx], chunk_start,
310        chunk_start + attr(chunk_line, "capture.length", exact = TRUE) - 2
311      )
312      for (child_expr in c("\\bchild\\s*=\\s*'([^']+)'", "\\bchild\\s*=\\s*\"([^\"]+)\"")) {
313        child_match <- gregexpr(child_expr, chunk_text, perl = TRUE)[[1]]
314        if (child_match > 0) {
315          child_start <- attr(child_match, "capture.start", exact = TRUE)
316          child_text <- substr(
317            chunk_text, child_start,
318            child_start + attr(child_match, "capture.length", exact = TRUE) - 1
319          )
320          discover_render_resource(child_text)
321        }
322      }
323    }
324  }
325
326  # render "raw" markdown to HTML
327  html_file <- tempfile(fileext = ".html")
328  on.exit(unlink(html_file), add = TRUE)
329
330  # check to see what format this document is going to render as; if it's a
331  # format that produces HTML, let it render as-is, but if it isn't, render as
332  # html_document to pick up dependencies
333  output_format <- output_format_from_yaml_front_matter(rmd_content)
334
335  output_format_function <- eval(xfun::parse_only(output_format$name))
336
337  override_output_format <- if (!is_pandoc_to_html(output_format_function()$pandoc)) "html_document"
338
339  html_file <- render(
340    md_file, override_output_format, html_file, quiet = TRUE,
341    output_options = list(
342      self_contained = FALSE,
343      pandoc_args = c("--metadata", "pagetitle=PREVIEW")
344    )
345  )
346
347  # clean up output file and its supporting files directory
348  temp_files <- c(temp_files, html_file, knitr_files_dir(md_file), knitr_files_dir(html_file))
349
350  # run the HTML resource discovery mechanism on the rendered output
351  discover_html_resources(html_file, discover_single_resource)
352
353  # if this is an R Markdown file, purl the file to extract just the R code
354  if (tolower(xfun::file_ext(rmd_file)) %in% c("qmd", "rmd")) {
355    r_file <- tempfile(fileext = ".R")
356    # suppress possible try() errors https://github.com/rstudio/rmarkdown/issues/1247
357    try_file <- tempfile()
358    opts <- options(try.outFile = try_file)
359    on.exit({
360      unlink(c(r_file, try_file)); options(opts)
361    }, add = TRUE)
362    knitr::purl(md_file, output = r_file, quiet = TRUE, documentation = 0)
363    temp_files <- c(temp_files, r_file)
364    discover_r_resources(r_file, discover_single_resource)
365  }
366}
367
368discover_r_resources <- function(r_file, discover_single_resource) {
369
370  # read the lines from the R file
371  r_lines <- read_utf8(r_file)
372
373  # clean comments from the R code (simply; consider: # inside strings)
374  r_lines <- sub("#.*$", "", r_lines)
375
376  # find quoted strings in the code and attempt to ascertain whether they are
377  # files on disk
378  r_lines <- one_string(r_lines)
379  quoted_strs <- Reduce(c, lapply(c("\"[^\"\n]*\"", "'[^'\n]*'"), function(pat) {
380    matches <- unlist(regmatches(r_lines, gregexpr(pat, r_lines)))
381    substr(matches, 2, nchar(matches) - 1)
382  }))
383
384  # consider any quoted string containing a valid relative path to a file that
385  # exists on disk to be a reference
386  for (quoted_str in quoted_strs) {
387    if (nchar(quoted_str) > 0)
388       discover_single_resource(quoted_str, FALSE, is_web_file(quoted_str))
389  }
390}
391
392# copies the external resources needed to render original_input into
393# intermediates_dir; with skip_web, skips web resources. returns a character
394# vector containing paths to all resources copied.
395copy_render_intermediates <- function(original_input, intermediates_dir, skip_web) {
396
397  # start with an empty set of intermediates
398  intermediates <- c()
399
400  # extract all the resources used by the input file; note that this actually
401  # runs another (non-knitting) render, and that recursion is avoided because
402  # we explicitly render with self-contained = FALSE while discovering
403  # resources
404  resources <- find_external_resources(original_input)
405  dest_dir <- normalize_path(intermediates_dir)
406  source_dir <- dirname(normalize_path(original_input))
407
408  # process each returned reosurce
409  by(resources, seq_len(nrow(resources)), function(res) {
410    # skip web resources if requested
411    if (skip_web && res$web) return()
412    dest <- copy_file_with_dir(res$path, dest_dir, source_dir)
413    intermediates <<- c(intermediates, dest)
414  })
415
416  # return the list of files we generated
417  intermediates
418}
419
420# copy a file from a relative path to a destination dir, and preserve its
421# original dir structure, e.g., if we copy foo/bar.txt to /tmp, the destination
422# file should be /tmp/foo/bar.txt instead of /tmp/bar.txt
423copy_file_with_dir <- function(path, dest, from = '.') {
424  dest <- file.path(dest, path)
425  path <- file.path(from, path)
426  if (!file.exists(path)) return()
427  if (!dir_exists(dirname(dest))) dir.create(dirname(dest), recursive = TRUE)
428  file.copy(path, dest)
429  dest
430}
431
432discover_css_resources <- function(css_file, discover_single_resource) {
433
434  css_lines <- read_utf8(css_file)
435
436  discover_resource <- function(node, att, val, idx) {
437    res_file <- utils::URLdecode(val)
438    discover_single_resource(res_file, FALSE, TRUE)
439  }
440
441  call_css_resource_attrs(one_string(css_lines), discover_resource)
442}
443
444# given a filename, return true if the file appears to be a web file
445is_web_file <- function(filename) {
446  tolower(xfun::file_ext(filename)) %in% c(
447    "css", "gif", "htm", "html", "jpeg", "jpg", "js", "mp3", "mp4", "png", "wav"
448  )
449}
450