1#' Convert a document with pandoc
2#'
3#' Convert documents to and from various formats using the pandoc utility.
4#'
5#' @param input Character vector containing paths to input files
6#'   (files must be UTF-8 encoded)
7#' @param to Format to convert to (if not specified, you must specify
8#'   \code{output})
9#' @param from Format to convert from (if not specified then the format is
10#'   determined based on the file extension of \code{input}).
11#' @param output Output file (if not specified then determined based on format
12#'   being converted to)
13#' @param citeproc \code{TRUE} to run the pandoc-citeproc filter (for processing
14#'   citations) as part of the conversion
15#' @param options Character vector of command line options to pass to pandoc.
16#' @param verbose \code{TRUE} to show the pandoc command line which was executed
17#' @param wd Working directory in which code will be executed. If not
18#'   supplied, defaults to the common base directory of \code{input}
19#'
20#' @details Supported input and output formats are described in the
21#'   \href{http://johnmacfarlane.net/pandoc/README.html}{pandoc user guide}.
22#'
23#'   The system path as well as the version of pandoc shipped with RStudio (if
24#'   running under RStudio) are scanned for pandoc and the highest version
25#'   available is used.
26#'
27#' @examples
28#' \dontrun{
29#' library(rmarkdown)
30#'
31#' # convert markdown to various formats
32#' pandoc_convert("input.md", to = "html")
33#' pandoc_convert("input.md", to = "pdf")
34#'
35#' # process citations
36#' pandoc_convert("input.md", to = "html", citeproc = TRUE)
37#'
38#' # add some pandoc options
39#' pandoc_convert("input.md", to="pdf", options = c("--listings"))
40#' }
41#'
42#' @export
pandoc_convert <- function(input,
                           to = NULL,
                           from = NULL,
                           output = NULL,
                           citeproc = FALSE,
                           options = NULL,
                           verbose = FALSE,
                           wd = NULL) {

  # ensure we've scanned for pandoc
  find_pandoc()

  # execute in the specified working directory; the previous directory is
  # restored when the function exits
  if (is.null(wd)) {
    wd <- base_dir(input)
  }
  oldwd <- setwd(wd)
  on.exit(setwd(oldwd), add = TRUE)

  # input file and formats
  args <- c(input)
  if (!is.null(to))
    args <- c(args, "--to", to)
  if (!is.null(from))
    args <- c(args, "--from", from)

  # output file
  if (!is.null(output))
    args <- c(args, "--output", output)

  # set pandoc stack size (the runtime options are placed before all other
  # arguments)
  stack_size <- getOption("pandoc.stack.size", default = "512m")
  args <- c(c("+RTS", paste0("-K", stack_size), "-RTS"), args)

  # --natbib/--biblatex conflict with '--filter pandoc-citeproc', so they are
  # dropped from the user-supplied options *before* those options are added
  # to the command line (removing them afterwards would have no effect)
  if (citeproc)
    options <- options[!options %in% c("--natbib", "--biblatex")]

  # additional command line options
  args <- c(args, options)

  # citeproc filter if requested
  if (citeproc)
    args <- c(args, "--filter", pandoc_citeproc())

  # build the conversion command
  command <- paste(quoted(pandoc()), paste(quoted(args), collapse = " "))

  # show it in verbose mode
  if (verbose)
    cat(command, "\n")

  # run the conversion; `result` is the exit status returned by system()
  with_pandoc_safe_environment({
    result <- system(command)
  })
  if (result != 0)
    stop("pandoc document conversion failed with error ", result, call. = FALSE)

  invisible(NULL)
}
105
#' Check pandoc availability and version
107#'
108#' Determine whether pandoc is currently available on the system (optionally
109#' checking for a specific version or greater). Determine the specific version
110#' of pandoc available.
111#'
112#' @param version Required version of pandoc
113#' @param error Whether to signal an error if pandoc with the required version
114#'   is not found
115#'
116#' @return \code{pandoc_available} returns a logical indicating whether the
117#'   required version of pandoc is available. \code{pandoc_version} returns a
118#'   \code{\link[base]{numeric_version}} with the version of pandoc found.
119#'
120#' @details
121#'
122#' The system environment variable \samp{PATH} as well as the version of pandoc
123#' shipped with RStudio (its location is set via the environment variable
124#' \samp{RSTUDIO_PANDOC} by RStudio products like the RStudio IDE, RStudio
125#' Server, Shiny Server, and RStudio Connect, etc) are scanned for pandoc and
126#' the highest version available is used. Please do not modify the environment
#' variable \samp{RSTUDIO_PANDOC} unless you know what it means.
128#'
129#' @examples
130#' \dontrun{
131#' library(rmarkdown)
132#'
133#' if (pandoc_available())
134#'   cat("pandoc", as.character(pandoc_version()), "is available!\n")
135#'
136#' if (pandoc_available("1.12.3"))
#'   cat("required version of pandoc is available!\n")
138#' }
139#' @export
pandoc_available <- function(version = NULL, error = FALSE) {

  # make sure the pandoc cache has been populated
  find_pandoc()

  # pandoc counts as available when a directory has been cached and, if a
  # minimum version was requested, the cached version is at least that high
  have_pandoc <- !is.null(.pandoc$dir)
  found <- have_pandoc && (is.null(version) || .pandoc$version >= version)

  # optionally turn "not found" into an error
  if (error && !found) {
    requirement <- if (!is.null(version)) c("version", version, "or higher")
    msg <- c(
      "pandoc", requirement,
      "is required and was not found (see the help page ?rmarkdown::pandoc_available)."
    )
    stop(paste(msg, collapse = " "), call. = FALSE)
  }

  found
}
156
157
158#' @rdname pandoc_available
159#' @export
pandoc_version <- function() {
  # populate the cache on first use, then report the cached version
  find_pandoc()
  .pandoc[["version"]]
}
164
165#' Functions for generating pandoc command line arguments
166#'
167#' Functions that assist in creating various types of pandoc command line
168#' arguments (e.g. for templates, table of contents, highlighting, and content
169#' includes)
170#'
171#' @inheritParams includes
172#'
173#' @param name Name of template variable to set.
174#' @param value Value of template variable (defaults to \code{true} if missing).
175#' @param toc \code{TRUE} to include a table of contents in the output.
176#' @param toc_depth Depth of headers to include in table of contents.
177#' @param highlight The name of a pandoc syntax highlighting theme.
178#' @param latex_engine LaTeX engine for producing PDF output. Options are
179#'   "pdflatex", "lualatex", and "xelatex".
180#' @param default The highlighting theme to use if "default"
181#'   is specified.
182#'
183#' @return A character vector with pandoc command line arguments
184#'
185#' @details Non-absolute paths for resources referenced from the
186#'   \code{in_header}, \code{before_body}, and \code{after_body}
187#'   parameters are resolved relative to the directory of the input document.
188#'
189#' @examples
190#' \dontrun{
191#'
192#' library(rmarkdown)
193#'
194#' pandoc_include_args(before_body = "header.htm")
195#' pandoc_include_args(before_body = "header.tex")
196#'
#' pandoc_highlight_args("kate")
198#'
199#' pandoc_latex_engine_args("pdflatex")
200#'
201#' pandoc_toc_args(toc = TRUE, toc_depth = 2)
202#'
203#' }
204#' @name pandoc_args
205NULL
206
207#' @rdname pandoc_args
208#' @export
pandoc_variable_arg <- function(name, value) {
  # without a value only the bare name is passed; otherwise "name=value"
  spec <- if (missing(value)) name else paste0(name, "=", value)
  c("--variable", spec)
}
212
213
214#' @rdname pandoc_args
215#' @export
pandoc_include_args <- function(in_header = NULL,
                                before_body = NULL,
                                after_body = NULL) {

  # pair every file with its pandoc flag: c(flag, path, flag, path, ...);
  # yields NULL when there are no files
  flagged <- function(flag, files) {
    unlist(
      lapply(files, function(f) c(flag, pandoc_path_arg(f))),
      use.names = FALSE
    )
  }

  c(
    flagged("--include-in-header", in_header),
    flagged("--include-before-body", before_body),
    flagged("--include-after-body", after_body)
  )
}
232
233#' @rdname pandoc_args
234#' @export
pandoc_highlight_args <- function(highlight, default = "tango") {

  # NULL switches syntax highlighting off altogether
  if (is.null(highlight))
    return(c("--no-highlight"))

  # the placeholder "default" is swapped for the caller-supplied default theme
  theme <- if (identical(highlight, "default")) default else highlight
  c("--highlight-style", theme)
}
249
250#' @rdname pandoc_args
251#' @export
pandoc_latex_engine_args <- function(latex_engine) {
  # the flag name depends on whether pandoc 2.0 is available
  engine_flag <- if (pandoc2.0()) "--pdf-engine" else "--latex-engine"
  c(engine_flag, find_latex_engine(latex_engine))
}
256
# On macOS the PATH environment variable is stripped by OSX 10.10 Yosemite,
# which prevents pandoc from finding an engine in e.g. /usr/texbin. In that
# situation the engine name is replaced by its full path.
find_latex_engine <- function(latex_engine) {
  # other platforms use the engine exactly as given
  if (!is_osx())
    return(latex_engine)

  # the engine is reachable through PATH, no full path needed
  if (nzchar(Sys.which(latex_engine)))
    return(latex_engine)

  # a value containing "/" is treated as a path already and left alone
  if (grepl("/", latex_engine))
    return(latex_engine)

  # otherwise use the resolved location, if find_program() reports one
  resolved <- find_program(latex_engine)
  if (nzchar(resolved)) resolved else latex_engine
}
268
269#' @rdname pandoc_args
270#' @export
pandoc_toc_args <- function(toc, toc_depth = 3) {
  # arguments are only produced when a table of contents was requested;
  # otherwise the result is empty (NULL)
  if (toc)
    c("--table-of-contents", "--toc-depth", toc_depth)
  else
    c()
}
282
283
284#' Transform path for passing to pandoc
285#'
286#' Transform a path for passing to pandoc on the command line. Calls
287#' \code{\link[base:path.expand]{path.expand}} on all platforms. On Windows,
288#' transform it to a short path name if it contains spaces, and then convert
289#' forward slashes to back slashes (as required by pandoc for some path
290#' references)
291#'
292#' @param path Path to transform
293#' @param backslash Whether to replace forward slashes in \code{path} with
294#'   backslashes on Windows
295#'
296#' @return Transformed path that can be passed to pandoc on the command line
297#'
298#' @export
pandoc_path_arg <- function(path, backslash = TRUE) {

  # expand the path, then strip a redundant leading "./"
  path <- sub('^[.]/', '', path.expand(path))

  # the remaining transformations apply to Windows only
  if (!is_windows())
    return(path)

  # paths containing spaces are replaced by their short path name
  has_space <- grep(' ', path)
  if (length(has_space))
    path[has_space] <- utils::shortPathName(path[has_space])

  # optionally switch from forward slashes to backslashes
  if (backslash)
    path <- gsub('/', '\\\\', path)

  path
}
315
316
317#' Render a pandoc template.
318#'
319#' Use the pandoc templating engine to render a text file. Substitutions are
320#' done using the \code{metadata} list passed to the function.
321#'
322#' @param metadata A named list containing metadata to pass to template.
323#' @param template Path to a pandoc template.
324#' @param output Path to save output.
325#' @param verbose \code{TRUE} to show the pandoc command line which was
326#'   executed.
327#' @return (Invisibly) The path of the generated file.
328#'
329#' @export
pandoc_template <- function(metadata, template, output, verbose = FALSE) {

  # temporary markdown document, deleted again when the function exits
  md_file <- tempfile(fileext = ".md")
  on.exit(unlink(md_file))

  # the document consists only of a YAML block holding the metadata,
  # followed by one empty line
  cat("---\n", yaml::as.yaml(metadata), "---\n", "\n",
      file = md_file, sep = "")

  # let pandoc render that document through the requested template
  pandoc_convert(md_file, "markdown", output = output,
                 options = paste0("--template=", template),
                 verbose = verbose)

  invisible(output)
}
346
347#' Create a self-contained HTML document using pandoc.
348#'
349#' Create a self-contained HTML document by base64 encoding images,
#' scripts, and stylesheets referenced by the input document.
351#'
352#' @param input Input html file to create self-contained version of.
353#' @param output Path to save output.
354#'
355#' @return (Invisibly) The path of the generated file.
356#'
357#' @export
pandoc_self_contained_html <- function(input, output) {

  # make the input path absolute
  input <- normalizePath(input)

  # make sure the output file exists, then make its path absolute as well
  if (!file.exists(output))
    file.create(output)
  output <- normalizePath(output)

  # minimal template that consists of the document body only; removed when
  # the function exits
  body_template <- tempfile(fileext = ".html")
  on.exit(unlink(body_template), add = TRUE)
  writeLines("$body$", body_template)

  # The source document contains no markdown, but running it through a
  # markdown -> html "conversion" is what makes pandoc base64 encode the
  # referenced resources. markdown_strict is only used from pandoc 1.17 on:
  # bugs in earlier versions cause it to hang on very large script elements.
  input_format <- if (pandoc_available("1.17")) "markdown_strict" else "markdown"

  # do the conversion
  pandoc_convert(
    input = input,
    from = input_format,
    output = output,
    options = c("--self-contained", "--template", body_template)
  )

  invisible(output)
}
399
400
401
# Signal an error when mathjax is "local"; any other value passes silently.
validate_self_contained <- function(mathjax) {
  if (!identical(mathjax, "local"))
    return(invisible(NULL))

  stop("Local MathJax isn't compatible with self_contained\n",
       "(you should set self_contained to FALSE)", call. = FALSE)
}
407
# Build the pandoc arguments that enable MathJax. `mathjax` may be NULL (no
# MathJax), one of the symbolic values "default" / "local", or any other
# value, which is passed on unchanged.
pandoc_mathjax_args <- function(mathjax,
                                template,
                                self_contained,
                                files_dir,
                                output_dir) {

  # MathJax not requested: no arguments
  if (is.null(mathjax))
    return(c())

  default_template <- identical(template, "default")

  # resolve the symbolic values
  if (identical(mathjax, "default")) {
    # only the default template receives the default MathJax location
    mathjax <- if (default_template) default_mathjax() else NULL
  } else if (identical(mathjax, "local")) {
    # make the local MathJax copy available as supporting files and refer to
    # it relative to the output directory
    local_dir <- render_supporting_files(pandoc_mathjax_local_path(),
                                         files_dir,
                                         "mathjax-local")
    mathjax <- paste0(normalized_relative_to(output_dir, local_dir), "/",
                      mathjax_config())
  }

  # the default template takes the location through a template variable
  if (default_template)
    return(c("--mathjax", "--variable", paste0("mathjax-url:", mathjax)))

  # other templates: pass the location (if any) right after --mathjax
  if (!self_contained)
    return(c("--mathjax", mathjax))

  # self-contained output with a non-default template: warn, no arguments
  warning("MathJax doesn't work with self_contained when not ",
          "using the rmarkdown \"default\" template.", call. = FALSE)
  c()
}
448
449
# Location of a local MathJax installation: the RMARKDOWN_MATHJAX_PATH
# environment variable if it is set, otherwise whatever unix_mathjax_path()
# reports. Signals an error when neither yields a path.
pandoc_mathjax_local_path <- function() {

  # an explicitly configured location wins
  env_path <- Sys.getenv("RMARKDOWN_MATHJAX_PATH", unset = NA)
  if (!is.na(env_path))
    return(env_path)

  # fall back to the system-wide location
  system_path <- unix_mathjax_path()
  if (is.na(system_path)) {
    stop("For mathjax = \"local\", please set the RMARKDOWN_MATHJAX_PATH ",
         "environment variable to the location of MathJax. ",
         "On Linux systems you can also install MathJax using your ",
         "system package manager.")
  }

  system_path
}
467
468
# Path of the system-wide MathJax directory on unix platforms, or NA when the
# platform is not unix or MathJax.js is not present in that directory.
unix_mathjax_path <- function() {
  candidate <- "/usr/share/javascript/mathjax"
  on_unix <- identical(.Platform$OS.type, "unix")

  if (on_unix && file.exists(file.path(candidate, "MathJax.js")))
    candidate
  else
    NA
}
480
481
# Build the pandoc highlighting arguments for HTML output. The outcome
# depends on whether the "default" template is in use.
pandoc_html_highlight_args <- function(template, highlight) {

  # highlighting switched off
  if (is.null(highlight))
    return(c("--no-highlight"))

  # non-default template: the theme goes straight to pandoc, with "default"
  # standing for pygments
  if (!identical(template, "default")) {
    style <- if (identical(highlight, "default")) "pygments" else highlight
    return(c("--highlight-style", style))
  }

  # default template: the theme must be one of html_highlighters(); themes
  # handled by highlightjs disable pandoc's own highlighting and set the
  # highlightjs template variable instead
  highlight <- match.arg(highlight, html_highlighters())
  if (is_highlightjs(highlight))
    c("--no-highlight", "--variable", "highlightjs=1")
  else
    c("--highlight-style", highlight)
}
507
# TRUE when the given theme is one of those rendered with highlightjs.
is_highlightjs <- function(highlight) {
  highlightjs_themes <- c("default", "textmate")
  !is.null(highlight) && is.element(highlight, highlightjs_themes)
}
511
# Locate pandoc and cache its directory and version in .pandoc. The scan only
# runs while no directory has been cached yet.
find_pandoc <- function() {

  # a previous call already found pandoc
  if (!is.null(.pandoc$dir))
    return(invisible(NULL))

  # candidate directories: the RSTUDIO_PANDOC environment variable, the
  # directory of the pandoc reported by find_program() and, except on
  # Windows, ~/opt/pandoc
  path_pandoc <- find_program("pandoc")
  candidates <- c(Sys.getenv("RSTUDIO_PANDOC"),
                  ifelse(nzchar(path_pandoc), dirname(path_pandoc), ""))
  if (!is_windows())
    candidates <- c(candidates, path.expand("~/opt/pandoc"))

  # version of each candidate; "0" stands for "not an existing directory"
  candidate_versions <- lapply(candidates, function(dir) {
    if (dir_exists(dir)) get_pandoc_version(dir) else numeric_version("0")
  })

  # keep the first candidate that carries the highest version above "0"
  best_dir <- NULL
  best_version <- numeric_version("0")
  for (i in seq_along(candidates)) {
    if (candidate_versions[[i]] > best_version) {
      best_version <- candidate_versions[[i]]
      best_dir <- candidates[[i]]
    }
  }

  # cache the winner, if there is one
  if (!is.null(best_dir)) {
    .pandoc$dir <- best_dir
    .pandoc$version <- best_version
  }
}
550
# Return the version of the pandoc binary inside `pandoc_dir` as an S3
# numeric_version; version "0" means there is no executable binary there.
get_pandoc_version <- function(pandoc_dir) {
  exe_suffix <- if (is_windows()) ".exe" else ""
  binary <- paste0(file.path(pandoc_dir, "pandoc"), exe_suffix)

  if (!utils::file_test("-x", binary))
    return(numeric_version("0"))

  # ask the binary itself, capturing what it prints
  output <- with_pandoc_safe_environment(
    system(paste(shQuote(binary), "--version"), intern = TRUE)
  )

  # the version is the second space-separated field of the first line
  first_line <- strsplit(output, "\n")[[1]][1]
  numeric_version(strsplit(first_line, " ")[[1]][2])
}
564
# Evaluate `code` (a system call to pandoc) with LC_ALL and LC_CTYPE unset;
# on Linux additionally require HOME and make sure LANG has a usable value.
# Every environment variable that is changed is put back on exit.
# see: https://github.com/rstudio/rmarkdown/issues/31
# see: https://ghc.haskell.org/trac/ghc/ticket/7344
with_pandoc_safe_environment <- function(code) {

  on_linux <- Sys.info()['sysname'] == "Linux"

  # clear LC_ALL for the duration of the call
  saved_lc_all <- Sys.getenv("LC_ALL", unset = NA)
  if (!is.na(saved_lc_all)) {
    Sys.unsetenv("LC_ALL")
    on.exit(Sys.setenv(LC_ALL = saved_lc_all), add = TRUE)
  }

  # same for LC_CTYPE
  saved_lc_ctype <- Sys.getenv("LC_CTYPE", unset = NA)
  if (!is.na(saved_lc_ctype)) {
    Sys.unsetenv("LC_CTYPE")
    on.exit(Sys.setenv(LC_CTYPE = saved_lc_ctype), add = TRUE)
  }

  # HOME is mandatory on Linux
  if (on_linux && is.na(Sys.getenv("HOME", unset = NA))) {
    stop("The 'HOME' environment variable must be set before running Pandoc.")
  }

  # supply a LANG value on Linux when none exists; removed again on exit
  if (on_linux && is.na(Sys.getenv("LANG", unset = NA))) {
    Sys.setenv(LANG = detect_generic_lang())
    on.exit(Sys.unsetenv("LANG"), add = TRUE)
  }

  # a bare "en_US" is replaced by "en_US.UTF-8" on Linux for the call
  if (on_linux && identical(Sys.getenv("LANG"), "en_US")) {
    Sys.setenv(LANG = "en_US.UTF-8")
    on.exit(Sys.setenv(LANG = "en_US"), add = TRUE)
  }

  # evaluate the caller's code inside the adjusted environment
  force(code)
}
596
# If there is no LANG environment variable set pandoc is going to hang, so we
# need to specify a "generic" lang setting. With glibc >= 2.13 you can
# specify C.UTF-8 so we prefer that. If we can't find that then we fall back
# to en_US.UTF-8.
#
# Returns "C.UTF-8" when the `locale` utility lists that locale, otherwise
# "en_US.UTF-8".
detect_generic_lang <- function() {

  locale_util <- Sys.which("locale")

  if (nzchar(locale_util)) {
    # system(intern = TRUE) already yields one element per output line, so
    # the result can be searched directly; the utility's path is quoted in
    # case it contains characters the shell would interpret
    locales <- system(paste(shQuote(locale_util), "-a"), intern = TRUE)

    # depending on the system, `locale -a` spells this locale "C.UTF-8" or
    # "C.utf8"; accept both spellings
    if (any(c("C.UTF-8", "C.utf8") %in% locales))
      return("C.UTF-8")
  }

  # default to en_US.UTF-8
  "en_US.UTF-8"
}
617
618
619
# Path to the pandoc binary inside the cached pandoc directory
pandoc <- function() {
  # make sure the cache is populated before reading the directory from it
  find_pandoc()
  file.path(.pandoc[["dir"]], "pandoc")
}
625
# Path to the pandoc-citeproc binary: the copy inside the cached pandoc
# directory when that file exists, otherwise the bare program name
pandoc_citeproc <- function() {
  find_pandoc()
  bundled <- file.path(.pandoc$dir, "pandoc-citeproc")
  if (file.exists(bundled)) bundled else "pandoc-citeproc"
}
635
# Shell-quote those arguments that match .shell_chars_regex
quoted <- function(args) {
  # Some characters are legal in file names but are likely to be interpreted
  # by the shell when left unquoted (e.g. redirection, wildcard expansion).
  # Only arguments containing such characters are wrapped in quotes; all
  # others pass through untouched.
  needs_quoting <- grepl(.shell_chars_regex, args)
  replace(args, needs_quoting, shQuote(args[needs_quoting]))
}
645
# Return the theme name passed to pandoc as `--variable theme:<name>` within
# `args`, or NULL when the arguments contain no such pair.
find_pandoc_theme_variable <- function(args) {
  # Only positions that have a successor are inspected. seq_len() yields an
  # empty sequence for zero or one argument, whereas 1:(length(args) - 1)
  # would count downwards and index out of bounds.
  last_start <- max(length(args) - 1L, 0L)
  for (i in seq_len(last_start)) {
    if (args[[i]] == "--variable" && grepl("^theme:", args[[i + 1]])) {
      # strip the "theme:" prefix
      return(substring(args[[i + 1]], nchar("theme:") + 1))
    }
  }
  # none found, return NULL
  NULL
}
656
657
# Cache for the pandoc installation in use: `dir` holds its directory and
# `version` its version. Both start out as NULL.
.pandoc <- new.env()
assign("dir", NULL, envir = .pandoc)
assign("version", NULL, envir = .pandoc)
662
# Whether pandoc 2.0 is available, as reported by pandoc_available()
pandoc2.0 <- function() {
  pandoc_available("2.0")
}
664