1/*
2 * pandoc_converter.ts
3 *
4 * Copyright (C) 2021 by RStudio, PBC
5 *
6 * Unless you have received this program directly from RStudio pursuant
7 * to the terms of a commercial license agreement with RStudio, then
8 * this program is licensed to you under the terms of version 3 of the
9 * GNU Affero General Public License. This program is distributed WITHOUT
10 * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12 * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13 *
14 */
15
16import { Schema, Node as ProsemirrorNode } from 'prosemirror-model';
17
18import {
19  PandocServer,
20  PandocTokenReader,
21  PandocNodeWriter,
22  PandocMarkWriter,
23  PandocPreprocessorFn,
24  PandocBlockReaderFn,
25  PandocPostprocessorFn,
26  PandocInlineHTMLReaderFn,
27  PandocWriterOptions,
28  PandocTokensFilterFn,
29} from '../api/pandoc';
30
31import { haveTableCellsWithInlineRcode } from '../api/rmd';
32
33import { pandocFormatWith, PandocFormat, kGfmFormat, kCommonmarkFormat } from '../api/pandoc_format';
34import { PandocCapabilities } from '../api/pandoc_capabilities';
35import { PandocBlockCapsuleFilter, pandocMarkdownWithBlockCapsules } from '../api/pandoc_capsule';
36
37import { ExtensionManager } from '../editor/editor-extensions';
38
39import { pandocToProsemirror } from './pandoc_to_prosemirror';
40import { pandocFromProsemirror } from './pandoc_from_prosemirror';
41
/** Line wrapping modes that can be reported for a converted document. */
export type PandocLineWrapping = 'none' | 'column' | 'sentence';

/**
 * Value that `PandocConverter.toProsemirror` resolves to: the converted
 * document plus side-band information gathered during conversion.
 */
export interface PandocToProsemirrorResult {
  /** The ProseMirror document (replaced in turn by each registered postprocessor). */
  doc: ProsemirrorNode;
  /** Line wrapping mode. NOTE(review): presumably detected from the source markdown -- confirm in pandoc_to_prosemirror. */
  line_wrapping: PandocLineWrapping;
  /** NOTE(review): presumably the names of pandoc tokens that no reader handled -- confirm in pandoc_to_prosemirror. */
  unrecognized: string[];
  /** NOTE(review): presumably whether the document contains example lists -- confirm in pandoc_to_prosemirror. */
  example_lists: boolean;
  /** Metadata keyed by name. NOTE(review): exact contents are determined by pandoc_to_prosemirror -- confirm there. */
  unparsed_meta: Record<string, any>;
}
51
/**
 * Converts markdown to a ProseMirror document (by way of the pandoc AST) and
 * ProseMirror documents back to markdown.
 *
 * The readers, writers, filters, and pre/post-processors used during
 * conversion are collected once from the `ExtensionManager` at construction.
 */
export class PandocConverter {
  private readonly schema: Schema;
  private readonly preprocessors: readonly PandocPreprocessorFn[];
  private readonly postprocessors: readonly PandocPostprocessorFn[];
  private readonly readers: readonly PandocTokenReader[];
  private readonly tokensFilters: readonly PandocTokensFilterFn[];
  private readonly blockReaders: readonly PandocBlockReaderFn[];
  private readonly inlineHTMLReaders: readonly PandocInlineHTMLReaderFn[];
  private readonly blockCapsuleFilters: readonly PandocBlockCapsuleFilter[];
  private readonly nodeWriters: readonly PandocNodeWriter[];
  private readonly markWriters: readonly PandocMarkWriter[];
  private readonly pandoc: PandocServer;
  private readonly pandocCapabilities: PandocCapabilities;

  /**
   * @param schema ProseMirror schema that converted documents are built against.
   * @param extensions Source of all pandoc readers, writers, filters and processors.
   * @param pandoc Server used to run the markdown <-> AST conversions.
   * @param pandocCapabilities Capabilities of that pandoc (its `api_version` is used when writing).
   */
  constructor(
    schema: Schema,
    extensions: ExtensionManager,
    pandoc: PandocServer,
    pandocCapabilities: PandocCapabilities,
  ) {
    this.schema = schema;

    this.preprocessors = extensions.pandocPreprocessors();
    this.postprocessors = extensions.pandocPostprocessors();
    this.readers = extensions.pandocReaders();
    this.tokensFilters = extensions.pandocTokensFilters();
    this.blockReaders = extensions.pandocBlockReaders();
    this.inlineHTMLReaders = extensions.pandocInlineHTMLReaders();
    this.blockCapsuleFilters = extensions.pandocBlockCapsuleFilters();
    this.nodeWriters = extensions.pandocNodeWriters();
    this.markWriters = extensions.pandocMarkWriters();

    this.pandoc = pandoc;
    this.pandocCapabilities = pandocCapabilities;
  }

  /**
   * Convert markdown into a ProseMirror document.
   *
   * @param markdown Markdown source text.
   * @param format Pandoc format the source is written in.
   * @returns The conversion result, with `doc` passed through every postprocessor.
   */
  public async toProsemirror(markdown: string, format: PandocFormat): Promise<PandocToProsemirrorResult> {
    // adjust format. we always need to *read* raw_html, raw_attribute, and backtick_code_blocks b/c
    // that's how preprocessors hoist content through pandoc into our prosemirror token parser.
    // we always need to read with auto_identifiers so we can catch any auto-generated ids
    // required to fulfill links inside the document (we will strip out heading ids that
    // aren't explicit or a link target using the heading_ids returned with the ast). we also
    // disable 'smart' b/c that causes pandoc to insert non-breaking spaces before selected
    // abbreviations like e.g. -- rather, we do our own implementation of 'smart' when we read
    // PandocTokenType.Str from the ast

    // determine type of auto_ids
    const autoIds = format.extensions.gfm_auto_identifiers ? 'gfm_auto_identifiers' : 'auto_identifiers';
    const targetFormat = adjustedFormat(
      format.fullName,
      ['raw_html', 'raw_attribute', 'backtick_code_blocks', autoIds],
      ['smart'],
    );

    // run preprocessors (the `markdown` parameter itself is left untouched so the
    // original text remains available for aligning capsule positions below)
    let source = markdown;
    for (const preprocessor of this.preprocessors) {
      source = preprocessor(source);
    }

    // create source capsules
    for (const filter of this.blockCapsuleFilters) {
      source = pandocMarkdownWithBlockCapsules(markdown, source, filter);
    }

    // parse to a pandoc ast and convert that to prosemirror
    const ast = await this.pandoc.markdownToAst(source, targetFormat, []);
    const result = pandocToProsemirror(
      ast,
      this.schema,
      format.extensions,
      this.readers,
      this.tokensFilters,
      this.blockReaders,
      this.inlineHTMLReaders,
      this.blockCapsuleFilters,
    );

    // run post-processors (each one receives the previous one's document)
    for (const postprocessor of this.postprocessors) {
      result.doc = postprocessor(result.doc);
    }

    return result;
  }

  // NOTE: For a plain markdown file, this is the closest we can come to canonicalizing w/ just pandoc:
  //
  //   pandoc MANUAL.md --to markdown-auto_identifiers-smart -o MANUAL.md --self-contained --atx-headers --wrap=none
  //
  // For R Markdown files, we would need to pull out the Rmd chunks before sending to pandoc.
  //

  /**
   * Convert a ProseMirror document into markdown.
   *
   * @param doc Document to write.
   * @param pandocFormat Pandoc format to write.
   * @param options Writer options (heading style, dpi, reference location/prefix, wrapping).
   * @returns The generated markdown with all line endings normalized to `\n`.
   */
  public async fromProsemirror(
    doc: ProsemirrorNode,
    pandocFormat: PandocFormat,
    options: PandocWriterOptions,
  ): Promise<string> {
    // generate pandoc ast
    const output = pandocFromProsemirror(
      doc,
      this.pandocCapabilities.api_version,
      pandocFormat,
      this.nodeWriters,
      this.markWriters,
    );

    // adjust format. we always need to be able to write raw_attribute b/c that's how preprocessors
    // hoist content through pandoc into our prosemirror token parser. since we open this door when
    // reading, users could end up writing raw inlines, and in that case we want them to echo back
    // to the source document just the way they came in. for writing markdown from pm we don't
    // ever want to generate auto identifiers so we disable them here. we also disable smart b/c
    // we do this manually above in pandocFromProsemirror (so we can avoid pandoc's insertion of
    // nbsp's after abbreviations, which is more appropriate for final output than editing)
    let format = adjustedFormat(
      pandocFormat.fullName,
      ['raw_html', 'raw_attribute'], // always enable
      ['auto_identifiers', 'gfm_auto_identifiers', 'smart'], // always disable
    );

    // disable selected format options (prepended, so flags already in the format come after them)
    format = pandocFormatWith(format, disabledFormatOptions(format, doc), '');

    // prepare pandoc options
    const pandocOptions: string[] = [];
    if (!options.atxHeaders) {
      pandocOptions.push('--markdown-headings=setext');
    }
    if (options.dpi) {
      // pandoc's option is --dpi=NUMBER: the value must accompany the flag
      pandocOptions.push(`--dpi=${options.dpi}`);
    }

    // default to block level references (validate known types)
    let referenceLocation = 'block';
    if (options.references?.location) {
      referenceLocation = ['block', 'section', 'document'].includes(options.references.location)
        ? options.references.location
        : 'block';
    }
    pandocOptions.push(`--reference-location=${referenceLocation}`);

    // references prefix (if any)
    if (options.references?.prefix) {
      pandocOptions.push('--id-prefix', options.references.prefix);
    }

    // provide wrap options
    pandocOptions.push(...wrapOptions(options));

    // render to markdown
    const markdown = await this.pandoc.astToMarkdown(output.ast, format, pandocOptions);

    // normalize newlines (don't know if pandoc uses \r\n on windows)
    return markdown.replace(/\r\n|\n\r|\r/g, '\n');
  }
}
209
/**
 * Adjust the specified pandoc format string so that `extensions` are enabled
 * and `disabled` are turned off.
 *
 * The `+ext` / `-ext` flags are appended to the format, and then every `-ext`
 * flag for an extension we need enabled is stripped from the result.
 *
 * @param format Pandoc format string to adjust.
 * @param extensions Extension names that must be enabled.
 * @param disabled Extension names that must be disabled.
 * @returns The adjusted format string.
 */
function adjustedFormat(format: string, extensions: string[], disabled: string[]) {
  const enableFlags = extensions.map(ext => `+${ext}`).join('');
  const disableFlags = disabled.map(ext => `-${ext}`).join('');
  let newFormat = pandocFormatWith(format, '', enableFlags + disableFlags);

  // any extension specified needs to not have a - anywhere in the format. a string
  // pattern passed to replace() only removes the first match, so split/join is used
  // to drop every occurrence
  for (const ext of extensions) {
    newFormat = newFormat.split(`-${ext}`).join('');
  }

  return newFormat;
}
225
/**
 * Compute the table-related format flags to disable when writing `doc`.
 *
 * @param format Pandoc format string that will be used for writing.
 * @param doc Document being written (inspected for inline R code in table cells).
 * @returns A string of `-extension` flags, or `''` when nothing should be disabled.
 */
function disabledFormatOptions(format: string, doc: ProsemirrorNode): string {
  // gfm and commonmark variants don't allow simple/multiline/grid tables (just pipe tables)
  // and it's an error to even include these in the markdown format specifier -- so for
  // these modes we disable nothing (checked first so the document isn't inspected needlessly)
  if (format.startsWith(kGfmFormat) || format.startsWith(kCommonmarkFormat)) {
    return '';
  }

  // disable simple and multiline tables (prefer pipe and grid tables). users can still
  // force the availability of these by adding those format flags but all known markdown
  // variants that support tables also support pipe tables so this seems unlikely to ever
  // be required.
  const disabledTableTypes = '-simple_tables-multiline_tables';

  // if there are tables with inline R code then also disable grid tables (as the inline
  // R code will mess up the column boundaries)
  return haveTableCellsWithInlineRcode(doc) ? disabledTableTypes + '-grid_tables' : disabledTableTypes;
}
248
/**
 * Translate the writer's `wrap` option into pandoc command line arguments.
 *
 * A `wrap` value that parses (base 10) to a positive integer yields column
 * wrapping at that width. Anything else -- unset, 'none', 'sentence', zero,
 * negative, or non-numeric -- yields `--wrap=none`.
 *
 * @param options Writer options; only `wrap` is read.
 * @returns Pandoc arguments controlling line wrapping.
 */
function wrapOptions(options: PandocWriterOptions): string[] {
  const wrap = options.wrap;
  if (wrap && wrap !== 'none' && wrap !== 'sentence') {
    const column = parseInt(wrap, 10);
    // NaN, zero, and negative widths are not usable column counts
    if (column > 0) {
      return ['--wrap=auto', `--columns=${column}`];
    }
  }
  return ['--wrap=none'];
}
268