1/*
2 * pandoc_format.ts
3 *
4 * Copyright (C) 2021 by RStudio, PBC
5 *
6 * Unless you have received this program directly from RStudio pursuant
7 * to the terms of a commercial license agreement with RStudio, then
8 * this program is licensed to you under the terms of version 3 of the
9 * GNU Affero General Public License. This program is distributed WITHOUT
10 * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12 * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13 *
14 */
15
16import { Node as ProsemirrorNode } from 'prosemirror-model';
17
18import { PandocServer, PandocExtensions } from './pandoc';
19import { EditorFormat, kHugoDocType } from './format';
20import { firstYamlBlock, yamlMetadataNodes } from './yaml';
21import { findValue } from './object';
22
// Names of the pandoc markdown-family formats referenced by this module.
// All of them are accepted as base format names by resolvePandocFormat().

// pandoc 'markdown' (also the fallback used when a requested format is not recognized)
export const kMarkdownFormat = 'markdown';

// other markdown_* formats accepted as-is
export const kMarkdownPhpextraFormat = 'markdown_phpextra';
export const kMarkdownGithubFormat = 'markdown_github';
export const kMarkdownMmdFormat = 'markdown_mmd';
export const kMarkdownStrictFormat = 'markdown_strict';

// formats that resolvePandocFormat() maps onto 'markdown_strict' plus a fixed set of extensions
export const kGfmFormat = 'gfm';
export const kCommonmarkFormat = 'commonmark';
export const kCommonmarkXFormat = 'commonmark_x';
31
// A fully resolved pandoc format, as produced by resolvePandocFormat().
export interface PandocFormat {
  // the format mode exactly as it was requested (even if it was not recognized)
  mode: string;

  // the base pandoc format actually used ('markdown' when the requested mode was not
  // recognized, 'markdown_strict' when the requested mode is an emulated variant)
  baseName: string;

  // baseName followed by a '+name' / '-name' suffix for every valid option
  fullName: string;

  // enabled/disabled state of each extension for the resolved format
  extensions: PandocExtensions;

  // problems found while resolving the format
  warnings: PandocFormatWarnings;
}
39
// Problems detected by resolvePandocFormat() while validating a requested format.
export interface PandocFormatWarnings {
  // the requested base format name when it was not recognized ('' when it was valid)
  invalidFormat: string;

  // names of requested options that the base format does not list as extensions
  invalidOptions: string[];
}
44
// Format configuration read from a document (yaml front matter or a magic comment).
// Every field is optional; a field is absent when the document did not specify it.
export interface PandocFormatConfig {
  // value of the 'mode' setting
  mode?: string;

  // value of the 'extensions' setting (yaml-derived extensions are prepended to it)
  extensions?: string;

  // value of the 'rmd_extensions' setting
  rmdExtensions?: string;

  // value of the 'wrap' setting (or its aliases 'wrap_column' / 'fill-column'), as a string
  wrap?: string;

  // entries of the comma separated 'doctype' setting
  doctypes?: string[];

  // 'references' when given as a string, otherwise 'references.location'
  references_location?: string;

  // 'references.prefix'
  references_prefix?: string;

  // value of the 'canonical' setting
  canonical?: boolean;
}
55
// Look for a format "magic comment" in the code: an html comment occupying its own
// line whose body is wrapped in -*- markers, e.g. <!-- -*- mode: gfm -*- -->
// Returns the regex match (group 1 is the text between the markers) or null.
export function matchPandocFormatComment(code: string) {
  // 'm' flag: the comment may appear on any line, not just at the start of the code
  return code.match(/^<!--\s+-\*-([\s\S]*?)-\*-\s+-->\s*$/m);
}
60
// Read the format config for a document: yaml metadata wins, then a magic
// comment, and an empty config is returned when neither provides one.
export function pandocFormatConfigFromDoc(doc: ProsemirrorNode, isRmd: boolean) {
  const yamlConfig = pandocFormatConfigFromYamlInDoc(doc, isRmd);
  // both readers return either a config object or null, so ?? selects the first one found
  return yamlConfig ?? pandocFormatConfigFromCommentInDoc(doc) ?? {};
}
64
// Read the format config from markdown source code: yaml front matter wins, then a
// magic comment, and an empty config is returned when neither provides one.
export function pandocFormatConfigFromCode(code: string, isRmd: boolean): PandocFormatConfig {
  const yamlConfig = pandocFormatConfigFromYamlInCode(code, isRmd);
  if (yamlConfig) {
    return yamlConfig;
  }

  const commentConfig = pandocFormatConfigFromCommentInCode(code);
  if (commentConfig) {
    return commentConfig;
  }

  return {};
}
68
// Read format config from the first yaml block in the code. Two sources are combined:
//   - markdown extensions ('md_extensions' under 'output' for Rmd, else the
//     extensions suffix of a 'from' value)
//   - the options object at editor_options.markdown
// Returns null when there is no yaml block or it provides neither source.
function pandocFormatConfigFromYamlInCode(code: string, isRmd: boolean): PandocFormatConfig | null {
  // no yaml, no config
  const yaml = firstYamlBlock(code);
  if (!yaml) {
    return null;
  }

  // Rmd documents may define md_extensions within their output options
  let mdExtensions: string | undefined;
  if (isRmd) {
    mdExtensions = findValue('md_extensions', yaml?.output);
  }

  // otherwise take the extensions from a quarto 'from' (e.g. 'markdown+emoji' yields '+emoji')
  if (!mdExtensions) {
    const from = findValue('from', yaml);
    if (from) {
      const fromParts = String(from).match(/^\w+([+-][\w+-]+)$/);
      if (fromParts) {
        mdExtensions = fromParts[1];
      }
    }
  }

  // explicit markdown options (only read when the yaml value is an object)
  const markdownOptions = yaml?.editor_options?.markdown;
  const optionsConfig: PandocFormatConfig | undefined =
    markdownOptions instanceof Object ? readPandocFormatConfig(markdownOptions) : undefined;

  // neither source present
  if (!mdExtensions && !optionsConfig) {
    return null;
  }

  // combine: yaml derived extensions go in front of any explicitly configured extensions
  const formatConfig: PandocFormatConfig = optionsConfig || {};
  if (mdExtensions) {
    formatConfig.extensions = mdExtensions + (formatConfig.extensions || '');
  }
  return formatConfig;
}
110
// Read format config from the text of the first yaml metadata node in the document
// (null when the document has no yaml metadata nodes).
function pandocFormatConfigFromYamlInDoc(doc: ProsemirrorNode, isRmd: boolean): PandocFormatConfig | null {
  const yamlNodes = yamlMetadataNodes(doc);
  if (yamlNodes.length === 0) {
    return null;
  }
  const firstYamlText = yamlNodes[0].node.textContent;
  return pandocFormatConfigFromYamlInCode(firstYamlText, isRmd);
}
119
// Read format config from a magic comment in the code, e.g.
//   <!-- -*- mode: gfm; wrap: 72 -*- -->
// The comment body is a list of 'key: value' fields. Returns null when the
// code contains no magic comment.
function pandocFormatConfigFromCommentInCode(code: string): PandocFormatConfig | null {
  const match = matchPandocFormatComment(code);
  if (!match) {
    return null;
  }

  // a field is everything up to the first ':' (key) and the rest of the line (value)
  const keyValueRegEx = /^([^:]+):\s*(.*)$/;

  // fields are separated by a semicolon that is followed by whitespace
  // NOTE(review): a ';' with no whitespace after it does not split fields -- confirm this is intended
  const variables: { [key: string]: string } = {};
  for (const rawField of match[1].split(/\s*;\s/)) {
    const keyValue = rawField.trim().match(keyValueRegEx);
    if (keyValue) {
      variables[keyValue[1].trim()] = keyValue[2].trim();
    }
  }

  return readPandocFormatConfig(variables);
}
139
// Read format config from the first qualifying html comment in the document.
// Only the first qualifying text node is examined; returns null when there is
// none or when it does not contain a magic comment.
function pandocFormatConfigFromCommentInDoc(doc: ProsemirrorNode): PandocFormatConfig | null {
  // the mark may be missing from the schema, in which case nothing can qualify
  const commentMark = doc.type.schema.marks.raw_html_comment;

  let config: PandocFormatConfig | null = null;
  let searchDone = false;
  doc.descendants(node => {
    // stop descending once the first candidate has been handled
    if (searchDone) {
      return false;
    }

    // candidate: a text node carrying the raw html comment mark and a 'format' attr
    // NOTE(review): 'format' is read from the node's own attrs rather than from the
    // mark returned by isInSet -- confirm that is where the schema stores it
    const isCandidate = node.isText && commentMark && commentMark.isInSet(node.marks) && node.attrs.format;
    if (!isCandidate) {
      return true;
    }

    searchDone = true;
    config = pandocFormatConfigFromCommentInCode(node.textContent);
    return false;
  });
  return config;
}
166
// Build a PandocFormatConfig from a loosely typed key/value source (either the yaml
// 'editor_options.markdown' object or the variables parsed from a magic comment).
// Values may be strings (comment) or arbitrary yaml values, so they are coerced.
// Recognized keys: mode, extensions, rmd_extensions, wrap (aliases: wrap_column,
// fill-column), doctype, references, canonical.
function readPandocFormatConfig(source: { [key: string]: any }): PandocFormatConfig {
  // strings pass through, other truthy values use toString(), falsy values become ''
  const asString = (obj: any): string => {
    if (typeof obj === 'string') {
      return obj;
    } else if (obj) {
      return obj.toString();
    } else {
      return '';
    }
  };

  // booleans pass through, anything else is true only if its string form is 'true' or '1'
  const asBoolean = (obj: any): boolean => {
    if (typeof obj === 'boolean') {
      return obj;
    }
    const str = asString(obj).toLowerCase();
    return str === 'true' || str === '1';
  };

  const formatConfig: PandocFormatConfig = {};
  if (source.mode) {
    formatConfig.mode = asString(source.mode);
  }
  if (source.extensions) {
    formatConfig.extensions = asString(source.extensions);
  }
  if (source.rmd_extensions) {
    formatConfig.rmdExtensions = asString(source.rmd_extensions);
  }

  // the 'wrap' key is always assigned (undefined when none of the aliases is set)
  const wrap = source.wrap || source.wrap_column || source['fill-column'];
  formatConfig.wrap = wrap ? asString(wrap) : undefined;

  // doctype is a comma separated list
  if (source.doctype) {
    formatConfig.doctypes = asString(source.doctype)
      .split(',')
      .map(str => str.trim());
  }

  // references is either a location string or an object with location/prefix
  if (source.references) {
    if (typeof source.references === 'string') {
      formatConfig.references_location = source.references;
    } else {
      formatConfig.references_location = source.references.location;
      formatConfig.references_prefix = source.references.prefix;
    }
  }

  // An explicit false (e.g. yaml 'canonical: false') must be recorded rather than
  // dropped, so test for "was specified" instead of truthiness. Missing, null and
  // empty values still leave the field unset.
  const canonical = source.canonical;
  if (canonical !== undefined && canonical !== null && canonical !== '') {
    formatConfig.canonical = asBoolean(canonical);
  }

  return formatConfig;
}
225
// Resolve an editor format into a concrete pandoc format:
//   - validate the requested mode (falling back to 'markdown' and recording a warning)
//   - map emulated variants onto 'markdown_strict' plus their extension lists
//   - ask pandoc which extensions the base format has and their default state
//   - apply the variant/user options on top, recording unknown ones as warnings
export async function resolvePandocFormat(pandoc: PandocServer, format: EditorFormat): Promise<PandocFormat> {
  // markdown variants we support on top of what pandoc provides, with the
  // extensions that define each of them
  const variantExtensions: { [key: string]: string[] } = {
    [kCommonmarkFormat]: commonmarkExtensions(),
    [kCommonmarkXFormat]: commonmarkXExtensions(),
    [kGfmFormat]: gfmExtensions(),
    goldmark: goldmarkExtensions(format),
    blackfriday: blackfridayExtensions(format),
  };

  // nothing to warn about yet
  const warnings: PandocFormatWarnings = { invalidFormat: '', invalidOptions: [] };

  // validate the base format (fall back to markdown if it's not known)
  const knownFormats = [
    kMarkdownFormat,
    kMarkdownPhpextraFormat,
    kMarkdownGithubFormat,
    kMarkdownMmdFormat,
    kMarkdownStrictFormat,
    kGfmFormat,
    kCommonmarkFormat,
    kCommonmarkXFormat,
    ...Object.keys(variantExtensions),
  ];
  let baseName = format.pandocMode;
  if (!knownFormats.includes(baseName)) {
    warnings.invalidFormat = baseName;
    baseName = 'markdown';
  }

  // if we are using a variant then put its options in front of the user options
  // and build on markdown_strict
  let options = format.pandocExtensions;
  const variant = variantExtensions[baseName];
  if (variant) {
    options = variant.join('') + options;
    baseName = 'markdown_strict';
  }

  // query pandoc for the extensions of the base format; these provide both the
  // default state of each extension and the set of valid option names
  const formatExtensions = parseExtensions(await pandoc.listExtensions(baseName));
  const pandocExtensions: { [key: string]: boolean } = {};
  const validOptionNames = new Set<string>();
  for (const extension of formatExtensions) {
    pandocExtensions[extension.name] = extension.enabled;
    validOptionNames.add(extension.name);
  }

  // apply the requested options (validating each one and building the full format name)
  let fullName = baseName;
  for (const option of parseExtensions(options)) {
    if (!validOptionNames.has(option.name)) {
      warnings.invalidOptions.push(option.name);
      continue;
    }
    fullName += (option.enabled ? '+' : '-') + option.name;
    pandocExtensions[option.name] = option.enabled;
  }

  // return format names, extension states, and warnings
  return {
    mode: format.pandocMode,
    baseName,
    fullName,
    extensions: (pandocExtensions as unknown) as PandocExtensions,
    warnings,
  };
}
307
// Parse a pandoc extensions string (e.g. '+raw_html-smart', or one '+name' / '-name'
// entry per line) into a list of extension names with their enabled state.
// Text that is not part of a '+name' / '-name' entry is ignored.
function parseExtensions(options: string) {
  // join() without an argument uses ',' so line breaks become commas, which the
  // pattern below simply skips over
  const flattened = options.split('\n').join();

  // sign followed by a lowercase/underscore name
  const entryPattern = /([+-])([a-z_]+)/g;

  const extensions: Array<{ name: string; enabled: boolean }> = [];
  for (let entry = entryPattern.exec(flattened); entry !== null; entry = entryPattern.exec(flattened)) {
    extensions.push({ name: entry[2], enabled: entry[1] === '+' });
  }
  return extensions;
}
323
// Rebuild a format string with extra options inserted directly after the base
// format name (prepend) and after the existing options (append).
export function pandocFormatWith(format: string, prepend: string, append: string) {
  const { format: baseFormat, options: existingOptions } = splitPandocFormatString(format);
  return baseFormat + prepend + existingOptions + append;
}
328
// Split a pandoc format string into its base format name and its options suffix,
// e.g. 'markdown+emoji-smart' => { format: 'markdown', options: '+emoji-smart' }.
// The options start at the first '+' or '-', whichever comes first; a string
// without either is all base format and has '' for options.
export function splitPandocFormatString(format: string) {
  // position of the first option sign of either kind (searching for '-' before '+'
  // would mis-split strings such as 'markdown+emoji-smart')
  const optionsPos = format.search(/[+-]/);
  if (optionsPos === -1) {
    return { format, options: '' };
  }
  return {
    format: format.slice(0, optionsPos),
    options: format.slice(optionsPos),
  };
}
342
// Is either of the two code fence extensions (backtick_code_blocks or
// fenced_code_blocks) turned on?
export function hasFencedCodeBlocks(pandocExtensions: PandocExtensions) {
  const { backtick_code_blocks: backtick, fenced_code_blocks: fenced } = pandocExtensions;
  return backtick || fenced;
}
346
// Can a heading be linked to just by putting its text in brackets?
// (e.g. [My Heading] to link to ## My Heading). This takes both the
// implicit_header_references and shortcut_reference_links extensions.
export function hasShortcutHeadingLinks(pandocExtensions: PandocExtensions) {
  const { implicit_header_references: implicitRefs, shortcut_reference_links: shortcutLinks } = pandocExtensions;
  return implicitRefs && shortcutLinks;
}
351
// Extensions used to emulate commonmark. The rawHTML flag selects whether raw_html
// is enabled (the default) or explicitly disabled; it is always the first entry.
function commonmarkExtensions(rawHTML = true) {
  const rawHtmlOption = rawHTML ? '+raw_html' : '-raw_html';
  return [
    rawHtmlOption,
    '+all_symbols_escapable',
    '+backtick_code_blocks',
    '+fenced_code_blocks',
    '+space_in_atx_header',
    '+intraword_underscores',
    '+lists_without_preceding_blankline',
    '+shortcut_reference_links',
  ];
}
365
// Extensions used for the commonmark_x variant, see:
// https://github.com/jgm/pandoc/commit/0aed9dd589189a9bbe5cae99e0e024e2d4a92c36
function commonmarkXExtensions() {
  return [
    '+pipe_tables',
    '+raw_html',
    '+auto_identifiers',
    '+strikeout',
    '+task_lists',
    '+emoji',
    '+raw_tex',
    '+smart',
    '+tex_math_dollars',
    '+superscript',
    '+subscript',
    '+definition_lists',
    '+footnotes',
    '+fancy_lists',
    '+fenced_divs',
    '+bracketed_spans',
    '+raw_attribute',
    '+implicit_header_references',
    // '+attributes' (not yet)
  ];
}
391
// Extensions used to emulate gfm: everything from commonmark (with raw html
// enabled) followed by the gfm additions.
function gfmExtensions() {
  return commonmarkExtensions().concat([
    '+auto_identifiers',
    '+autolink_bare_uris',
    '+emoji',
    '+gfm_auto_identifiers',
    '+pipe_tables',
    '+strikeout',
    '+task_lists',
  ]);
}
405
// Extensions used to emulate hugo's goldmark renderer, see:
// https://gohugo.io/getting-started/configuration-markup/#goldmark
// https://github.com/yuin/goldmark/#html-renderer-options
function goldmarkExtensions(format: EditorFormat) {
  // most of gfm
  // ('+backtick_code_blocks' also comes with the commonmark list; the repeat is retained as-is)
  const gfmSubset = ['+pipe_tables', '+strikeout', '+autolink_bare_uris', '+task_lists', '+backtick_code_blocks'];

  // some extras
  const extras = ['+definition_lists', '+footnotes', '+smart'];

  const extensions = [
    // commonmark with raw html disabled is the starting point
    ...commonmarkExtensions(false),
    ...gfmSubset,
    ...extras,
    // hugo preprocessor supports yaml metadata
    '+yaml_metadata_block',
  ];

  // math is appended last, and only for formats that ask for it
  if (includeTexMathDollars(format)) {
    extensions.push('+tex_math_dollars');
  }

  return extensions;
}
435
// Extensions used to emulate the blackfriday renderer, see:
// https://github.com/russross/blackfriday/tree/v2#extensions
function blackfridayExtensions(format: EditorFormat) {
  const baseExtensions = [
    '+intraword_underscores',
    '+pipe_tables',
    '+backtick_code_blocks',
    '+definition_lists',
    '+footnotes',
    '+autolink_bare_uris',
    '+strikeout',
    '+smart',
    '+yaml_metadata_block',
  ];

  // math is appended last, and only for formats that ask for it
  return includeTexMathDollars(format) ? baseExtensions.concat('+tex_math_dollars') : baseExtensions;
}
456
// Should tex_math_dollars be added to the emulated hugo renderers? Yes for hugo
// documents (hugo users often sort out some way to include math), otherwise it
// follows the blogdownMathInCode rmd extension setting.
function includeTexMathDollars(format: EditorFormat) {
  if (format.docTypes.includes(kHugoDocType)) {
    return true;
  }
  return format.rmdExtensions.blogdownMathInCode;
}
461